src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 497
 498 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 499
 500 /* A character to be produced on output if encoding of the original
 501    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
 504 /* UTF-8 section */
 505 #define CODING_UTF_8_BOM(coding)        \
 506   ((coding)->spec.utf_8_bom)
 507
 508 /* UTF-16 section */
 509 #define CODING_UTF_16_BOM(coding)       \
 510   ((coding)->spec.utf_16.bom)
 511
 512 #define CODING_UTF_16_ENDIAN(coding)    \
 513   ((coding)->spec.utf_16.endian)
 514
 515 #define CODING_UTF_16_SURROGATE(coding) \
 516   ((coding)->spec.utf_16.surrogate)
 517
 518
 519 /* CCL section */
 520 #define CODING_CCL_DECODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 522 #define CODING_CCL_ENCODER(coding)      \
 523   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 524 #define CODING_CCL_VALIDS(coding)                                          \
 525   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
 527 /* Index for each coding category in `coding_categories' */
 528
 529 enum coding_category
 530   {
 531     coding_category_iso_7,
 532     coding_category_iso_7_tight,
 533     coding_category_iso_8_1,
 534     coding_category_iso_8_2,
 535     coding_category_iso_7_else,
 536     coding_category_iso_8_else,
 537     coding_category_utf_8_auto,
 538     coding_category_utf_8_nosig,
 539     coding_category_utf_8_sig,
 540     coding_category_utf_16_auto,
 541     coding_category_utf_16_be,
 542     coding_category_utf_16_le,
 543     coding_category_utf_16_be_nosig,
 544     coding_category_utf_16_le_nosig,
 545     coding_category_charset,
 546     coding_category_sjis,
 547     coding_category_big5,
 548     coding_category_ccl,
 549     coding_category_emacs_mule,
 550     /* All above are targets of code detection.  */
 551     coding_category_raw_text,
 552     coding_category_undecided,
 553     coding_category_max
 554   };
 555
 556 /* Definitions of flag bits used in detect_coding_XXXX.  */
 557 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 558 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 559 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 560 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 561 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 562 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 563 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 564 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 565 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 566 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 567 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 568 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 569 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 570 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 571 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 572 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 573 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 574 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 575 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 576 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 577
 578 /* This value is returned if detect_coding_mask () find nothing other
 579    than ASCII characters.  */
 580 #define CATEGORY_MASK_ANY               \
 581   (CATEGORY_MASK_ISO_7                  \
 582    | CATEGORY_MASK_ISO_7_TIGHT          \
 583    | CATEGORY_MASK_ISO_8_1              \
 584    | CATEGORY_MASK_ISO_8_2              \
 585    | CATEGORY_MASK_ISO_7_ELSE           \
 586    | CATEGORY_MASK_ISO_8_ELSE           \
 587    | CATEGORY_MASK_UTF_8_AUTO           \
 588    | CATEGORY_MASK_UTF_8_NOSIG          \
 589    | CATEGORY_MASK_UTF_8_SIG            \
 590    | CATEGORY_MASK_UTF_16_AUTO          \
 591    | CATEGORY_MASK_UTF_16_BE            \
 592    | CATEGORY_MASK_UTF_16_LE            \
 593    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 594    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 595    | CATEGORY_MASK_CHARSET              \
 596    | CATEGORY_MASK_SJIS                 \
 597    | CATEGORY_MASK_BIG5                 \
 598    | CATEGORY_MASK_CCL                  \
 599    | CATEGORY_MASK_EMACS_MULE)
 600
 601
 602 #define CATEGORY_MASK_ISO_7BIT \
 603   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 604
 605 #define CATEGORY_MASK_ISO_8BIT \
 606   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 607
 608 #define CATEGORY_MASK_ISO_ELSE \
 609   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO_ESCAPE        \
 612   (CATEGORY_MASK_ISO_7                  \
 613    | CATEGORY_MASK_ISO_7_TIGHT          \
 614    | CATEGORY_MASK_ISO_7_ELSE           \
 615    | CATEGORY_MASK_ISO_8_ELSE)
 616
 617 #define CATEGORY_MASK_ISO       \
 618   (  CATEGORY_MASK_ISO_7BIT     \
 619      | CATEGORY_MASK_ISO_8BIT   \
 620      | CATEGORY_MASK_ISO_ELSE)
 621
 622 #define CATEGORY_MASK_UTF_16            \
 623   (CATEGORY_MASK_UTF_16_AUTO            \
 624    | CATEGORY_MASK_UTF_16_BE            \
 625    | CATEGORY_MASK_UTF_16_LE            \
 626    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 627    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 628
 629 #define CATEGORY_MASK_UTF_8     \
 630   (CATEGORY_MASK_UTF_8_AUTO     \
 631    | CATEGORY_MASK_UTF_8_NOSIG  \
 632    | CATEGORY_MASK_UTF_8_SIG)
 633
 634 /* Table of coding categories (Lisp symbols).  This variable is for
 635    internal use only.  */
 636 static Lisp_Object Vcoding_category_table;
 637
 638 /* Table of coding-categories ordered by priority.  */
 639 static enum coding_category coding_priorities[coding_category_max];
 640
 641 /* Nth element is a coding context for the coding system bound to the
 642    Nth coding category.  */
 643 static struct coding_system coding_categories[coding_category_max];
 644
 645 /*** Commonly used macros and functions ***/
 646
 647 #ifndef min
 648 #define min(a, b) ((a) < (b) ? (a) : (b))
 649 #endif
 650 #ifndef max
 651 #define max(a, b) ((a) > (b) ? (a) : (b))
 652 #endif
 653
 654 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 655
 656 static int
 657 encode_inhibit_flag (Lisp_Object flag)
 658 {
 659   return NILP (flag) ? -1 : EQ (flag, Qt);
 660 }
 661
 662 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 663    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 664
 665 static bool
 666 inhibit_flag (int encoded_flag, bool var)
 667 {
 668   return 0 < encoded_flag + var;
 669 }
 670
 671 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 672   do {                                                  \
 673     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 674     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 675   } while (0)
 676
 677 static void
 678 CHECK_NATNUM_CAR (Lisp_Object x)
 679 {
 680   Lisp_Object tmp = XCAR (x);
 681   CHECK_NATNUM (tmp);
 682   XSETCAR (x, tmp);
 683 }
 684
 685 static void
 686 CHECK_NATNUM_CDR (Lisp_Object x)
 687 {
 688   Lisp_Object tmp = XCDR (x);
 689   CHECK_NATNUM (tmp);
 690   XSETCDR (x, tmp);
 691 }
 692
 693
 694 /* Safely get one byte from the source text pointed by SRC which ends
 695    at SRC_END, and set C to that byte.  If there are not enough bytes
 696    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 697    and a multibyte character is found at SRC, set C to the
 698    negative value of the character code.  The caller should declare
 699    and set these variables appropriately in advance:
 700         src, src_end, multibytep */
 701
 702 #define ONE_MORE_BYTE(c)                                \
 703   do {                                                  \
 704     if (src == src_end)                                 \
 705       {                                                 \
 706         if (src_base < src)                             \
 707           record_conversion_result                      \
 708             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 709         goto no_more_source;                            \
 710       }                                                 \
 711     c = *src++;                                         \
 712     if (multibytep && (c & 0x80))                       \
 713       {                                                 \
 714         if ((c & 0xFE) == 0xC0)                         \
 715           c = ((c & 1) << 6) | *src++;                  \
 716         else                                            \
 717           {                                             \
 718             src--;                                      \
 719             c = - string_char (src, &src, NULL);        \
 720             record_conversion_result                    \
 721               (coding, CODING_RESULT_INVALID_SRC);      \
 722           }                                             \
 723       }                                                 \
 724     consumed_chars++;                                   \
 725   } while (0)
 726
 727 /* Safely get two bytes from the source text pointed by SRC which ends
 728    at SRC_END, and set C1 and C2 to those bytes while skipping the
 729    heading multibyte characters.  If there are not enough bytes in the
 730    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 731    a multibyte character is found for C2, set C2 to the negative value
 732    of the character code.  The caller should declare and set these
 733    variables appropriately in advance:
 734         src, src_end, multibytep
 735    It is intended that this macro is used in detect_coding_utf_16.  */
 736
 737 #define TWO_MORE_BYTES(c1, c2)                          \
 738   do {                                                  \
 739     do {                                                \
 740       if (src == src_end)                               \
 741         goto no_more_source;                            \
 742       c1 = *src++;                                      \
 743       if (multibytep && (c1 & 0x80))                    \
 744         {                                               \
 745           if ((c1 & 0xFE) == 0xC0)                      \
 746             c1 = ((c1 & 1) << 6) | *src++;              \
 747           else                                          \
 748             {                                           \
 749               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 750               c1 = -1;                                  \
 751             }                                           \
 752         }                                               \
 753     } while (c1 < 0);                                   \
 754     if (src == src_end)                                 \
 755       goto no_more_source;                              \
 756     c2 = *src++;                                        \
 757     if (multibytep && (c2 & 0x80))                      \
 758       {                                                 \
 759         if ((c2 & 0xFE) == 0xC0)                        \
 760           c2 = ((c2 & 1) << 6) | *src++;                \
 761         else                                            \
 762           c2 = -1;                                      \
 763       }                                                 \
 764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed by DST and increment DST to the
 768    next free point, and increment PRODUCED_CHARS.  The caller should
 769    assure that C is 0..127, and declare and set the variable `dst'
 770    appropriately in advance.
 771 */
 772
 773
 774 #define EMIT_ONE_ASCII_BYTE(c)  \
 775   do {                          \
 776     produced_chars++;           \
 777     *dst++ = (c);               \
 778   } while (0)
 779
 780
 781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 782
 783 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 784   do {                                  \
 785     produced_chars += 2;                \
 786     *dst++ = (c1), *dst++ = (c2);       \
 787   } while (0)
 788
 789
 790 /* Store a byte C in the place pointed by DST and increment DST to the
 791    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 792    store in an appropriate multibyte form.  The caller should
 793    declare and set the variables `dst' and `multibytep' appropriately
 794    in advance.  */
 795
 796 #define EMIT_ONE_BYTE(c)                \
 797   do {                                  \
 798     produced_chars++;                   \
 799     if (multibytep)                     \
 800       {                                 \
 801         unsigned ch = (c);              \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       *dst++ = (c);                     \
 808   } while (0)
 809
 810
 811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 812
 813 #define EMIT_TWO_BYTES(c1, c2)          \
 814   do {                                  \
 815     produced_chars += 2;                \
 816     if (multibytep)                     \
 817       {                                 \
 818         unsigned ch;                    \
 819                                         \
 820         ch = (c1);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824         ch = (c2);                      \
 825         if (ch >= 0x80)                 \
 826           ch = BYTE8_TO_CHAR (ch);      \
 827         CHAR_STRING_ADVANCE (ch, dst);  \
 828       }                                 \
 829     else                                \
 830       {                                 \
 831         *dst++ = (c1);                  \
 832         *dst++ = (c2);                  \
 833       }                                 \
 834   } while (0)
 835
 836
 837 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 838   do {                                  \
 839     EMIT_ONE_BYTE (c1);                 \
 840     EMIT_TWO_BYTES (c2, c3);            \
 841   } while (0)
 842
 843
 844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 845   do {                                          \
 846     EMIT_TWO_BYTES (c1, c2);                    \
 847     EMIT_TWO_BYTES (c3, c4);                    \
 848   } while (0)
 849
 850
 851 static void
 852 record_conversion_result (struct coding_system *coding,
 853                           enum coding_result_code result)
 854 {
 855   coding->result = result;
 856   switch (result)
 857     {
 858     case CODING_RESULT_INSUFFICIENT_SRC:
 859       Vlast_code_conversion_error = Qinsufficient_source;
 860       break;
 861     case CODING_RESULT_INVALID_SRC:
 862       Vlast_code_conversion_error = Qinvalid_source;
 863       break;
 864     case CODING_RESULT_INTERRUPT:
 865       Vlast_code_conversion_error = Qinterrupted;
 866       break;
 867     case CODING_RESULT_INSUFFICIENT_DST:
 868       /* Don't record this error in Vlast_code_conversion_error
 869          because it happens just temporarily and is resolved when the
 870          whole conversion is finished.  */
 871       break;
 872     case CODING_RESULT_SUCCESS:
 873       break;
 874     default:
 875       Vlast_code_conversion_error = intern ("Unknown error");
 876     }
 877 }
 878
 879 /* These wrapper macros are used to preserve validity of pointers into
 880    buffer text across calls to decode_char, encode_char, etc, which
 881    could cause relocation of buffers if it loads a charset map,
 882    because loading a charset map allocates large structures.  */
 883
 884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 885   do {                                                                       \
 886     ptrdiff_t offset;                                                        \
 887                                                                              \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded                                                   \
 891         && (offset = coding_change_source (coding)))                         \
 892       {                                                                      \
 893         src += offset;                                                       \
 894         src_base += offset;                                                  \
 895         src_end += offset;                                                   \
 896       }                                                                      \
 897   } while (0)
 898
 899 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 900   do {                                                                  \
 901     ptrdiff_t offset;                                                   \
 902                                                                         \
 903     charset_map_loaded = 0;                                             \
 904     code = ENCODE_CHAR (charset, c);                                    \
 905     if (charset_map_loaded                                              \
 906         && (offset = coding_change_destination (coding)))               \
 907       {                                                                 \
 908         dst += offset;                                                  \
 909         dst_end += offset;                                              \
 910       }                                                                 \
 911   } while (0)
 912
 913 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 914   do {                                                                  \
 915     ptrdiff_t offset;                                                   \
 916                                                                         \
 917     charset_map_loaded = 0;                                             \
 918     charset = char_charset (c, charset_list, code_return);              \
 919     if (charset_map_loaded                                              \
 920         && (offset = coding_change_destination (coding)))               \
 921       {                                                                 \
 922         dst += offset;                                                  \
 923         dst_end += offset;                                              \
 924       }                                                                 \
 925   } while (0)
 926
 927 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 928   do {                                                                  \
 929     ptrdiff_t offset;                                                   \
 930                                                                         \
 931     charset_map_loaded = 0;                                             \
 932     result = CHAR_CHARSET_P (c, charset);                               \
 933     if (charset_map_loaded                                              \
 934         && (offset = coding_change_destination (coding)))               \
 935       {                                                                 \
 936         dst += offset;                                                  \
 937         dst_end += offset;                                              \
 938       }                                                                 \
 939   } while (0)
 940
 941
 942 /* If there are at least BYTES length of room at dst, allocate memory
 943    for coding->destination and update dst and dst_end.  We don't have
 944    to take care of coding->source which will be relocated.  It is
 945    handled by calling coding_set_source in encode_coding.  */
 946
 947 #define ASSURE_DESTINATION(bytes)                               \
 948   do {                                                          \
 949     if (dst + (bytes) >= dst_end)                               \
 950       {                                                         \
 951         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 952                                                                 \
 953         dst = alloc_destination (coding, more_bytes, dst);      \
 954         dst_end = coding->destination + coding->dst_bytes;      \
 955       }                                                         \
 956   } while (0)
 957
 958
 959 /* Store multibyte form of the character C in P, and advance P to the
 960    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 961    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 962    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 963
 964 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 965
 966 /* Return the character code of character whose multibyte form is at
 967    P, and advance P to the end of the multibyte form.  This used to be
 968    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 969    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 970
 971 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 972
 973 /* Set coding->source from coding->src_object.  */
 974
 975 static void
 976 coding_set_source (struct coding_system *coding)
 977 {
 978   if (BUFFERP (coding->src_object))
 979     {
 980       struct buffer *buf = XBUFFER (coding->src_object);
 981
 982       if (coding->src_pos < 0)
 983         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 984       else
 985         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 986     }
 987   else if (STRINGP (coding->src_object))
 988     {
 989       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 990     }
 991   else
 992     {
 993       /* Otherwise, the source is C string and is never relocated
 994          automatically.  Thus we don't have to update anything.  */
 995     }
 996 }
 997
 998
 999 /* Set coding->source from coding->src_object, and return how many
1000    bytes coding->source was changed.  */
1001
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1004 {
1005   const unsigned char *orig = coding->source;
1006   coding_set_source (coding);
1007   return coding->source - orig;
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object.  */
1012
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1015 {
1016   if (BUFFERP (coding->dst_object))
1017     {
1018       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019         {
1020           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021           coding->dst_bytes = (GAP_END_ADDR
1022                                - (coding->src_bytes - coding->consumed)
1023                                - coding->destination);
1024         }
1025       else
1026         {
1027           /* We are sure that coding->dst_pos_byte is before the gap
1028              of the buffer. */
1029           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030                                  + coding->dst_pos_byte - BEG_BYTE);
1031           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032                                - coding->destination);
1033         }
1034     }
1035   else
1036     {
1037       /* Otherwise, the destination is C string and is never relocated
1038          automatically.  Thus we don't have to update anything.  */
1039     }
1040 }
1041
1042
1043 /* Set coding->destination from coding->dst_object, and return how
1044    many bytes coding->destination was changed.  */
1045
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1048 {
1049   const unsigned char *orig = coding->destination;
1050   coding_set_destination (coding);
1051   return coding->destination - orig;
1052 }
1053
1054
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 {
1058   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059     string_overflow ();
1060   coding->destination = xrealloc (coding->destination,
1061                                   coding->dst_bytes + bytes);
1062   coding->dst_bytes += bytes;
1063 }
1064
1065 static void
1066 coding_alloc_by_making_gap (struct coding_system *coding,
1067                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1068 {
1069   if (EQ (coding->src_object, coding->dst_object))
1070     {
1071       /* The gap may contain the produced data at the head and not-yet
1072          consumed data at the tail.  To preserve those data, we at
1073          first make the gap size to zero, then increase the gap
1074          size.  */
1075       ptrdiff_t add = GAP_SIZE;
1076
1077       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1078       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1079       make_gap (bytes);
1080       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1081       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1082     }
1083   else
1084     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1085 }
1086
1087
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090                    unsigned char *dst)
1091 {
1092   ptrdiff_t offset = dst - coding->destination;
1093
1094   if (BUFFERP (coding->dst_object))
1095     {
1096       struct buffer *buf = XBUFFER (coding->dst_object);
1097
1098       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099     }
1100   else
1101     coding_alloc_by_realloc (coding, nbytes);
1102   coding_set_destination (coding);
1103   dst = coding->destination + offset;
1104   return dst;
1105 }
1106
1107 /** Macros for annotations.  */
1108
1109 /* An annotation data is stored in the array coding->charbuf in this
1110    format:
1111      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1112    LENGTH is the number of elements in the annotation.
1113    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1114    NCHARS is the number of characters in the text annotated.
1115
1116    The format of the following elements depend on ANNOTATION_MASK.
1117
1118    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1119    follows:
1120      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1121
1122    NBYTES is the number of bytes specified in the header part of
1123    old-style emacs-mule encoding, or 0 for the other kind of
1124    composition.
1125
1126    METHOD is one of enum composition_method.
1127
1128    Optional COMPOSITION-COMPONENTS are characters and composition
1129    rules.
1130
1131    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1132    follows.
1133
1134    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1135    recover from an invalid annotation, and should be skipped by
1136    produce_annotation.  */
1137
1138 /* Maximum length of the header of annotation data.  */
1139 #define MAX_ANNOTATION_LENGTH 5
1140
1141 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1142   do {                                                  \
1143     *(buf)++ = -(len);                                  \
1144     *(buf)++ = (mask);                                  \
1145     *(buf)++ = (nchars);                                \
1146     coding->annotated = 1;                              \
1147   } while (0);
1148
1149 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1150   do {                                                                      \
1151     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1152     *buf++ = nbytes;                                                        \
1153     *buf++ = method;                                                        \
1154   } while (0)
1155
1156
1157 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1158   do {                                                                  \
1159     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1160     *buf++ = id;                                                        \
1161   } while (0)
1162
1163
1164 /* Bitmasks for coding->eol_seen.  */
1165
1166 #define EOL_SEEN_NONE   0
1167 #define EOL_SEEN_LF     1
1168 #define EOL_SEEN_CR     2
1169 #define EOL_SEEN_CRLF   4
1170
1171 \f
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1173
1174
1175
1176 \f
1177 /*** 3. UTF-8 ***/
1178
1179 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1180    Return true if a text is encoded in UTF-8.  */
1181
1182 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1183 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1184 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1185 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1186 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1187 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1188
1189 #define UTF_8_BOM_1 0xEF
1190 #define UTF_8_BOM_2 0xBB
1191 #define UTF_8_BOM_3 0xBF
1192
1193 /* Unlike the other detect_coding_XXX, this function counts number of
1194    characters and check EOL format.  */
1195
1196 static bool
1197 detect_coding_utf_8 (struct coding_system *coding,
1198                      struct coding_detection_info *detect_info)
1199 {
1200   const unsigned char *src = coding->source, *src_base;
1201   const unsigned char *src_end = coding->source + coding->src_bytes;
1202   bool multibytep = coding->src_multibyte;
1203   ptrdiff_t consumed_chars = 0;
1204   bool bom_found = 0;
1205   ptrdiff_t nchars = coding->head_ascii;
1206   int eol_seen = coding->eol_seen;
1207
1208   detect_info->checked |= CATEGORY_MASK_UTF_8;
1209   /* A coding system of this category is always ASCII compatible.  */
1210   src += nchars;
1211
1212   if (src == coding->source     /* BOM should be at the head.  */
1213       && src + 3 < src_end      /* BOM is 3-byte long.  */
1214       && src[0] == UTF_8_BOM_1
1215       && src[1] == UTF_8_BOM_2
1216       && src[2] == UTF_8_BOM_3)
1217     {
1218       bom_found = 1;
1219       src += 3;
1220       nchars++;
1221     }
1222
1223   while (1)
1224     {
1225       int c, c1, c2, c3, c4;
1226
1227       src_base = src;
1228       ONE_MORE_BYTE (c);
1229       if (c < 0 || UTF_8_1_OCTET_P (c))
1230         {
1231           nchars++;
1232           if (c == '\r')
1233             {
1234               if (src < src_end && *src == '\n')
1235                 {
1236                   eol_seen |= EOL_SEEN_CRLF;
1237                   src++;
1238                   nchars++;
1239                 }
1240               else
1241                 eol_seen |= EOL_SEEN_CR;
1242             }
1243           else if (c == '\n')
1244             eol_seen |= EOL_SEEN_LF;
1245           continue;
1246         }
1247       ONE_MORE_BYTE (c1);
1248       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1249         break;
1250       if (UTF_8_2_OCTET_LEADING_P (c))
1251         {
1252           nchars++;
1253           continue;
1254         }
1255       ONE_MORE_BYTE (c2);
1256       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1257         break;
1258       if (UTF_8_3_OCTET_LEADING_P (c))
1259         {
1260           nchars++;
1261           continue;
1262         }
1263       ONE_MORE_BYTE (c3);
1264       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1265         break;
1266       if (UTF_8_4_OCTET_LEADING_P (c))
1267         {
1268           nchars++;
1269           continue;
1270         }
1271       ONE_MORE_BYTE (c4);
1272       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1273         break;
1274       if (UTF_8_5_OCTET_LEADING_P (c))
1275         {
1276           nchars++;
1277           continue;
1278         }
1279       break;
1280     }
1281   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1282   return 0;
1283
1284  no_more_source:
1285   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1286     {
1287       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1288       return 0;
1289     }
1290   if (bom_found)
1291     {
1292       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1293       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1294     }
1295   else
1296     {
1297       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1298       if (nchars < src_end - coding->source)
1299         /* The found characters are less than source bytes, which
1300            means that we found a valid non-ASCII characters.  */
1301         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1302     }
1303   coding->detected_utf8_bytes = src_base - coding->source;
1304   coding->detected_utf8_chars = nchars;
1305   return 1;
1306 }
1307
1308
1309 static void
1310 decode_coding_utf_8 (struct coding_system *coding)
1311 {
1312   const unsigned char *src = coding->source + coding->consumed;
1313   const unsigned char *src_end = coding->source + coding->src_bytes;
1314   const unsigned char *src_base;
1315   int *charbuf = coding->charbuf + coding->charbuf_used;
1316   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1317   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1318   bool multibytep = coding->src_multibyte;
1319   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1320   bool eol_dos
1321     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1322   int byte_after_cr = -1;
1323
1324   if (bom != utf_without_bom)
1325     {
1326       int c1, c2, c3;
1327
1328       src_base = src;
1329       ONE_MORE_BYTE (c1);
1330       if (! UTF_8_3_OCTET_LEADING_P (c1))
1331         src = src_base;
1332       else
1333         {
1334           ONE_MORE_BYTE (c2);
1335           if (! UTF_8_EXTRA_OCTET_P (c2))
1336             src = src_base;
1337           else
1338             {
1339               ONE_MORE_BYTE (c3);
1340               if (! UTF_8_EXTRA_OCTET_P (c3))
1341                 src = src_base;
1342               else
1343                 {
1344                   if ((c1 != UTF_8_BOM_1)
1345                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1346                     src = src_base;
1347                   else
1348                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1349                 }
1350             }
1351         }
1352     }
1353   CODING_UTF_8_BOM (coding) = utf_without_bom;
1354
1355   while (1)
1356     {
1357       int c, c1, c2, c3, c4, c5;
1358
1359       src_base = src;
1360       consumed_chars_base = consumed_chars;
1361
1362       if (charbuf >= charbuf_end)
1363         {
1364           if (byte_after_cr >= 0)
1365             src_base--;
1366           break;
1367         }
1368
1369       /* In the simple case, rapidly handle ordinary characters */
1370       if (multibytep && ! eol_dos
1371           && charbuf < charbuf_end - 6 && src < src_end - 6)
1372         {
1373           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1374             {
1375               c1 = *src;
1376               if (c1 & 0x80)
1377                 break;
1378               src++;
1379               consumed_chars++;
1380               *charbuf++ = c1;
1381
1382               c1 = *src;
1383               if (c1 & 0x80)
1384                 break;
1385               src++;
1386               consumed_chars++;
1387               *charbuf++ = c1;
1388
1389               c1 = *src;
1390               if (c1 & 0x80)
1391                 break;
1392               src++;
1393               consumed_chars++;
1394               *charbuf++ = c1;
1395
1396               c1 = *src;
1397               if (c1 & 0x80)
1398                 break;
1399               src++;
1400               consumed_chars++;
1401               *charbuf++ = c1;
1402             }
1403           /* If we handled at least one character, restart the main loop.  */
1404           if (src != src_base)
1405             continue;
1406         }
1407
1408       if (byte_after_cr >= 0)
1409         c1 = byte_after_cr, byte_after_cr = -1;
1410       else
1411         ONE_MORE_BYTE (c1);
1412       if (c1 < 0)
1413         {
1414           c = - c1;
1415         }
1416       else if (UTF_8_1_OCTET_P (c1))
1417         {
1418           if (eol_dos && c1 == '\r')
1419             ONE_MORE_BYTE (byte_after_cr);
1420           c = c1;
1421         }
1422       else
1423         {
1424           ONE_MORE_BYTE (c2);
1425           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1426             goto invalid_code;
1427           if (UTF_8_2_OCTET_LEADING_P (c1))
1428             {
1429               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1430               /* Reject overlong sequences here and below.  Encoders
1431                  producing them are incorrect, they can be misleading,
1432                  and they mess up read/write invariance.  */
1433               if (c < 128)
1434                 goto invalid_code;
1435             }
1436           else
1437             {
1438               ONE_MORE_BYTE (c3);
1439               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1440                 goto invalid_code;
1441               if (UTF_8_3_OCTET_LEADING_P (c1))
1442                 {
1443                   c = (((c1 & 0xF) << 12)
1444                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1445                   if (c < 0x800
1446                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1447                     goto invalid_code;
1448                 }
1449               else
1450                 {
1451                   ONE_MORE_BYTE (c4);
1452                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1453                     goto invalid_code;
1454                   if (UTF_8_4_OCTET_LEADING_P (c1))
1455                     {
1456                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1457                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1458                     if (c < 0x10000)
1459                       goto invalid_code;
1460                     }
1461                   else
1462                     {
1463                       ONE_MORE_BYTE (c5);
1464                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1465                         goto invalid_code;
1466                       if (UTF_8_5_OCTET_LEADING_P (c1))
1467                         {
1468                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1469                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1470                                | (c5 & 0x3F));
1471                           if ((c > MAX_CHAR) || (c < 0x200000))
1472                             goto invalid_code;
1473                         }
1474                       else
1475                         goto invalid_code;
1476                     }
1477                 }
1478             }
1479         }
1480
1481       *charbuf++ = c;
1482       continue;
1483
1484     invalid_code:
1485       src = src_base;
1486       consumed_chars = consumed_chars_base;
1487       ONE_MORE_BYTE (c);
1488       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1489       coding->errors++;
1490     }
1491
1492  no_more_source:
1493   coding->consumed_char += consumed_chars_base;
1494   coding->consumed = src_base - coding->source;
1495   coding->charbuf_used = charbuf - coding->charbuf;
1496 }
1497
1498
1499 static bool
1500 encode_coding_utf_8 (struct coding_system *coding)
1501 {
1502   bool multibytep = coding->dst_multibyte;
1503   int *charbuf = coding->charbuf;
1504   int *charbuf_end = charbuf + coding->charbuf_used;
1505   unsigned char *dst = coding->destination + coding->produced;
1506   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1507   ptrdiff_t produced_chars = 0;
1508   int c;
1509
1510   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1511     {
1512       ASSURE_DESTINATION (3);
1513       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1514       CODING_UTF_8_BOM (coding) = utf_without_bom;
1515     }
1516
1517   if (multibytep)
1518     {
1519       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1520
1521       while (charbuf < charbuf_end)
1522         {
1523           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1524
1525           ASSURE_DESTINATION (safe_room);
1526           c = *charbuf++;
1527           if (CHAR_BYTE8_P (c))
1528             {
1529               c = CHAR_TO_BYTE8 (c);
1530               EMIT_ONE_BYTE (c);
1531             }
1532           else
1533             {
1534               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1535               for (p = str; p < pend; p++)
1536                 EMIT_ONE_BYTE (*p);
1537             }
1538         }
1539     }
1540   else
1541     {
1542       int safe_room = MAX_MULTIBYTE_LENGTH;
1543
1544       while (charbuf < charbuf_end)
1545         {
1546           ASSURE_DESTINATION (safe_room);
1547           c = *charbuf++;
1548           if (CHAR_BYTE8_P (c))
1549             *dst++ = CHAR_TO_BYTE8 (c);
1550           else
1551             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1552           produced_chars++;
1553         }
1554     }
1555   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1556   coding->produced_char += produced_chars;
1557   coding->produced = dst - coding->destination;
1558   return 0;
1559 }
1560
1561
1562 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1563    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1564
1565 #define UTF_16_HIGH_SURROGATE_P(val) \
1566   (((val) & 0xFC00) == 0xD800)
1567
1568 #define UTF_16_LOW_SURROGATE_P(val) \
1569   (((val) & 0xFC00) == 0xDC00)
1570
1571
1572 static bool
1573 detect_coding_utf_16 (struct coding_system *coding,
1574                       struct coding_detection_info *detect_info)
1575 {
1576   const unsigned char *src = coding->source;
1577   const unsigned char *src_end = coding->source + coding->src_bytes;
1578   bool multibytep = coding->src_multibyte;
1579   int c1, c2;
1580
1581   detect_info->checked |= CATEGORY_MASK_UTF_16;
1582   if (coding->mode & CODING_MODE_LAST_BLOCK
1583       && (coding->src_chars & 1))
1584     {
1585       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1586       return 0;
1587     }
1588
1589   TWO_MORE_BYTES (c1, c2);
1590   if ((c1 == 0xFF) && (c2 == 0xFE))
1591     {
1592       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1593                              | CATEGORY_MASK_UTF_16_AUTO);
1594       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1595                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1596                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1597     }
1598   else if ((c1 == 0xFE) && (c2 == 0xFF))
1599     {
1600       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1601                              | CATEGORY_MASK_UTF_16_AUTO);
1602       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1603                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1604                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1605     }
1606   else if (c2 < 0)
1607     {
1608       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1609       return 0;
1610     }
1611   else
1612     {
1613       /* We check the dispersion of Eth and Oth bytes where E is even and
1614          O is odd.  If both are high, we assume binary data.*/
1615       unsigned char e[256], o[256];
1616       unsigned e_num = 1, o_num = 1;
1617
1618       memset (e, 0, 256);
1619       memset (o, 0, 256);
1620       e[c1] = 1;
1621       o[c2] = 1;
1622
1623       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1624                                 |CATEGORY_MASK_UTF_16_BE
1625                                 | CATEGORY_MASK_UTF_16_LE);
1626
1627       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1628              != CATEGORY_MASK_UTF_16)
1629         {
1630           TWO_MORE_BYTES (c1, c2);
1631           if (c2 < 0)
1632             break;
1633           if (! e[c1])
1634             {
1635               e[c1] = 1;
1636               e_num++;
1637               if (e_num >= 128)
1638                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1639             }
1640           if (! o[c2])
1641             {
1642               o[c2] = 1;
1643               o_num++;
1644               if (o_num >= 128)
1645                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1646             }
1647         }
1648       return 0;
1649     }
1650
1651  no_more_source:
1652   return 1;
1653 }
1654
1655 static void
1656 decode_coding_utf_16 (struct coding_system *coding)
1657 {
1658   const unsigned char *src = coding->source + coding->consumed;
1659   const unsigned char *src_end = coding->source + coding->src_bytes;
1660   const unsigned char *src_base;
1661   int *charbuf = coding->charbuf + coding->charbuf_used;
1662   /* We may produces at most 3 chars in one loop.  */
1663   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1664   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1665   bool multibytep = coding->src_multibyte;
1666   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1667   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1668   int surrogate = CODING_UTF_16_SURROGATE (coding);
1669   bool eol_dos
1670     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1671   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1672
1673   if (bom == utf_with_bom)
1674     {
1675       int c, c1, c2;
1676
1677       src_base = src;
1678       ONE_MORE_BYTE (c1);
1679       ONE_MORE_BYTE (c2);
1680       c = (c1 << 8) | c2;
1681
1682       if (endian == utf_16_big_endian
1683           ? c != 0xFEFF : c != 0xFFFE)
1684         {
1685           /* The first two bytes are not BOM.  Treat them as bytes
1686              for a normal character.  */
1687           src = src_base;
1688           coding->errors++;
1689         }
1690       CODING_UTF_16_BOM (coding) = utf_without_bom;
1691     }
1692   else if (bom == utf_detect_bom)
1693     {
1694       /* We have already tried to detect BOM and failed in
1695          detect_coding.  */
1696       CODING_UTF_16_BOM (coding) = utf_without_bom;
1697     }
1698
1699   while (1)
1700     {
1701       int c, c1, c2;
1702
1703       src_base = src;
1704       consumed_chars_base = consumed_chars;
1705
1706       if (charbuf >= charbuf_end)
1707         {
1708           if (byte_after_cr1 >= 0)
1709             src_base -= 2;
1710           break;
1711         }
1712
1713       if (byte_after_cr1 >= 0)
1714         c1 = byte_after_cr1, byte_after_cr1 = -1;
1715       else
1716         ONE_MORE_BYTE (c1);
1717       if (c1 < 0)
1718         {
1719           *charbuf++ = -c1;
1720           continue;
1721         }
1722       if (byte_after_cr2 >= 0)
1723         c2 = byte_after_cr2, byte_after_cr2 = -1;
1724       else
1725         ONE_MORE_BYTE (c2);
1726       if (c2 < 0)
1727         {
1728           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1729           *charbuf++ = -c2;
1730           continue;
1731         }
1732       c = (endian == utf_16_big_endian
1733            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1734
1735       if (surrogate)
1736         {
1737           if (! UTF_16_LOW_SURROGATE_P (c))
1738             {
1739               if (endian == utf_16_big_endian)
1740                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1741               else
1742                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1743               *charbuf++ = c1;
1744               *charbuf++ = c2;
1745               coding->errors++;
1746               if (UTF_16_HIGH_SURROGATE_P (c))
1747                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1748               else
1749                 *charbuf++ = c;
1750             }
1751           else
1752             {
1753               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1754               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1755               *charbuf++ = 0x10000 + c;
1756             }
1757         }
1758       else
1759         {
1760           if (UTF_16_HIGH_SURROGATE_P (c))
1761             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1762           else
1763             {
1764               if (eol_dos && c == '\r')
1765                 {
1766                   ONE_MORE_BYTE (byte_after_cr1);
1767                   ONE_MORE_BYTE (byte_after_cr2);
1768                 }
1769               *charbuf++ = c;
1770             }
1771         }
1772     }
1773
1774  no_more_source:
1775   coding->consumed_char += consumed_chars_base;
1776   coding->consumed = src_base - coding->source;
1777   coding->charbuf_used = charbuf - coding->charbuf;
1778 }
1779
1780 static bool
1781 encode_coding_utf_16 (struct coding_system *coding)
1782 {
1783   bool multibytep = coding->dst_multibyte;
1784   int *charbuf = coding->charbuf;
1785   int *charbuf_end = charbuf + coding->charbuf_used;
1786   unsigned char *dst = coding->destination + coding->produced;
1787   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1788   int safe_room = 8;
1789   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1790   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1791   ptrdiff_t produced_chars = 0;
1792   int c;
1793
1794   if (bom != utf_without_bom)
1795     {
1796       ASSURE_DESTINATION (safe_room);
1797       if (big_endian)
1798         EMIT_TWO_BYTES (0xFE, 0xFF);
1799       else
1800         EMIT_TWO_BYTES (0xFF, 0xFE);
1801       CODING_UTF_16_BOM (coding) = utf_without_bom;
1802     }
1803
1804   while (charbuf < charbuf_end)
1805     {
1806       ASSURE_DESTINATION (safe_room);
1807       c = *charbuf++;
1808       if (c > MAX_UNICODE_CHAR)
1809         c = coding->default_char;
1810
1811       if (c < 0x10000)
1812         {
1813           if (big_endian)
1814             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1815           else
1816             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1817         }
1818       else
1819         {
1820           int c1, c2;
1821
1822           c -= 0x10000;
1823           c1 = (c >> 10) + 0xD800;
1824           c2 = (c & 0x3FF) + 0xDC00;
1825           if (big_endian)
1826             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1827           else
1828             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1829         }
1830     }
1831   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1832   coding->produced = dst - coding->destination;
1833   coding->produced_char += produced_chars;
1834   return 0;
1835 }
1836
1837 \f
1838 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1839
1840 /* Emacs' internal format for representation of multiple character
1841    sets is a kind of multi-byte encoding, i.e. characters are
1842    represented by variable-length sequences of one-byte codes.
1843
1844    ASCII characters and control characters (e.g. `tab', `newline') are
1845    represented by one-byte sequences which are their ASCII codes, in
1846    the range 0x00 through 0x7F.
1847
1848    8-bit characters of the range 0x80..0x9F are represented by
1849    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1850    code + 0x20).
1851
1852    8-bit characters of the range 0xA0..0xFF are represented by
1853    one-byte sequences which are their 8-bit code.
1854
1855    The other characters are represented by a sequence of `base
1856    leading-code', optional `extended leading-code', and one or two
1857    `position-code's.  The length of the sequence is determined by the
1858    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1859    whereas extended leading-code and position-code take the range 0xA0
1860    through 0xFF.  See `charset.h' for more details about leading-code
1861    and position-code.
1862
1863    --- CODE RANGE of Emacs' internal format ---
1864    character set        range
1865    -------------        -----
1866    ascii                0x00..0x7F
1867    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1868    eight-bit-graphic    0xA0..0xBF
1869    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1870    ---------------------------------------------
1871
1872    As this is the internal character representation, the format is
1873    usually not used externally (i.e. in a file or in a data sent to a
1874    process).  But, it is possible to have a text externally in this
1875    format (i.e. by encoding by the coding system `emacs-mule').
1876
1877    In that case, a sequence of one-byte codes has a slightly different
1878    form.
1879
1880    At first, all characters in eight-bit-control are represented by
1881    one-byte sequences which are their 8-bit code.
1882
1883    Next, character composition data are represented by the byte
1884    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1885    where,
1886         METHOD is 0xF2 plus one of composition method (enum
1887         composition_method),
1888
1889         BYTES is 0xA0 plus a byte length of this composition data,
1890
1891         CHARS is 0xA0 plus a number of characters composed by this
1892         data,
1893
1894         COMPONENTs are characters of multibyte form or composition
1895         rules encoded by two-byte of ASCII codes.
1896
1897    In addition, for backward compatibility, the following formats are
1898    also recognized as composition data on decoding.
1899
1900    0x80 MSEQ ...
1901    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1902
1903    Here,
1904         MSEQ is a multibyte form but in these special format:
1905           ASCII: 0xA0 ASCII_CODE+0x80,
1906           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1907         RULE is a one byte code of the range 0xA0..0xF0 that
1908         represents a composition rule.
1909   */
1910
1911 char emacs_mule_bytes[256];
1912
1913
1914 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1915    Return true if a text is encoded in 'emacs-mule'.  */
1916
1917 static bool
1918 detect_coding_emacs_mule (struct coding_system *coding,
1919                           struct coding_detection_info *detect_info)
1920 {
1921   const unsigned char *src = coding->source, *src_base;
1922   const unsigned char *src_end = coding->source + coding->src_bytes;
1923   bool multibytep = coding->src_multibyte;
1924   ptrdiff_t consumed_chars = 0;
1925   int c;
1926   int found = 0;
1927
1928   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1929   /* A coding system of this category is always ASCII compatible.  */
1930   src += coding->head_ascii;
1931
1932   while (1)
1933     {
1934       src_base = src;
1935       ONE_MORE_BYTE (c);
1936       if (c < 0)
1937         continue;
1938       if (c == 0x80)
1939         {
1940           /* Perhaps the start of composite character.  We simply skip
1941              it because analyzing it is too heavy for detecting.  But,
1942              at least, we check that the composite character
1943              constitutes of more than 4 bytes.  */
1944           const unsigned char *src_start;
1945
1946         repeat:
1947           src_start = src;
1948           do
1949             {
1950               ONE_MORE_BYTE (c);
1951             }
1952           while (c >= 0xA0);
1953
1954           if (src - src_start <= 4)
1955             break;
1956           found = CATEGORY_MASK_EMACS_MULE;
1957           if (c == 0x80)
1958             goto repeat;
1959         }
1960
1961       if (c < 0x80)
1962         {
1963           if (c < 0x20
1964               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1965             break;
1966         }
1967       else
1968         {
1969           int more_bytes = emacs_mule_bytes[c] - 1;
1970
1971           while (more_bytes > 0)
1972             {
1973               ONE_MORE_BYTE (c);
1974               if (c < 0xA0)
1975                 {
1976                   src--;        /* Unread the last byte.  */
1977                   break;
1978                 }
1979               more_bytes--;
1980             }
1981           if (more_bytes != 0)
1982             break;
1983           found = CATEGORY_MASK_EMACS_MULE;
1984         }
1985     }
1986   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1987   return 0;
1988
1989  no_more_source:
1990   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1991     {
1992       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1993       return 0;
1994     }
1995   detect_info->found |= found;
1996   return 1;
1997 }
1998
1999
2000 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2001    character.  If CMP_STATUS indicates that we must expect MSEQ or
2002    RULE described above, decode it and return the negative value of
2003    the decoded character or rule.  If an invalid byte is found, return
2004    -1.  If SRC is too short, return -2.  */
2005
2006 static int
2007 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2008                  int *nbytes, int *nchars, int *id,
2009                  struct composition_status *cmp_status)
2010 {
2011   const unsigned char *src_end = coding->source + coding->src_bytes;
2012   const unsigned char *src_base = src;
2013   bool multibytep = coding->src_multibyte;
2014   int charset_ID;
2015   unsigned code;
2016   int c;
2017   ptrdiff_t consumed_chars = 0;
2018   bool mseq_found = 0;
2019
2020   ONE_MORE_BYTE (c);
2021   if (c < 0)
2022     {
2023       c = -c;
2024       charset_ID = emacs_mule_charset[0];
2025     }
2026   else
2027     {
2028       if (c >= 0xA0)
2029         {
2030           if (cmp_status->state != COMPOSING_NO
2031               && cmp_status->old_form)
2032             {
2033               if (cmp_status->state == COMPOSING_CHAR)
2034                 {
2035                   if (c == 0xA0)
2036                     {
2037                       ONE_MORE_BYTE (c);
2038                       c -= 0x80;
2039                       if (c < 0)
2040                         goto invalid_code;
2041                     }
2042                   else
2043                     c -= 0x20;
2044                   mseq_found = 1;
2045                 }
2046               else
2047                 {
2048                   *nbytes = src - src_base;
2049                   *nchars = consumed_chars;
2050                   return -c;
2051                 }
2052             }
2053           else
2054             goto invalid_code;
2055         }
2056
2057       switch (emacs_mule_bytes[c])
2058         {
2059         case 2:
2060           if ((charset_ID = emacs_mule_charset[c]) < 0)
2061             goto invalid_code;
2062           ONE_MORE_BYTE (c);
2063           if (c < 0xA0)
2064             goto invalid_code;
2065           code = c & 0x7F;
2066           break;
2067
2068         case 3:
2069           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2070               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2071             {
2072               ONE_MORE_BYTE (c);
2073               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2074                 goto invalid_code;
2075               ONE_MORE_BYTE (c);
2076               if (c < 0xA0)
2077                 goto invalid_code;
2078               code = c & 0x7F;
2079             }
2080           else
2081             {
2082               if ((charset_ID = emacs_mule_charset[c]) < 0)
2083                 goto invalid_code;
2084               ONE_MORE_BYTE (c);
2085               if (c < 0xA0)
2086                 goto invalid_code;
2087               code = (c & 0x7F) << 8;
2088               ONE_MORE_BYTE (c);
2089               if (c < 0xA0)
2090                 goto invalid_code;
2091               code |= c & 0x7F;
2092             }
2093           break;
2094
2095         case 4:
2096           ONE_MORE_BYTE (c);
2097           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2098             goto invalid_code;
2099           ONE_MORE_BYTE (c);
2100           if (c < 0xA0)
2101             goto invalid_code;
2102           code = (c & 0x7F) << 8;
2103           ONE_MORE_BYTE (c);
2104           if (c < 0xA0)
2105             goto invalid_code;
2106           code |= c & 0x7F;
2107           break;
2108
2109         case 1:
2110           code = c;
2111           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2112           break;
2113
2114         default:
2115           emacs_abort ();
2116         }
2117       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2118                           CHARSET_FROM_ID (charset_ID), code, c);
2119       if (c < 0)
2120         goto invalid_code;
2121     }
2122   *nbytes = src - src_base;
2123   *nchars = consumed_chars;
2124   if (id)
2125     *id = charset_ID;
2126   return (mseq_found ? -c : c);
2127
2128  no_more_source:
2129   return -2;
2130
2131  invalid_code:
2132   return -1;
2133 }
2134
2135
2136 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2137
2138 /* Handle these composition sequence ('|': the end of header elements,
2139    BYTES and CHARS >= 0xA0):
2140
2141    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2142    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2143    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2144
2145    and these old form:
2146
2147    (4) relative composition: 0x80 | MSEQ ... MSEQ
2148    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2149
2150    When the starter 0x80 and the following header elements are found,
2151    this annotation header is produced.
2152
2153         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2154
2155    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2156    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2157
2158    Then, upon reading the following elements, these codes are produced
2159    until the composition end is found:
2160
2161    (1) CHAR ... CHAR
2162    (2) ALT ... ALT CHAR ... CHAR
2163    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2164    (4) CHAR ... CHAR
2165    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2166
2167    When the composition end is found, LENGTH and NCHARS in the
2168    annotation header is updated as below:
2169
2170    (1) LENGTH: unchanged, NCHARS: unchanged
2171    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2172    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2174    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2175
2176    If an error is found while composing, the annotation header is
2177    changed to the original composition header (plus filler -1s) as
2178    below:
2179
2180    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2181    (5)          [ 0x80 0xFF -1 -1- -1 ]
2182
2183    and the sequence [ -2 DECODED-RULE ] is changed to the original
2184    byte sequence as below:
2185         o the original byte sequence is B: [ B -1 ]
2186         o the original byte sequence is B1 B2: [ B1 B2 ]
2187
2188    Most of the routines are implemented by macros because many
2189    variables and labels in the caller decode_coding_emacs_mule must be
2190    accessible, and they are usually called just once (thus doesn't
2191    increase the size of compiled object).  */
2192
2193 /* Decode a composition rule represented by C as a component of
2194    composition sequence of Emacs 20 style.  Set RULE to the decoded
2195    rule. */
2196
2197 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2198   do {                                                  \
2199     int gref, nref;                                     \
2200                                                         \
2201     c -= 0xA0;                                          \
2202     if (c < 0 || c >= 81)                               \
2203       goto invalid_code;                                \
2204     gref = c / 9, nref = c % 9;                         \
2205     if (gref == 4) gref = 10;                           \
2206     if (nref == 4) nref = 10;                           \
2207     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2208   } while (0)
2209
2210
2211 /* Decode a composition rule represented by C and the following byte
2212    at SRC as a component of composition sequence of Emacs 21 style.
2213    Set RULE to the decoded rule.  */
2214
2215 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2216   do {                                                  \
2217     int gref, nref;                                     \
2218                                                         \
2219     gref = c - 0x20;                                    \
2220     if (gref < 0 || gref >= 81)                         \
2221       goto invalid_code;                                \
2222     ONE_MORE_BYTE (c);                                  \
2223     nref = c - 0x20;                                    \
2224     if (nref < 0 || nref >= 81)                         \
2225       goto invalid_code;                                \
2226     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2227   } while (0)
2228
2229
2230 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2231    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2232    byte length of this composition information, CHARS is the number of
2233    characters composed by this composition.  */
2234
2235 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2236   do {                                                                  \
2237     enum composition_method method = c - 0xF2;                          \
2238     int nbytes, nchars;                                                 \
2239                                                                         \
2240     ONE_MORE_BYTE (c);                                                  \
2241     if (c < 0)                                                          \
2242       goto invalid_code;                                                \
2243     nbytes = c - 0xA0;                                                  \
2244     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2245       goto invalid_code;                                                \
2246     ONE_MORE_BYTE (c);                                                  \
2247     nchars = c - 0xA0;                                                  \
2248     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2249       goto invalid_code;                                                \
2250     cmp_status->old_form = 0;                                           \
2251     cmp_status->method = method;                                        \
2252     if (method == COMPOSITION_RELATIVE)                                 \
2253       cmp_status->state = COMPOSING_CHAR;                               \
2254     else                                                                \
2255       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2256     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2257     cmp_status->nchars = nchars;                                        \
2258     cmp_status->ncomps = nbytes - 4;                                    \
2259     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2260   } while (0)
2261
2262
2263 /* Start of Emacs 20 style format for relative composition.  */
2264
2265 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2266   do {                                                          \
2267     cmp_status->old_form = 1;                                   \
2268     cmp_status->method = COMPOSITION_RELATIVE;                  \
2269     cmp_status->state = COMPOSING_CHAR;                         \
2270     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2271     cmp_status->nchars = cmp_status->ncomps = 0;                \
2272     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2273   } while (0)
2274
2275
2276 /* Start of Emacs 20 style format for rule-base composition.  */
2277
2278 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2279   do {                                                          \
2280     cmp_status->old_form = 1;                                   \
2281     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2282     cmp_status->state = COMPOSING_CHAR;                         \
2283     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2284     cmp_status->nchars = cmp_status->ncomps = 0;                \
2285     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2286   } while (0)
2287
2288
2289 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2290   do {                                                  \
2291     const unsigned char *current_src = src;             \
2292                                                         \
2293     ONE_MORE_BYTE (c);                                  \
2294     if (c < 0)                                          \
2295       goto invalid_code;                                \
2296     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2297         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2298       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2299     else if (c < 0xA0)                                  \
2300       goto invalid_code;                                \
2301     else if (c < 0xC0)                                  \
2302       {                                                 \
2303         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2304         /* Re-read C as a composition component.  */    \
2305         src = current_src;                              \
2306       }                                                 \
2307     else if (c == 0xFF)                                 \
2308       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2309     else                                                \
2310       goto invalid_code;                                \
2311   } while (0)
2312
2313 #define EMACS_MULE_COMPOSITION_END()                            \
2314   do {                                                          \
2315     int idx = - cmp_status->length;                             \
2316                                                                 \
2317     if (cmp_status->old_form)                                   \
2318       charbuf[idx + 2] = cmp_status->nchars;                    \
2319     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2320       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2321     cmp_status->state = COMPOSING_NO;                           \
2322   } while (0)
2323
2324
2325 static int
2326 emacs_mule_finish_composition (int *charbuf,
2327                                struct composition_status *cmp_status)
2328 {
2329   int idx = - cmp_status->length;
2330   int new_chars;
2331
2332   if (cmp_status->old_form && cmp_status->nchars > 0)
2333     {
2334       charbuf[idx + 2] = cmp_status->nchars;
2335       new_chars = 0;
2336       if (cmp_status->method == COMPOSITION_WITH_RULE
2337           && cmp_status->state == COMPOSING_CHAR)
2338         {
2339           /* The last rule was invalid.  */
2340           int rule = charbuf[-1] + 0xA0;
2341
2342           charbuf[-2] = BYTE8_TO_CHAR (rule);
2343           charbuf[-1] = -1;
2344           new_chars = 1;
2345         }
2346     }
2347   else
2348     {
2349       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2350
2351       if (cmp_status->method == COMPOSITION_WITH_RULE)
2352         {
2353           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2354           charbuf[idx++] = -3;
2355           charbuf[idx++] = 0;
2356           new_chars = 1;
2357         }
2358       else
2359         {
2360           int nchars = charbuf[idx + 1] + 0xA0;
2361           int nbytes = charbuf[idx + 2] + 0xA0;
2362
2363           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2364           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2365           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2366           charbuf[idx++] = -1;
2367           new_chars = 4;
2368         }
2369     }
2370   cmp_status->state = COMPOSING_NO;
2371   return new_chars;
2372 }
2373
2374 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2375   do {                                                                    \
2376     if (cmp_status->state != COMPOSING_NO)                                \
2377       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2378   } while (0)
2379
2380
2381 static void
2382 decode_coding_emacs_mule (struct coding_system *coding)
2383 {
2384   const unsigned char *src = coding->source + coding->consumed;
2385   const unsigned char *src_end = coding->source + coding->src_bytes;
2386   const unsigned char *src_base;
2387   int *charbuf = coding->charbuf + coding->charbuf_used;
2388   /* We may produce two annotations (charset and composition) in one
2389      loop and one more charset annotation at the end.  */
2390   int *charbuf_end
2391     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2392       /* We can produce up to 2 characters in a loop.  */
2393       - 1;
2394   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2395   bool multibytep = coding->src_multibyte;
2396   ptrdiff_t char_offset = coding->produced_char;
2397   ptrdiff_t last_offset = char_offset;
2398   int last_id = charset_ascii;
2399   bool eol_dos
2400     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2401   int byte_after_cr = -1;
2402   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2403
2404   if (cmp_status->state != COMPOSING_NO)
2405     {
2406       int i;
2407
2408       if (charbuf_end - charbuf < cmp_status->length)
2409         emacs_abort ();
2410       for (i = 0; i < cmp_status->length; i++)
2411         *charbuf++ = cmp_status->carryover[i];
2412       coding->annotated = 1;
2413     }
2414
2415   while (1)
2416     {
2417       int c, id IF_LINT (= 0);
2418
2419       src_base = src;
2420       consumed_chars_base = consumed_chars;
2421
2422       if (charbuf >= charbuf_end)
2423         {
2424           if (byte_after_cr >= 0)
2425             src_base--;
2426           break;
2427         }
2428
2429       if (byte_after_cr >= 0)
2430         c = byte_after_cr, byte_after_cr = -1;
2431       else
2432         ONE_MORE_BYTE (c);
2433
2434       if (c < 0 || c == 0x80)
2435         {
2436           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2437           if (c < 0)
2438             {
2439               *charbuf++ = -c;
2440               char_offset++;
2441             }
2442           else
2443             DECODE_EMACS_MULE_COMPOSITION_START ();
2444           continue;
2445         }
2446
2447       if (c < 0x80)
2448         {
2449           if (eol_dos && c == '\r')
2450             ONE_MORE_BYTE (byte_after_cr);
2451           id = charset_ascii;
2452           if (cmp_status->state != COMPOSING_NO)
2453             {
2454               if (cmp_status->old_form)
2455                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2457                 cmp_status->ncomps--;
2458             }
2459         }
2460       else
2461         {
2462           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2463           /* emacs_mule_char can load a charset map from a file, which
2464              allocates a large structure and might cause buffer text
2465              to be relocated as result.  Thus, we need to remember the
2466              original pointer to buffer text, and fix up all related
2467              pointers after the call.  */
2468           const unsigned char *orig = coding->source;
2469           ptrdiff_t offset;
2470
2471           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2472                                cmp_status);
2473           offset = coding->source - orig;
2474           if (offset)
2475             {
2476               src += offset;
2477               src_base += offset;
2478               src_end += offset;
2479             }
2480           if (c < 0)
2481             {
2482               if (c == -1)
2483                 goto invalid_code;
2484               if (c == -2)
2485                 break;
2486             }
2487           src = src_base + nbytes;
2488           consumed_chars = consumed_chars_base + nchars;
2489           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2490             cmp_status->ncomps -= nchars;
2491         }
2492
2493       /* Now if C >= 0, we found a normally encoded character, if C <
2494          0, we found an old-style composition component character or
2495          rule.  */
2496
2497       if (cmp_status->state == COMPOSING_NO)
2498         {
2499           if (last_id != id)
2500             {
2501               if (last_id != charset_ascii)
2502                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2503                                   last_id);
2504               last_id = id;
2505               last_offset = char_offset;
2506             }
2507           *charbuf++ = c;
2508           char_offset++;
2509         }
2510       else if (cmp_status->state == COMPOSING_CHAR)
2511         {
2512           if (cmp_status->old_form)
2513             {
2514               if (c >= 0)
2515                 {
2516                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2517                   *charbuf++ = c;
2518                   char_offset++;
2519                 }
2520               else
2521                 {
2522                   *charbuf++ = -c;
2523                   cmp_status->nchars++;
2524                   cmp_status->length++;
2525                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2526                     EMACS_MULE_COMPOSITION_END ();
2527                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2528                     cmp_status->state = COMPOSING_RULE;
2529                 }
2530             }
2531           else
2532             {
2533               *charbuf++ = c;
2534               cmp_status->length++;
2535               cmp_status->nchars--;
2536               if (cmp_status->nchars == 0)
2537                 EMACS_MULE_COMPOSITION_END ();
2538             }
2539         }
2540       else if (cmp_status->state == COMPOSING_RULE)
2541         {
2542           int rule;
2543
2544           if (c >= 0)
2545             {
2546               EMACS_MULE_COMPOSITION_END ();
2547               *charbuf++ = c;
2548               char_offset++;
2549             }
2550           else
2551             {
2552               c = -c;
2553               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2554               if (rule < 0)
2555                 goto invalid_code;
2556               *charbuf++ = -2;
2557               *charbuf++ = rule;
2558               cmp_status->length += 2;
2559               cmp_status->state = COMPOSING_CHAR;
2560             }
2561         }
2562       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2563         {
2564           *charbuf++ = c;
2565           cmp_status->length++;
2566           if (cmp_status->ncomps == 0)
2567             cmp_status->state = COMPOSING_CHAR;
2568           else if (cmp_status->ncomps > 0)
2569             {
2570               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2571                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2572             }
2573           else
2574             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2575         }
2576       else                      /* COMPOSING_COMPONENT_RULE */
2577         {
2578           int rule;
2579
2580           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2581           if (rule < 0)
2582             goto invalid_code;
2583           *charbuf++ = -2;
2584           *charbuf++ = rule;
2585           cmp_status->length += 2;
2586           cmp_status->ncomps--;
2587           if (cmp_status->ncomps > 0)
2588             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2589           else
2590             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2591         }
2592       continue;
2593
2594     invalid_code:
2595       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2596       src = src_base;
2597       consumed_chars = consumed_chars_base;
2598       ONE_MORE_BYTE (c);
2599       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2600       char_offset++;
2601       coding->errors++;
2602     }
2603
2604  no_more_source:
2605   if (cmp_status->state != COMPOSING_NO)
2606     {
2607       if (coding->mode & CODING_MODE_LAST_BLOCK)
2608         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2609       else
2610         {
2611           int i;
2612
2613           charbuf -= cmp_status->length;
2614           for (i = 0; i < cmp_status->length; i++)
2615             cmp_status->carryover[i] = charbuf[i];
2616         }
2617     }
2618   if (last_id != charset_ascii)
2619     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2620   coding->consumed_char += consumed_chars_base;
2621   coding->consumed = src_base - coding->source;
2622   coding->charbuf_used = charbuf - coding->charbuf;
2623 }
2624
2625
2626 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2627   do {                                          \
2628     if (id < 0xA0)                              \
2629       codes[0] = id, codes[1] = 0;              \
2630     else if (id < 0xE0)                         \
2631       codes[0] = 0x9A, codes[1] = id;           \
2632     else if (id < 0xF0)                         \
2633       codes[0] = 0x9B, codes[1] = id;           \
2634     else if (id < 0xF5)                         \
2635       codes[0] = 0x9C, codes[1] = id;           \
2636     else                                        \
2637       codes[0] = 0x9D, codes[1] = id;           \
2638   } while (0);
2639
2640
2641 static bool
2642 encode_coding_emacs_mule (struct coding_system *coding)
2643 {
2644   bool multibytep = coding->dst_multibyte;
2645   int *charbuf = coding->charbuf;
2646   int *charbuf_end = charbuf + coding->charbuf_used;
2647   unsigned char *dst = coding->destination + coding->produced;
2648   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2649   int safe_room = 8;
2650   ptrdiff_t produced_chars = 0;
2651   Lisp_Object attrs, charset_list;
2652   int c;
2653   int preferred_charset_id = -1;
2654
2655   CODING_GET_INFO (coding, attrs, charset_list);
2656   if (! EQ (charset_list, Vemacs_mule_charset_list))
2657     {
2658       charset_list = Vemacs_mule_charset_list;
2659       ASET (attrs, coding_attr_charset_list, charset_list);
2660     }
2661
2662   while (charbuf < charbuf_end)
2663     {
2664       ASSURE_DESTINATION (safe_room);
2665       c = *charbuf++;
2666
2667       if (c < 0)
2668         {
2669           /* Handle an annotation.  */
2670           switch (*charbuf)
2671             {
2672             case CODING_ANNOTATE_COMPOSITION_MASK:
2673               /* Not yet implemented.  */
2674               break;
2675             case CODING_ANNOTATE_CHARSET_MASK:
2676               preferred_charset_id = charbuf[3];
2677               if (preferred_charset_id >= 0
2678                   && NILP (Fmemq (make_number (preferred_charset_id),
2679                                   charset_list)))
2680                 preferred_charset_id = -1;
2681               break;
2682             default:
2683               emacs_abort ();
2684             }
2685           charbuf += -c - 1;
2686           continue;
2687         }
2688
2689       if (ASCII_CHAR_P (c))
2690         EMIT_ONE_ASCII_BYTE (c);
2691       else if (CHAR_BYTE8_P (c))
2692         {
2693           c = CHAR_TO_BYTE8 (c);
2694           EMIT_ONE_BYTE (c);
2695         }
2696       else
2697         {
2698           struct charset *charset;
2699           unsigned code;
2700           int dimension;
2701           int emacs_mule_id;
2702           unsigned char leading_codes[2];
2703
2704           if (preferred_charset_id >= 0)
2705             {
2706               bool result;
2707
2708               charset = CHARSET_FROM_ID (preferred_charset_id);
2709               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2710               if (result)
2711                 code = ENCODE_CHAR (charset, c);
2712               else
2713                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2714                                      &code, charset);
2715             }
2716           else
2717             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2718                                  &code, charset);
2719           if (! charset)
2720             {
2721               c = coding->default_char;
2722               if (ASCII_CHAR_P (c))
2723                 {
2724                   EMIT_ONE_ASCII_BYTE (c);
2725                   continue;
2726                 }
2727               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2728                                    &code, charset);
2729             }
2730           dimension = CHARSET_DIMENSION (charset);
2731           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2732           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2733           EMIT_ONE_BYTE (leading_codes[0]);
2734           if (leading_codes[1])
2735             EMIT_ONE_BYTE (leading_codes[1]);
2736           if (dimension == 1)
2737             EMIT_ONE_BYTE (code | 0x80);
2738           else
2739             {
2740               code |= 0x8080;
2741               EMIT_ONE_BYTE (code >> 8);
2742               EMIT_ONE_BYTE (code & 0xFF);
2743             }
2744         }
2745     }
2746   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2747   coding->produced_char += produced_chars;
2748   coding->produced = dst - coding->destination;
2749   return 0;
2750 }
2751
2752 \f
2753 /*** 7. ISO2022 handlers ***/
2754
2755 /* The following note describes the coding system ISO2022 briefly.
2756    Since the intention of this note is to help understand the
2757    functions in this file, some parts are NOT ACCURATE or are OVERLY
2758    SIMPLIFIED.  For thorough understanding, please refer to the
2759    original document of ISO2022.  This is equivalent to the standard
2760    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2761
2762    ISO2022 provides many mechanisms to encode several character sets
2763    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2764    is encoded using bytes less than 128.  This may make the encoded
2765    text a little bit longer, but the text passes more easily through
2766    several types of gateway, some of which strip off the MSB (Most
2767    Significant Bit).
2768
2769    There are two kinds of character sets: control character sets and
2770    graphic character sets.  The former contain control characters such
2771    as `newline' and `escape' to provide control functions (control
2772    functions are also provided by escape sequences).  The latter
2773    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2774    two control character sets and many graphic character sets.
2775
2776    Graphic character sets are classified into one of the following
2777    four classes, according to the number of bytes (DIMENSION) and
2778    number of characters in one dimension (CHARS) of the set:
2779    - DIMENSION1_CHARS94
2780    - DIMENSION1_CHARS96
2781    - DIMENSION2_CHARS94
2782    - DIMENSION2_CHARS96
2783
2784    In addition, each character set is assigned an identification tag,
2785    unique for each set, called the "final character" (denoted as <F>
2786    hereafter).  The <F> of each character set is decided by ECMA(*)
2787    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2788    (0x30..0x3F are for private use only).
2789
2790    Note (*): ECMA = European Computer Manufacturers Association
2791
2792    Here are examples of graphic character sets [NAME(<F>)]:
2793         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2794         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2795         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2796         o DIMENSION2_CHARS96 -- none for the moment
2797
2798    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2799         C0 [0x00..0x1F] -- control character plane 0
2800         GL [0x20..0x7F] -- graphic character plane 0
2801         C1 [0x80..0x9F] -- control character plane 1
2802         GR [0xA0..0xFF] -- graphic character plane 1
2803
2804    A control character set is directly designated and invoked to C0 or
2805    C1 by an escape sequence.  The most common case is that:
2806    - ISO646's  control character set is designated/invoked to C0, and
2807    - ISO6429's control character set is designated/invoked to C1,
2808    and usually these designations/invocations are omitted in encoded
2809    text.  In a 7-bit environment, only C0 can be used, and a control
2810    character for C1 is encoded by an appropriate escape sequence to
2811    fit into the environment.  All control characters for C1 are
2812    defined to have corresponding escape sequences.
2813
2814    A graphic character set is at first designated to one of four
2815    graphic registers (G0 through G3), then these graphic registers are
2816    invoked to GL or GR.  These designations and invocations can be
2817    done independently.  The most common case is that G0 is invoked to
2818    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2819    these invocations and designations are omitted in encoded text.
2820    In a 7-bit environment, only GL can be used.
2821
2822    When a graphic character set of CHARS94 is invoked to GL, codes
2823    0x20 and 0x7F of the GL area work as control characters SPACE and
2824    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2825    be used.
2826
2827    There are two ways of invocation: locking-shift and single-shift.
2828    With locking-shift, the invocation lasts until the next different
2829    invocation, whereas with single-shift, the invocation affects the
2830    following character only and doesn't affect the locking-shift
2831    state.  Invocations are done by the following control characters or
2832    escape sequences:
2833
2834    ----------------------------------------------------------------------
2835    abbrev  function                  cntrl escape seq   description
2836    ----------------------------------------------------------------------
2837    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2838    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2839    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2840    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2841    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2842    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2843    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2844    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2845    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2846    ----------------------------------------------------------------------
2847    (*) These are not used by any known coding system.
2848
2849    Control characters for these functions are defined by macros
2850    ISO_CODE_XXX in `coding.h'.
2851
2852    Designations are done by the following escape sequences:
2853    ----------------------------------------------------------------------
2854    escape sequence      description
2855    ----------------------------------------------------------------------
2856    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2857    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2858    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2859    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2860    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2861    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2862    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2863    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2864    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2865    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2866    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2867    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2868    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2869    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2870    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2871    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2872    ----------------------------------------------------------------------
2873
2874    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2875    of dimension 1, chars 94, and final character <F>, etc...
2876
2877    Note (*): Although these designations are not allowed in ISO2022,
2878    Emacs accepts them on decoding, and produces them on encoding
2879    CHARS96 character sets in a coding system which is characterized as
2880    7-bit environment, non-locking-shift, and non-single-shift.
2881
2882    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2883    '(' must be omitted.  We refer to this as "short-form" hereafter.
2884
2885    Now you may notice that there are a lot of ways of encoding the
2886    same multilingual text in ISO2022.  Actually, there exist many
2887    coding systems such as Compound Text (used in X11's inter client
2888    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2889    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2890    localized platforms), and all of these are variants of ISO2022.
2891
2892    In addition to the above, Emacs handles two more kinds of escape
2893    sequences: ISO6429's direction specification and Emacs' private
2894    sequence for specifying character composition.
2895
2896    ISO6429's direction specification takes the following form:
2897         o CSI ']'      -- end of the current direction
2898         o CSI '0' ']'  -- end of the current direction
2899         o CSI '1' ']'  -- start of left-to-right text
2900         o CSI '2' ']'  -- start of right-to-left text
2901    The control character CSI (0x9B: control sequence introducer) is
2902    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2903
2904    Character composition specification takes the following form:
2905         o ESC '0' -- start relative composition
2906         o ESC '1' -- end composition
2907         o ESC '2' -- start rule-base composition (*)
2908         o ESC '3' -- start relative composition with alternate chars  (**)
2909         o ESC '4' -- start rule-base composition with alternate chars  (**)
2910   Since these are not standard escape sequences of any ISO standard,
2911   the use of them with these meanings is restricted to Emacs only.
2912
2913   (*) This form is used only in Emacs 20.7 and older versions,
2914   but newer versions can safely decode it.
2915   (**) This form is used only in Emacs 21.1 and newer versions,
2916   and older versions can't decode it.
2917
2918   Here's a list of example usages of these composition escape
2919   sequences (categorized by `enum composition_method').
2920
2921   COMPOSITION_RELATIVE:
2922         ESC 0 CHAR [ CHAR ] ESC 1
2923   COMPOSITION_WITH_RULE:
2924         ESC 2 CHAR [ RULE CHAR ] ESC 1
2925   COMPOSITION_WITH_ALTCHARS:
2926         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2927   COMPOSITION_WITH_RULE_ALTCHARS:
2928         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2929
2930 static enum iso_code_class_type iso_code_class[256];
2931
2932 #define SAFE_CHARSET_P(coding, id)      \
2933   ((id) <= (coding)->max_charset_id     \
2934    && (coding)->safe_charsets[id] != 255)
2935
2936 static void
2937 setup_iso_safe_charsets (Lisp_Object attrs)
2938 {
2939   Lisp_Object charset_list, safe_charsets;
2940   Lisp_Object request;
2941   Lisp_Object reg_usage;
2942   Lisp_Object tail;
2943   EMACS_INT reg94, reg96;
2944   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2945   int max_charset_id;
2946
2947   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2948   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2949       && ! EQ (charset_list, Viso_2022_charset_list))
2950     {
2951       charset_list = Viso_2022_charset_list;
2952       ASET (attrs, coding_attr_charset_list, charset_list);
2953       ASET (attrs, coding_attr_safe_charsets, Qnil);
2954     }
2955
2956   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2957     return;
2958
2959   max_charset_id = 0;
2960   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2961     {
2962       int id = XINT (XCAR (tail));
2963       if (max_charset_id < id)
2964         max_charset_id = id;
2965     }
2966
2967   safe_charsets = make_uninit_string (max_charset_id + 1);
2968   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2969   request = AREF (attrs, coding_attr_iso_request);
2970   reg_usage = AREF (attrs, coding_attr_iso_usage);
2971   reg94 = XINT (XCAR (reg_usage));
2972   reg96 = XINT (XCDR (reg_usage));
2973
2974   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2975     {
2976       Lisp_Object id;
2977       Lisp_Object reg;
2978       struct charset *charset;
2979
2980       id = XCAR (tail);
2981       charset = CHARSET_FROM_ID (XINT (id));
2982       reg = Fcdr (Fassq (id, request));
2983       if (! NILP (reg))
2984         SSET (safe_charsets, XINT (id), XINT (reg));
2985       else if (charset->iso_chars_96)
2986         {
2987           if (reg96 < 4)
2988             SSET (safe_charsets, XINT (id), reg96);
2989         }
2990       else
2991         {
2992           if (reg94 < 4)
2993             SSET (safe_charsets, XINT (id), reg94);
2994         }
2995     }
2996   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2997 }
2998
2999
3000 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3001    Return true if a text is encoded in one of ISO-2022 based coding
3002    systems.  */
3003
3004 static bool
3005 detect_coding_iso_2022 (struct coding_system *coding,
3006                         struct coding_detection_info *detect_info)
3007 {
3008   const unsigned char *src = coding->source, *src_base = src;
3009   const unsigned char *src_end = coding->source + coding->src_bytes;
3010   bool multibytep = coding->src_multibyte;
3011   bool single_shifting = 0;
3012   int id;
3013   int c, c1;
3014   ptrdiff_t consumed_chars = 0;
3015   int i;
3016   int rejected = 0;
3017   int found = 0;
3018   int composition_count = -1;
3019
3020   detect_info->checked |= CATEGORY_MASK_ISO;
3021
3022   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3023     {
3024       struct coding_system *this = &(coding_categories[i]);
3025       Lisp_Object attrs, val;
3026
3027       if (this->id < 0)
3028         continue;
3029       attrs = CODING_ID_ATTRS (this->id);
3030       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3031           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3032         setup_iso_safe_charsets (attrs);
3033       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3034       this->max_charset_id = SCHARS (val) - 1;
3035       this->safe_charsets = SDATA (val);
3036     }
3037
3038   /* A coding system of this category is always ASCII compatible.  */
3039   src += coding->head_ascii;
3040
3041   while (rejected != CATEGORY_MASK_ISO)
3042     {
3043       src_base = src;
3044       ONE_MORE_BYTE (c);
3045       switch (c)
3046         {
3047         case ISO_CODE_ESC:
3048           if (inhibit_iso_escape_detection)
3049             break;
3050           single_shifting = 0;
3051           ONE_MORE_BYTE (c);
3052           if (c == 'N' || c == 'O')
3053             {
3054               /* ESC <Fe> for SS2 or SS3.  */
3055               single_shifting = 1;
3056               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3057             }
3058           else if (c == '1')
3059             {
3060               /* End of composition.  */
3061               if (composition_count < 0
3062                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3063                 /* Invalid */
3064                 break;
3065               composition_count = -1;
3066               found |= CATEGORY_MASK_ISO;
3067             }
3068           else if (c >= '0' && c <= '4')
3069             {
3070               /* ESC <Fp> for start/end composition.  */
3071               composition_count = 0;
3072             }
3073           else
3074             {
3075               if (c >= '(' && c <= '/')
3076                 {
3077                   /* Designation sequence for a charset of dimension 1.  */
3078                   ONE_MORE_BYTE (c1);
3079                   if (c1 < ' ' || c1 >= 0x80
3080                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3081                     /* Invalid designation sequence.  Just ignore.  */
3082                     break;
3083                 }
3084               else if (c == '$')
3085                 {
3086                   /* Designation sequence for a charset of dimension 2.  */
3087                   ONE_MORE_BYTE (c);
3088                   if (c >= '@' && c <= 'B')
3089                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3090                     id = iso_charset_table[1][0][c];
3091                   else if (c >= '(' && c <= '/')
3092                     {
3093                       ONE_MORE_BYTE (c1);
3094                       if (c1 < ' ' || c1 >= 0x80
3095                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3096                         /* Invalid designation sequence.  Just ignore.  */
3097                         break;
3098                     }
3099                   else
3100                     /* Invalid designation sequence.  Just ignore it.  */
3101                     break;
3102                 }
3103               else
3104                 {
3105                   /* Invalid escape sequence.  Just ignore it.  */
3106                   break;
3107                 }
3108
3109               /* We found a valid designation sequence for CHARSET.  */
3110               rejected |= CATEGORY_MASK_ISO_8BIT;
3111               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3112                                   id))
3113                 found |= CATEGORY_MASK_ISO_7;
3114               else
3115                 rejected |= CATEGORY_MASK_ISO_7;
3116               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3117                                   id))
3118                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3119               else
3120                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3121               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3122                                   id))
3123                 found |= CATEGORY_MASK_ISO_7_ELSE;
3124               else
3125                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3126               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3127                                   id))
3128                 found |= CATEGORY_MASK_ISO_8_ELSE;
3129               else
3130                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3131             }
3132           break;
3133
3134         case ISO_CODE_SO:
3135         case ISO_CODE_SI:
3136           /* Locking shift out/in.  */
3137           if (inhibit_iso_escape_detection)
3138             break;
3139           single_shifting = 0;
3140           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3141           break;
3142
3143         case ISO_CODE_CSI:
3144           /* Control sequence introducer.  */
3145           single_shifting = 0;
3146           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3147           found |= CATEGORY_MASK_ISO_8_ELSE;
3148           goto check_extra_latin;
3149
3150         case ISO_CODE_SS2:
3151         case ISO_CODE_SS3:
3152           /* Single shift.   */
3153           if (inhibit_iso_escape_detection)
3154             break;
3155           single_shifting = 0;
3156           rejected |= CATEGORY_MASK_ISO_7BIT;
3157           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3158               & CODING_ISO_FLAG_SINGLE_SHIFT)
3159             {
3160               found |= CATEGORY_MASK_ISO_8_1;
3161               single_shifting = 1;
3162             }
3163           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3164               & CODING_ISO_FLAG_SINGLE_SHIFT)
3165             {
3166               found |= CATEGORY_MASK_ISO_8_2;
3167               single_shifting = 1;
3168             }
3169           if (single_shifting)
3170             break;
3171           goto check_extra_latin;
3172
3173         default:
3174           if (c < 0)
3175             continue;
3176           if (c < 0x80)
3177             {
3178               if (composition_count >= 0)
3179                 composition_count++;
3180               single_shifting = 0;
3181               break;
3182             }
3183           if (c >= 0xA0)
3184             {
3185               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3186               found |= CATEGORY_MASK_ISO_8_1;
3187               /* Check the length of succeeding codes of the range
3188                  0xA0..0FF.  If the byte length is even, we include
3189                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3190                  only when we are not single shifting.  */
3191               if (! single_shifting
3192                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3193                 {
3194                   ptrdiff_t len = 1;
3195                   while (src < src_end)
3196                     {
3197                       src_base = src;
3198                       ONE_MORE_BYTE (c);
3199                       if (c < 0xA0)
3200                         {
3201                           src = src_base;
3202                           break;
3203                         }
3204                       len++;
3205                     }
3206
3207                   if (len & 1 && src < src_end)
3208                     {
3209                       rejected |= CATEGORY_MASK_ISO_8_2;
3210                       if (composition_count >= 0)
3211                         composition_count += len;
3212                     }
3213                   else
3214                     {
3215                       found |= CATEGORY_MASK_ISO_8_2;
3216                       if (composition_count >= 0)
3217                         composition_count += len / 2;
3218                     }
3219                 }
3220               break;
3221             }
3222         check_extra_latin:
3223           if (! VECTORP (Vlatin_extra_code_table)
3224               || NILP (AREF (Vlatin_extra_code_table, c)))
3225             {
3226               rejected = CATEGORY_MASK_ISO;
3227               break;
3228             }
3229           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3230               & CODING_ISO_FLAG_LATIN_EXTRA)
3231             found |= CATEGORY_MASK_ISO_8_1;
3232           else
3233             rejected |= CATEGORY_MASK_ISO_8_1;
3234           rejected |= CATEGORY_MASK_ISO_8_2;
3235           break;
3236         }
3237     }
3238   detect_info->rejected |= CATEGORY_MASK_ISO;
3239   return 0;
3240
3241  no_more_source:
3242   detect_info->rejected |= rejected;
3243   detect_info->found |= (found & ~rejected);
3244   return 1;
3245 }
3246
3247
3248 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3249    escape sequence should be kept.  */
3250 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3251   do {                                                                  \
3252     int id, prev;                                                       \
3253                                                                         \
3254     if (final < '0' || final >= 128                                     \
3255         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3256         || !SAFE_CHARSET_P (coding, id))                                \
3257       {                                                                 \
3258         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3259         chars_96 = -1;                                                  \
3260         break;                                                          \
3261       }                                                                 \
3262     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3263     if (id == charset_jisx0201_roman)                                   \
3264       {                                                                 \
3265         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3266           id = charset_ascii;                                           \
3267       }                                                                 \
3268     else if (id == charset_jisx0208_1978)                               \
3269       {                                                                 \
3270         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3271           id = charset_jisx0208;                                        \
3272       }                                                                 \
3273     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3274     /* If there was an invalid designation to REG previously, and this  \
3275        designation is ASCII to REG, we should keep this designation     \
3276        sequence.  */                                                    \
3277     if (prev == -2 && id == charset_ascii)                              \
3278       chars_96 = -1;                                                    \
3279   } while (0)
3280
3281
3282 /* Handle these composition sequence (ALT: alternate char):
3283
3284    (1) relative composition: ESC 0 CHAR ... ESC 1
3285    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3286    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3287    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3288
3289    When the start sequence (ESC 0/2/3/4) is found, this annotation
3290    header is produced.
3291
3292         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3293
3294    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3295    produced until the end sequence (ESC 1) is found:
3296
3297    (1) CHAR ... CHAR
3298    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3299    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3300    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3301
3302    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3303    annotation header is updated as below:
3304
3305    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3306    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3307    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3308    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3309
3310    If an error is found while composing, the annotation header is
3311    changed to:
3312
3313         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3314
3315    and the sequence [ -2 DECODED-RULE ] is changed to the original
3316    byte sequence as below:
3317         o the original byte sequence is B: [ B -1 ]
3318         o the original byte sequence is B1 B2: [ B1 B2 ]
3319    and the sequence [ -1 -1 ] is changed to the original byte
3320    sequence:
3321         [ ESC '0' ]
3322 */
3323
3324 /* Decode a composition rule C1 and maybe one more byte from the
3325    source, and set RULE to the encoded composition rule.  If the rule
3326    is invalid, goto invalid_code.  */
3327
3328 #define DECODE_COMPOSITION_RULE(rule)                                   \
3329   do {                                                                  \
3330     rule = c1 - 32;                                                     \
3331     if (rule < 0)                                                       \
3332       goto invalid_code;                                                \
3333     if (rule < 81)              /* old format (before ver.21) */        \
3334       {                                                                 \
3335         int gref = (rule) / 9;                                          \
3336         int nref = (rule) % 9;                                          \
3337         if (gref == 4) gref = 10;                                       \
3338         if (nref == 4) nref = 10;                                       \
3339         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3340       }                                                                 \
3341     else                        /* new format (after ver.21) */         \
3342       {                                                                 \
3343         int b;                                                          \
3344                                                                         \
3345         ONE_MORE_BYTE (b);                                              \
3346         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3347           goto invalid_code;                                            \
3348         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3349         rule += 0x100;   /* Distinguish it from the old format.  */     \
3350       }                                                                 \
3351   } while (0)
3352
3353 #define ENCODE_COMPOSITION_RULE(rule)                           \
3354   do {                                                          \
3355     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3356                                                                 \
3357     if (rule < 0x100)           /* old format */                \
3358       {                                                         \
3359         if (gref == 10) gref = 4;                               \
3360         if (nref == 10) nref = 4;                               \
3361         charbuf[idx] = 32 + gref * 9 + nref;                    \
3362         charbuf[idx + 1] = -1;                                  \
3363         new_chars++;                                            \
3364       }                                                         \
3365     else                                /* new format */        \
3366       {                                                         \
3367         charbuf[idx] = 32 + 81 + gref;                          \
3368         charbuf[idx + 1] = 32 + nref;                           \
3369         new_chars += 2;                                         \
3370       }                                                         \
3371   } while (0)
3372
3373 /* Finish the current composition as invalid.  */
3374
3375 static int
3376 finish_composition (int *charbuf, struct composition_status *cmp_status)
3377 {
3378   int idx = - cmp_status->length;
3379   int new_chars;
3380
3381   /* Recover the original ESC sequence */
3382   charbuf[idx++] = ISO_CODE_ESC;
3383   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3384                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3385                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3386                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3387                     : '4');
3388   charbuf[idx++] = -2;
3389   charbuf[idx++] = 0;
3390   charbuf[idx++] = -1;
3391   new_chars = cmp_status->nchars;
3392   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3393     for (; idx < 0; idx++)
3394       {
3395         int elt = charbuf[idx];
3396
3397         if (elt == -2)
3398           {
3399             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3400             idx++;
3401           }
3402         else if (elt == -1)
3403           {
3404             charbuf[idx++] = ISO_CODE_ESC;
3405             charbuf[idx] = '0';
3406             new_chars += 2;
3407           }
3408       }
3409   cmp_status->state = COMPOSING_NO;
3410   return new_chars;
3411 }
3412
3413 /* If characters are under composition, finish the composition.  */
3414 #define MAYBE_FINISH_COMPOSITION()                              \
3415   do {                                                          \
3416     if (cmp_status->state != COMPOSING_NO)                      \
3417       char_offset += finish_composition (charbuf, cmp_status);  \
3418   } while (0)
3419
3420 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3421
3422    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3423    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3424    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3425    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3426
3427    Produce this annotation sequence now:
3428
3429    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3430 */
3431
3432 #define DECODE_COMPOSITION_START(c1)                                       \
3433   do {                                                                     \
3434     if (c1 == '0'                                                          \
3435         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3436              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3437             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3438                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3439       {                                                                    \
3440         *charbuf++ = -1;                                                   \
3441         *charbuf++= -1;                                                    \
3442         cmp_status->state = COMPOSING_CHAR;                                \
3443         cmp_status->length += 2;                                           \
3444       }                                                                    \
3445     else                                                                   \
3446       {                                                                    \
3447         MAYBE_FINISH_COMPOSITION ();                                       \
3448         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3449                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3450                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3451                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3452         cmp_status->state                                                  \
3453           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3454         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3455         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3456         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3457         coding->annotated = 1;                                             \
3458       }                                                                    \
3459   } while (0)
3460
3461
3462 /* Handle composition end sequence ESC 1.  */
3463
3464 #define DECODE_COMPOSITION_END()                                        \
3465   do {                                                                  \
3466     if (cmp_status->nchars == 0                                         \
3467         || ((cmp_status->state == COMPOSING_CHAR)                       \
3468             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3469       {                                                                 \
3470         MAYBE_FINISH_COMPOSITION ();                                    \
3471         goto invalid_code;                                              \
3472       }                                                                 \
3473     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3474       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3475     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3476       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3477     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3478     char_offset += cmp_status->nchars;                                  \
3479     cmp_status->state = COMPOSING_NO;                                   \
3480   } while (0)
3481
3482 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3483
3484 #define STORE_COMPOSITION_RULE(rule)    \
3485   do {                                  \
3486     *charbuf++ = -2;                    \
3487     *charbuf++ = rule;                  \
3488     cmp_status->length += 2;            \
3489     cmp_status->state--;                \
3490   } while (0)
3491
3492 /* Store a composed char or a component char C in charbuf, and update
3493    cmp_status.  */
3494
3495 #define STORE_COMPOSITION_CHAR(c)                                       \
3496   do {                                                                  \
3497     *charbuf++ = (c);                                                   \
3498     cmp_status->length++;                                               \
3499     if (cmp_status->state == COMPOSING_CHAR)                            \
3500       cmp_status->nchars++;                                             \
3501     else                                                                \
3502       cmp_status->ncomps++;                                             \
3503     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3504         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3505             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3506       cmp_status->state++;                                              \
3507   } while (0)
3508
3509
3510 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3511
3512 static void
3513 decode_coding_iso_2022 (struct coding_system *coding)
3514 {
3515   const unsigned char *src = coding->source + coding->consumed;
3516   const unsigned char *src_end = coding->source + coding->src_bytes;
3517   const unsigned char *src_base;
3518   int *charbuf = coding->charbuf + coding->charbuf_used;
3519   /* We may produce two annotations (charset and composition) in one
3520      loop and one more charset annotation at the end.  */
3521   int *charbuf_end
3522     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3523   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3524   bool multibytep = coding->src_multibyte;
3525   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3526   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3527   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3528   int charset_id_2, charset_id_3;
3529   struct charset *charset;
3530   int c;
3531   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3532   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3533   ptrdiff_t char_offset = coding->produced_char;
3534   ptrdiff_t last_offset = char_offset;
3535   int last_id = charset_ascii;
3536   bool eol_dos
3537     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3538   int byte_after_cr = -1;
3539   int i;
3540
3541   setup_iso_safe_charsets (attrs);
3542   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3543
3544   if (cmp_status->state != COMPOSING_NO)
3545     {
3546       if (charbuf_end - charbuf < cmp_status->length)
3547         emacs_abort ();
3548       for (i = 0; i < cmp_status->length; i++)
3549         *charbuf++ = cmp_status->carryover[i];
3550       coding->annotated = 1;
3551     }
3552
3553   while (1)
3554     {
3555       int c1, c2, c3;
3556
3557       src_base = src;
3558       consumed_chars_base = consumed_chars;
3559
3560       if (charbuf >= charbuf_end)
3561         {
3562           if (byte_after_cr >= 0)
3563             src_base--;
3564           break;
3565         }
3566
3567       if (byte_after_cr >= 0)
3568         c1 = byte_after_cr, byte_after_cr = -1;
3569       else
3570         ONE_MORE_BYTE (c1);
3571       if (c1 < 0)
3572         goto invalid_code;
3573
3574       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3575         {
3576           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3577           char_offset++;
3578           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3579           continue;
3580         }
3581
3582       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3583         {
3584           if (c1 == ISO_CODE_ESC)
3585             {
3586               if (src + 1 >= src_end)
3587                 goto no_more_source;
3588               *charbuf++ = ISO_CODE_ESC;
3589               char_offset++;
3590               if (src[0] == '%' && src[1] == '@')
3591                 {
3592                   src += 2;
3593                   consumed_chars += 2;
3594                   char_offset += 2;
3595                   /* We are sure charbuf can contain two more chars. */
3596                   *charbuf++ = '%';
3597                   *charbuf++ = '@';
3598                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3599                 }
3600             }
3601           else
3602             {
3603               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3604               char_offset++;
3605             }
3606           continue;
3607         }
3608
3609       if ((cmp_status->state == COMPOSING_RULE
3610            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3611           && c1 != ISO_CODE_ESC)
3612         {
3613           int rule;
3614
3615           DECODE_COMPOSITION_RULE (rule);
3616           STORE_COMPOSITION_RULE (rule);
3617           continue;
3618         }
3619
3620       /* We produce at most one character.  */
3621       switch (iso_code_class [c1])
3622         {
3623         case ISO_0x20_or_0x7F:
3624           if (charset_id_0 < 0
3625               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3626             /* This is SPACE or DEL.  */
3627             charset = CHARSET_FROM_ID (charset_ascii);
3628           else
3629             charset = CHARSET_FROM_ID (charset_id_0);
3630           break;
3631
3632         case ISO_graphic_plane_0:
3633           if (charset_id_0 < 0)
3634             charset = CHARSET_FROM_ID (charset_ascii);
3635           else
3636             charset = CHARSET_FROM_ID (charset_id_0);
3637           break;
3638
3639         case ISO_0xA0_or_0xFF:
3640           if (charset_id_1 < 0
3641               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3642               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3643             goto invalid_code;
3644           /* This is a graphic character, we fall down ... */
3645
3646         case ISO_graphic_plane_1:
3647           if (charset_id_1 < 0)
3648             goto invalid_code;
3649           charset = CHARSET_FROM_ID (charset_id_1);
3650           break;
3651
3652         case ISO_control_0:
3653           if (eol_dos && c1 == '\r')
3654             ONE_MORE_BYTE (byte_after_cr);
3655           MAYBE_FINISH_COMPOSITION ();
3656           charset = CHARSET_FROM_ID (charset_ascii);
3657           break;
3658
3659         case ISO_control_1:
3660           goto invalid_code;
3661
3662         case ISO_shift_out:
3663           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3664               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3665             goto invalid_code;
3666           CODING_ISO_INVOCATION (coding, 0) = 1;
3667           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3668           continue;
3669
3670         case ISO_shift_in:
3671           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3672             goto invalid_code;
3673           CODING_ISO_INVOCATION (coding, 0) = 0;
3674           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3675           continue;
3676
3677         case ISO_single_shift_2_7:
3678           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3679             goto invalid_code;
3680         case ISO_single_shift_2:
3681           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3682             goto invalid_code;
3683           /* SS2 is handled as an escape sequence of ESC 'N' */
3684           c1 = 'N';
3685           goto label_escape_sequence;
3686
3687         case ISO_single_shift_3:
3688           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3689             goto invalid_code;
3690           /* SS2 is handled as an escape sequence of ESC 'O' */
3691           c1 = 'O';
3692           goto label_escape_sequence;
3693
3694         case ISO_control_sequence_introducer:
3695           /* CSI is handled as an escape sequence of ESC '[' ...  */
3696           c1 = '[';
3697           goto label_escape_sequence;
3698
3699         case ISO_escape:
3700           ONE_MORE_BYTE (c1);
3701         label_escape_sequence:
3702           /* Escape sequences handled here are invocation,
3703              designation, direction specification, and character
3704              composition specification.  */
3705           switch (c1)
3706             {
3707             case '&':           /* revision of following character set */
3708               ONE_MORE_BYTE (c1);
3709               if (!(c1 >= '@' && c1 <= '~'))
3710                 goto invalid_code;
3711               ONE_MORE_BYTE (c1);
3712               if (c1 != ISO_CODE_ESC)
3713                 goto invalid_code;
3714               ONE_MORE_BYTE (c1);
3715               goto label_escape_sequence;
3716
3717             case '$':           /* designation of 2-byte character set */
3718               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3719                 goto invalid_code;
3720               {
3721                 int reg, chars96;
3722
3723                 ONE_MORE_BYTE (c1);
3724                 if (c1 >= '@' && c1 <= 'B')
3725                   {     /* designation of JISX0208.1978, GB2312.1980,
3726                            or JISX0208.1980 */
3727                     reg = 0, chars96 = 0;
3728                   }
3729                 else if (c1 >= 0x28 && c1 <= 0x2B)
3730                   { /* designation of DIMENSION2_CHARS94 character set */
3731                     reg = c1 - 0x28, chars96 = 0;
3732                     ONE_MORE_BYTE (c1);
3733                   }
3734                 else if (c1 >= 0x2C && c1 <= 0x2F)
3735                   { /* designation of DIMENSION2_CHARS96 character set */
3736                     reg = c1 - 0x2C, chars96 = 1;
3737                     ONE_MORE_BYTE (c1);
3738                   }
3739                 else
3740                   goto invalid_code;
3741                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3742                 /* We must update these variables now.  */
3743                 if (reg == 0)
3744                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3745                 else if (reg == 1)
3746                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3747                 if (chars96 < 0)
3748                   goto invalid_code;
3749               }
3750               continue;
3751
3752             case 'n':           /* invocation of locking-shift-2 */
3753               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3754                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3755                 goto invalid_code;
3756               CODING_ISO_INVOCATION (coding, 0) = 2;
3757               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3758               continue;
3759
3760             case 'o':           /* invocation of locking-shift-3 */
3761               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3762                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3763                 goto invalid_code;
3764               CODING_ISO_INVOCATION (coding, 0) = 3;
3765               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3766               continue;
3767
3768             case 'N':           /* invocation of single-shift-2 */
3769               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3770                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3771                 goto invalid_code;
3772               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3773               if (charset_id_2 < 0)
3774                 charset = CHARSET_FROM_ID (charset_ascii);
3775               else
3776                 charset = CHARSET_FROM_ID (charset_id_2);
3777               ONE_MORE_BYTE (c1);
3778               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3779                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3780                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3781                           ? c1 >= 0x80 : c1 < 0x80)))
3782                 goto invalid_code;
3783               break;
3784
3785             case 'O':           /* invocation of single-shift-3 */
3786               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3787                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3788                 goto invalid_code;
3789               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3790               if (charset_id_3 < 0)
3791                 charset = CHARSET_FROM_ID (charset_ascii);
3792               else
3793                 charset = CHARSET_FROM_ID (charset_id_3);
3794               ONE_MORE_BYTE (c1);
3795               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3796                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3797                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3798                           ? c1 >= 0x80 : c1 < 0x80)))
3799                 goto invalid_code;
3800               break;
3801
3802             case '0': case '2': case '3': case '4': /* start composition */
3803               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3804                 goto invalid_code;
3805               if (last_id != charset_ascii)
3806                 {
3807                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3808                   last_id = charset_ascii;
3809                   last_offset = char_offset;
3810                 }
3811               DECODE_COMPOSITION_START (c1);
3812               continue;
3813
3814             case '1':           /* end composition */
3815               if (cmp_status->state == COMPOSING_NO)
3816                 goto invalid_code;
3817               DECODE_COMPOSITION_END ();
3818               continue;
3819
3820             case '[':           /* specification of direction */
3821               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3822                 goto invalid_code;
3823               /* For the moment, nested direction is not supported.
3824                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3825                  left-to-right, and nonzero means right-to-left.  */
3826               ONE_MORE_BYTE (c1);
3827               switch (c1)
3828                 {
3829                 case ']':       /* end of the current direction */
3830                   coding->mode &= ~CODING_MODE_DIRECTION;
3831
3832                 case '0':       /* end of the current direction */
3833                 case '1':       /* start of left-to-right direction */
3834                   ONE_MORE_BYTE (c1);
3835                   if (c1 == ']')
3836                     coding->mode &= ~CODING_MODE_DIRECTION;
3837                   else
3838                     goto invalid_code;
3839                   break;
3840
3841                 case '2':       /* start of right-to-left direction */
3842                   ONE_MORE_BYTE (c1);
3843                   if (c1 == ']')
3844                     coding->mode |= CODING_MODE_DIRECTION;
3845                   else
3846                     goto invalid_code;
3847                   break;
3848
3849                 default:
3850                   goto invalid_code;
3851                 }
3852               continue;
3853
3854             case '%':
3855               ONE_MORE_BYTE (c1);
3856               if (c1 == '/')
3857                 {
3858                   /* CTEXT extended segment:
3859                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3860                      We keep these bytes as is for the moment.
3861                      They may be decoded by post-read-conversion.  */
3862                   int dim, M, L;
3863                   int size;
3864
3865                   ONE_MORE_BYTE (dim);
3866                   if (dim < '0' || dim > '4')
3867                     goto invalid_code;
3868                   ONE_MORE_BYTE (M);
3869                   if (M < 128)
3870                     goto invalid_code;
3871                   ONE_MORE_BYTE (L);
3872                   if (L < 128)
3873                     goto invalid_code;
3874                   size = ((M - 128) * 128) + (L - 128);
3875                   if (charbuf + 6 > charbuf_end)
3876                     goto break_loop;
3877                   *charbuf++ = ISO_CODE_ESC;
3878                   *charbuf++ = '%';
3879                   *charbuf++ = '/';
3880                   *charbuf++ = dim;
3881                   *charbuf++ = BYTE8_TO_CHAR (M);
3882                   *charbuf++ = BYTE8_TO_CHAR (L);
3883                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3884                 }
3885               else if (c1 == 'G')
3886                 {
3887                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3888                      ESC % G --UTF-8-BYTES-- ESC % @
3889                      We keep these bytes as is for the moment.
3890                      They may be decoded by post-read-conversion.  */
3891                   if (charbuf + 3 > charbuf_end)
3892                     goto break_loop;
3893                   *charbuf++ = ISO_CODE_ESC;
3894                   *charbuf++ = '%';
3895                   *charbuf++ = 'G';
3896                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3897                 }
3898               else
3899                 goto invalid_code;
3900               continue;
3901               break;
3902
3903             default:
3904               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3905                 goto invalid_code;
3906               {
3907                 int reg, chars96;
3908
3909                 if (c1 >= 0x28 && c1 <= 0x2B)
3910                   { /* designation of DIMENSION1_CHARS94 character set */
3911                     reg = c1 - 0x28, chars96 = 0;
3912                     ONE_MORE_BYTE (c1);
3913                   }
3914                 else if (c1 >= 0x2C && c1 <= 0x2F)
3915                   { /* designation of DIMENSION1_CHARS96 character set */
3916                     reg = c1 - 0x2C, chars96 = 1;
3917                     ONE_MORE_BYTE (c1);
3918                   }
3919                 else
3920                   goto invalid_code;
3921                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3922                 /* We must update these variables now.  */
3923                 if (reg == 0)
3924                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3925                 else if (reg == 1)
3926                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3927                 if (chars96 < 0)
3928                   goto invalid_code;
3929               }
3930               continue;
3931             }
3932           break;
3933
3934         default:
3935           emacs_abort ();
3936         }
3937
3938       if (cmp_status->state == COMPOSING_NO
3939           && charset->id != charset_ascii
3940           && last_id != charset->id)
3941         {
3942           if (last_id != charset_ascii)
3943             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3944           last_id = charset->id;
3945           last_offset = char_offset;
3946         }
3947
3948       /* Now we know CHARSET and 1st position code C1 of a character.
3949          Produce a decoded character while getting 2nd and 3rd
3950          position codes C2, C3 if necessary.  */
3951       if (CHARSET_DIMENSION (charset) > 1)
3952         {
3953           ONE_MORE_BYTE (c2);
3954           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3955               || ((c1 & 0x80) != (c2 & 0x80)))
3956             /* C2 is not in a valid range.  */
3957             goto invalid_code;
3958           if (CHARSET_DIMENSION (charset) == 2)
3959             c1 = (c1 << 8) | c2;
3960           else
3961             {
3962               ONE_MORE_BYTE (c3);
3963               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3964                   || ((c1 & 0x80) != (c3 & 0x80)))
3965                 /* C3 is not in a valid range.  */
3966                 goto invalid_code;
3967               c1 = (c1 << 16) | (c2 << 8) | c2;
3968             }
3969         }
3970       c1 &= 0x7F7F7F;
3971       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3972       if (c < 0)
3973         {
3974           MAYBE_FINISH_COMPOSITION ();
3975           for (; src_base < src; src_base++, char_offset++)
3976             {
3977               if (ASCII_BYTE_P (*src_base))
3978                 *charbuf++ = *src_base;
3979               else
3980                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3981             }
3982         }
3983       else if (cmp_status->state == COMPOSING_NO)
3984         {
3985           *charbuf++ = c;
3986           char_offset++;
3987         }
3988       else if ((cmp_status->state == COMPOSING_CHAR
3989                 ? cmp_status->nchars
3990                 : cmp_status->ncomps)
3991                >= MAX_COMPOSITION_COMPONENTS)
3992         {
3993           /* Too long composition.  */
3994           MAYBE_FINISH_COMPOSITION ();
3995           *charbuf++ = c;
3996           char_offset++;
3997         }
3998       else
3999         STORE_COMPOSITION_CHAR (c);
4000       continue;
4001
4002     invalid_code:
4003       MAYBE_FINISH_COMPOSITION ();
4004       src = src_base;
4005       consumed_chars = consumed_chars_base;
4006       ONE_MORE_BYTE (c);
4007       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4008       char_offset++;
4009       coding->errors++;
4010       /* Reset the invocation and designation status to the safest
4011          one; i.e. designate ASCII to the graphic register 0, and
4012          invoke that register to the graphic plane 0.  This typically
4013          helps the case that an designation sequence for ASCII "ESC (
4014          B" is somehow broken (e.g. broken by a newline).  */
4015       CODING_ISO_INVOCATION (coding, 0) = 0;
4016       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4017       charset_id_0 = charset_ascii;
4018       continue;
4019
4020     break_loop:
4021       break;
4022     }
4023
4024  no_more_source:
4025   if (cmp_status->state != COMPOSING_NO)
4026     {
4027       if (coding->mode & CODING_MODE_LAST_BLOCK)
4028         MAYBE_FINISH_COMPOSITION ();
4029       else
4030         {
4031           charbuf -= cmp_status->length;
4032           for (i = 0; i < cmp_status->length; i++)
4033             cmp_status->carryover[i] = charbuf[i];
4034         }
4035     }
4036   else if (last_id != charset_ascii)
4037     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4038   coding->consumed_char += consumed_chars_base;
4039   coding->consumed = src_base - coding->source;
4040   coding->charbuf_used = charbuf - coding->charbuf;
4041 }
4042
4043
4044 /* ISO2022 encoding stuff.  */
4045
4046 /*
4047    It is not enough to say just "ISO2022" on encoding, we have to
4048    specify more details.  In Emacs, each coding system of ISO2022
4049    variant has the following specifications:
4050         1. Initial designation to G0 thru G3.
4051         2. Allows short-form designation?
4052         3. ASCII should be designated to G0 before control characters?
4053         4. ASCII should be designated to G0 at end of line?
4054         5. 7-bit environment or 8-bit environment?
4055         6. Use locking-shift?
4056         7. Use Single-shift?
4057    And the following two are only for Japanese:
4058         8. Use ASCII in place of JIS0201-1976-Roman?
4059         9. Use JISX0208-1983 in place of JISX0208-1978?
4060    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4061    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4062    details.
4063 */
4064
4065 /* Produce codes (escape sequence) for designating CHARSET to graphic
4066    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4067    '@', 'A', or 'B' and the coding system CODING allows, produce
4068    designation sequence of short-form.  */
4069
4070 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4071   do {                                                                  \
4072     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4073     const char *intermediate_char_94 = "()*+";                          \
4074     const char *intermediate_char_96 = ",-./";                          \
4075     int revision = -1;                                                  \
4076                                                                         \
4077     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4078       revision = CHARSET_ISO_REVISION (charset);                        \
4079                                                                         \
4080     if (revision >= 0)                                                  \
4081       {                                                                 \
4082         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4083         EMIT_ONE_BYTE ('@' + revision);                                 \
4084       }                                                                 \
4085     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4086     if (CHARSET_DIMENSION (charset) == 1)                               \
4087       {                                                                 \
4088         int b;                                                          \
4089         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4090           b = intermediate_char_94[reg];                                \
4091         else                                                            \
4092           b = intermediate_char_96[reg];                                \
4093         EMIT_ONE_ASCII_BYTE (b);                                        \
4094       }                                                                 \
4095     else                                                                \
4096       {                                                                 \
4097         EMIT_ONE_ASCII_BYTE ('$');                                      \
4098         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4099           {                                                             \
4100             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4101                 || reg != 0                                             \
4102                 || final_char < '@' || final_char > 'B')                \
4103               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4104           }                                                             \
4105         else                                                            \
4106           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4107       }                                                                 \
4108     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4109                                                                         \
4110     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4111   } while (0)
4112
4113
4114 /* The following two macros produce codes (control character or escape
4115    sequence) for ISO2022 single-shift functions (single-shift-2 and
4116    single-shift-3).  */
4117
4118 #define ENCODE_SINGLE_SHIFT_2                                           \
4119   do {                                                                  \
4120     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4121       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4122     else                                                                \
4123       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4124     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4125   } while (0)
4126
4127
4128 #define ENCODE_SINGLE_SHIFT_3                                           \
4129   do {                                                                  \
4130     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4131       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4132     else                                                                \
4133       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4134     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4135   } while (0)
4136
4137
4138 /* The following four macros produce codes (control character or
4139    escape sequence) for ISO2022 locking-shift functions (shift-in,
4140    shift-out, locking-shift-2, and locking-shift-3).  */
4141
4142 #define ENCODE_SHIFT_IN                                 \
4143   do {                                                  \
4144     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4145     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4146   } while (0)
4147
4148
4149 #define ENCODE_SHIFT_OUT                                \
4150   do {                                                  \
4151     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4152     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4153   } while (0)
4154
4155
4156 #define ENCODE_LOCKING_SHIFT_2                          \
4157   do {                                                  \
4158     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4159     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4160   } while (0)
4161
4162
4163 #define ENCODE_LOCKING_SHIFT_3                          \
4164   do {                                                  \
4165     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4166     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4167   } while (0)
4168
4169
4170 /* Produce codes for a DIMENSION1 character whose character set is
4171    CHARSET and whose position-code is C1.  Designation and invocation
4172    sequences are also produced in advance if necessary.  */
4173
4174 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4175   do {                                                                  \
4176     int id = CHARSET_ID (charset);                                      \
4177                                                                         \
4178     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4179         && id == charset_ascii)                                         \
4180       {                                                                 \
4181         id = charset_jisx0201_roman;                                    \
4182         charset = CHARSET_FROM_ID (id);                                 \
4183       }                                                                 \
4184                                                                         \
4185     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4186       {                                                                 \
4187         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4188           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4189         else                                                            \
4190           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4191         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4192         break;                                                          \
4193       }                                                                 \
4194     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4195       {                                                                 \
4196         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4197         break;                                                          \
4198       }                                                                 \
4199     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4200       {                                                                 \
4201         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4202         break;                                                          \
4203       }                                                                 \
4204     else                                                                \
4205       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4206          must invoke it, or, at first, designate it to some graphic     \
4207          register.  Then repeat the loop to actually produce the        \
4208          character.  */                                                 \
4209       dst = encode_invocation_designation (charset, coding, dst,        \
4210                                            &produced_chars);            \
4211   } while (1)
4212
4213
4214 /* Produce codes for a DIMENSION2 character whose character set is
4215    CHARSET and whose position-codes are C1 and C2.  Designation and
4216    invocation codes are also produced in advance if necessary.  */
4217
4218 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4219   do {                                                                  \
4220     int id = CHARSET_ID (charset);                                      \
4221                                                                         \
4222     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4223         && id == charset_jisx0208)                                      \
4224       {                                                                 \
4225         id = charset_jisx0208_1978;                                     \
4226         charset = CHARSET_FROM_ID (id);                                 \
4227       }                                                                 \
4228                                                                         \
4229     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4230       {                                                                 \
4231         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4232           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4233         else                                                            \
4234           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4235         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4236         break;                                                          \
4237       }                                                                 \
4238     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4239       {                                                                 \
4240         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4241         break;                                                          \
4242       }                                                                 \
4243     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4244       {                                                                 \
4245         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4246         break;                                                          \
4247       }                                                                 \
4248     else                                                                \
4249       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4250          must invoke it, or, at first, designate it to some graphic     \
4251          register.  Then repeat the loop to actually produce the        \
4252          character.  */                                                 \
4253       dst = encode_invocation_designation (charset, coding, dst,        \
4254                                            &produced_chars);            \
4255   } while (1)
4256
4257
4258 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4259   do {                                                                     \
4260     unsigned code;                                                         \
4261     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4262                                                                            \
4263     if (CHARSET_DIMENSION (charset) == 1)                                  \
4264       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4265     else                                                                   \
4266       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4267   } while (0)
4268
4269
4270 /* Produce designation and invocation codes at a place pointed by DST
4271    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4272    Return new DST.  */
4273
4274 static unsigned char *
4275 encode_invocation_designation (struct charset *charset,
4276                                struct coding_system *coding,
4277                                unsigned char *dst, ptrdiff_t *p_nchars)
4278 {
4279   bool multibytep = coding->dst_multibyte;
4280   ptrdiff_t produced_chars = *p_nchars;
4281   int reg;                      /* graphic register number */
4282   int id = CHARSET_ID (charset);
4283
4284   /* At first, check designations.  */
4285   for (reg = 0; reg < 4; reg++)
4286     if (id == CODING_ISO_DESIGNATION (coding, reg))
4287       break;
4288
4289   if (reg >= 4)
4290     {
4291       /* CHARSET is not yet designated to any graphic registers.  */
4292       /* At first check the requested designation.  */
4293       reg = CODING_ISO_REQUEST (coding, id);
4294       if (reg < 0)
4295         /* Since CHARSET requests no special designation, designate it
4296            to graphic register 0.  */
4297         reg = 0;
4298
4299       ENCODE_DESIGNATION (charset, reg, coding);
4300     }
4301
4302   if (CODING_ISO_INVOCATION (coding, 0) != reg
4303       && CODING_ISO_INVOCATION (coding, 1) != reg)
4304     {
4305       /* Since the graphic register REG is not invoked to any graphic
4306          planes, invoke it to graphic plane 0.  */
4307       switch (reg)
4308         {
4309         case 0:                 /* graphic register 0 */
4310           ENCODE_SHIFT_IN;
4311           break;
4312
4313         case 1:                 /* graphic register 1 */
4314           ENCODE_SHIFT_OUT;
4315           break;
4316
4317         case 2:                 /* graphic register 2 */
4318           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4319             ENCODE_SINGLE_SHIFT_2;
4320           else
4321             ENCODE_LOCKING_SHIFT_2;
4322           break;
4323
4324         case 3:                 /* graphic register 3 */
4325           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4326             ENCODE_SINGLE_SHIFT_3;
4327           else
4328             ENCODE_LOCKING_SHIFT_3;
4329           break;
4330         }
4331     }
4332
4333   *p_nchars = produced_chars;
4334   return dst;
4335 }
4336
4337
4338 /* Produce codes for designation and invocation to reset the graphic
4339    planes and registers to initial state.  */
4340 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4341   do {                                                                  \
4342     int reg;                                                            \
4343     struct charset *charset;                                            \
4344                                                                         \
4345     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4346       ENCODE_SHIFT_IN;                                                  \
4347     for (reg = 0; reg < 4; reg++)                                       \
4348       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4349           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4350               != CODING_ISO_INITIAL (coding, reg)))                     \
4351         {                                                               \
4352           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4353           ENCODE_DESIGNATION (charset, reg, coding);                    \
4354         }                                                               \
4355   } while (0)
4356
4357
4358 /* Produce designation sequences of charsets in the line started from
4359    CHARBUF to a place pointed by DST, and return the number of
4360    produced bytes.  DST should not directly point a buffer text area
4361    which may be relocated by char_charset call.
4362
4363    If the current block ends before any end-of-line, we may fail to
4364    find all the necessary designations.  */
4365
4366 static ptrdiff_t
4367 encode_designation_at_bol (struct coding_system *coding,
4368                            int *charbuf, int *charbuf_end,
4369                            unsigned char *dst)
4370 {
4371   unsigned char *orig = dst;
4372   struct charset *charset;
4373   /* Table of charsets to be designated to each graphic register.  */
4374   int r[4];
4375   int c, found = 0, reg;
4376   ptrdiff_t produced_chars = 0;
4377   bool multibytep = coding->dst_multibyte;
4378   Lisp_Object attrs;
4379   Lisp_Object charset_list;
4380
4381   attrs = CODING_ID_ATTRS (coding->id);
4382   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4383   if (EQ (charset_list, Qiso_2022))
4384     charset_list = Viso_2022_charset_list;
4385
4386   for (reg = 0; reg < 4; reg++)
4387     r[reg] = -1;
4388
4389   while (charbuf < charbuf_end && found < 4)
4390     {
4391       int id;
4392
4393       c = *charbuf++;
4394       if (c == '\n')
4395         break;
4396       charset = char_charset (c, charset_list, NULL);
4397       id = CHARSET_ID (charset);
4398       reg = CODING_ISO_REQUEST (coding, id);
4399       if (reg >= 0 && r[reg] < 0)
4400         {
4401           found++;
4402           r[reg] = id;
4403         }
4404     }
4405
4406   if (found)
4407     {
4408       for (reg = 0; reg < 4; reg++)
4409         if (r[reg] >= 0
4410             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4411           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4412     }
4413
4414   return dst - orig;
4415 }
4416
4417 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4418
4419 static bool
4420 encode_coding_iso_2022 (struct coding_system *coding)
4421 {
4422   bool multibytep = coding->dst_multibyte;
4423   int *charbuf = coding->charbuf;
4424   int *charbuf_end = charbuf + coding->charbuf_used;
4425   unsigned char *dst = coding->destination + coding->produced;
4426   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4427   int safe_room = 16;
4428   bool bol_designation
4429     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4430        && CODING_ISO_BOL (coding));
4431   ptrdiff_t produced_chars = 0;
4432   Lisp_Object attrs, eol_type, charset_list;
4433   bool ascii_compatible;
4434   int c;
4435   int preferred_charset_id = -1;
4436
4437   CODING_GET_INFO (coding, attrs, charset_list);
4438   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4439   if (VECTORP (eol_type))
4440     eol_type = Qunix;
4441
4442   setup_iso_safe_charsets (attrs);
4443   /* Charset list may have been changed.  */
4444   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4445   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4446
4447   ascii_compatible
4448     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4449        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4450                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4451
4452   while (charbuf < charbuf_end)
4453     {
4454       ASSURE_DESTINATION (safe_room);
4455
4456       if (bol_designation)
4457         {
4458           /* We have to produce designation sequences if any now.  */
4459           unsigned char desig_buf[16];
4460           ptrdiff_t nbytes;
4461           ptrdiff_t offset;
4462
4463           charset_map_loaded = 0;
4464           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4465                                               desig_buf);
4466           if (charset_map_loaded
4467               && (offset = coding_change_destination (coding)))
4468             {
4469               dst += offset;
4470               dst_end += offset;
4471             }
4472           memcpy (dst, desig_buf, nbytes);
4473           dst += nbytes;
4474           /* We are sure that designation sequences are all ASCII bytes.  */
4475           produced_chars += nbytes;
4476           bol_designation = 0;
4477           ASSURE_DESTINATION (safe_room);
4478         }
4479
4480       c = *charbuf++;
4481
4482       if (c < 0)
4483         {
4484           /* Handle an annotation.  */
4485           switch (*charbuf)
4486             {
4487             case CODING_ANNOTATE_COMPOSITION_MASK:
4488               /* Not yet implemented.  */
4489               break;
4490             case CODING_ANNOTATE_CHARSET_MASK:
4491               preferred_charset_id = charbuf[2];
4492               if (preferred_charset_id >= 0
4493                   && NILP (Fmemq (make_number (preferred_charset_id),
4494                                   charset_list)))
4495                 preferred_charset_id = -1;
4496               break;
4497             default:
4498               emacs_abort ();
4499             }
4500           charbuf += -c - 1;
4501           continue;
4502         }
4503
4504       /* Now encode the character C.  */
4505       if (c < 0x20 || c == 0x7F)
4506         {
4507           if (c == '\n'
4508               || (c == '\r' && EQ (eol_type, Qmac)))
4509             {
4510               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4511                 ENCODE_RESET_PLANE_AND_REGISTER ();
4512               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4513                 {
4514                   int i;
4515
4516                   for (i = 0; i < 4; i++)
4517                     CODING_ISO_DESIGNATION (coding, i)
4518                       = CODING_ISO_INITIAL (coding, i);
4519                 }
4520               bol_designation = ((CODING_ISO_FLAGS (coding)
4521                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4522                                  != 0);
4523             }
4524           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4525             ENCODE_RESET_PLANE_AND_REGISTER ();
4526           EMIT_ONE_ASCII_BYTE (c);
4527         }
4528       else if (ASCII_CHAR_P (c))
4529         {
4530           if (ascii_compatible)
4531             EMIT_ONE_ASCII_BYTE (c);
4532           else
4533             {
4534               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4535               ENCODE_ISO_CHARACTER (charset, c);
4536             }
4537         }
4538       else if (CHAR_BYTE8_P (c))
4539         {
4540           c = CHAR_TO_BYTE8 (c);
4541           EMIT_ONE_BYTE (c);
4542         }
4543       else
4544         {
4545           struct charset *charset;
4546
4547           if (preferred_charset_id >= 0)
4548             {
4549               bool result;
4550
4551               charset = CHARSET_FROM_ID (preferred_charset_id);
4552               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4553               if (! result)
4554                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4555                                      NULL, charset);
4556             }
4557           else
4558             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4559                                  NULL, charset);
4560           if (!charset)
4561             {
4562               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4563                 {
4564                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4565                   charset = CHARSET_FROM_ID (charset_ascii);
4566                 }
4567               else
4568                 {
4569                   c = coding->default_char;
4570                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4571                                        charset_list, NULL, charset);
4572                 }
4573             }
4574           ENCODE_ISO_CHARACTER (charset, c);
4575         }
4576     }
4577
4578   if (coding->mode & CODING_MODE_LAST_BLOCK
4579       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4580     {
4581       ASSURE_DESTINATION (safe_room);
4582       ENCODE_RESET_PLANE_AND_REGISTER ();
4583     }
4584   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4585   CODING_ISO_BOL (coding) = bol_designation;
4586   coding->produced_char += produced_chars;
4587   coding->produced = dst - coding->destination;
4588   return 0;
4589 }
4590
4591 \f
4592 /*** 8,9. SJIS and BIG5 handlers ***/
4593
4594 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4595    quite widely.  So, for the moment, Emacs supports them in the bare
4596    C code.  But, in the future, they may be supported only by CCL.  */
4597
4598 /* SJIS is a coding system encoding three character sets: ASCII, right
4599    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4600    as is.  A character of charset katakana-jisx0201 is encoded by
4601    "position-code + 0x80".  A character of charset japanese-jisx0208
4602    is encoded in 2-byte but two position-codes are divided and shifted
4603    so that it fit in the range below.
4604
4605    --- CODE RANGE of SJIS ---
4606    (character set)      (range)
4607    ASCII                0x00 .. 0x7F
4608    KATAKANA-JISX0201    0xA0 .. 0xDF
4609    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4610             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4611    -------------------------------
4612
4613 */
4614
4615 /* BIG5 is a coding system encoding two character sets: ASCII and
4616    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4617    character set and is encoded in two-byte.
4618
4619    --- CODE RANGE of BIG5 ---
4620    (character set)      (range)
4621    ASCII                0x00 .. 0x7F
4622    Big5 (1st byte)      0xA1 .. 0xFE
4623         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4624    --------------------------
4625
4626   */
4627
4628 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4629    Return true if a text is encoded in SJIS.  */
4630
4631 static bool
4632 detect_coding_sjis (struct coding_system *coding,
4633                     struct coding_detection_info *detect_info)
4634 {
4635   const unsigned char *src = coding->source, *src_base;
4636   const unsigned char *src_end = coding->source + coding->src_bytes;
4637   bool multibytep = coding->src_multibyte;
4638   ptrdiff_t consumed_chars = 0;
4639   int found = 0;
4640   int c;
4641   Lisp_Object attrs, charset_list;
4642   int max_first_byte_of_2_byte_code;
4643
4644   CODING_GET_INFO (coding, attrs, charset_list);
4645   max_first_byte_of_2_byte_code
4646     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4647
4648   detect_info->checked |= CATEGORY_MASK_SJIS;
4649   /* A coding system of this category is always ASCII compatible.  */
4650   src += coding->head_ascii;
4651
4652   while (1)
4653     {
4654       src_base = src;
4655       ONE_MORE_BYTE (c);
4656       if (c < 0x80)
4657         continue;
4658       if ((c >= 0x81 && c <= 0x9F)
4659           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4660         {
4661           ONE_MORE_BYTE (c);
4662           if (c < 0x40 || c == 0x7F || c > 0xFC)
4663             break;
4664           found = CATEGORY_MASK_SJIS;
4665         }
4666       else if (c >= 0xA0 && c < 0xE0)
4667         found = CATEGORY_MASK_SJIS;
4668       else
4669         break;
4670     }
4671   detect_info->rejected |= CATEGORY_MASK_SJIS;
4672   return 0;
4673
4674  no_more_source:
4675   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4676     {
4677       detect_info->rejected |= CATEGORY_MASK_SJIS;
4678       return 0;
4679     }
4680   detect_info->found |= found;
4681   return 1;
4682 }
4683
4684 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4685    Return true if a text is encoded in BIG5.  */
4686
4687 static bool
4688 detect_coding_big5 (struct coding_system *coding,
4689                     struct coding_detection_info *detect_info)
4690 {
4691   const unsigned char *src = coding->source, *src_base;
4692   const unsigned char *src_end = coding->source + coding->src_bytes;
4693   bool multibytep = coding->src_multibyte;
4694   ptrdiff_t consumed_chars = 0;
4695   int found = 0;
4696   int c;
4697
4698   detect_info->checked |= CATEGORY_MASK_BIG5;
4699   /* A coding system of this category is always ASCII compatible.  */
4700   src += coding->head_ascii;
4701
4702   while (1)
4703     {
4704       src_base = src;
4705       ONE_MORE_BYTE (c);
4706       if (c < 0x80)
4707         continue;
4708       if (c >= 0xA1)
4709         {
4710           ONE_MORE_BYTE (c);
4711           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4712             return 0;
4713           found = CATEGORY_MASK_BIG5;
4714         }
4715       else
4716         break;
4717     }
4718   detect_info->rejected |= CATEGORY_MASK_BIG5;
4719   return 0;
4720
4721  no_more_source:
4722   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4723     {
4724       detect_info->rejected |= CATEGORY_MASK_BIG5;
4725       return 0;
4726     }
4727   detect_info->found |= found;
4728   return 1;
4729 }
4730
4731 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4732
4733 static void
4734 decode_coding_sjis (struct coding_system *coding)
4735 {
4736   const unsigned char *src = coding->source + coding->consumed;
4737   const unsigned char *src_end = coding->source + coding->src_bytes;
4738   const unsigned char *src_base;
4739   int *charbuf = coding->charbuf + coding->charbuf_used;
4740   /* We may produce one charset annotation in one loop and one more at
4741      the end.  */
4742   int *charbuf_end
4743     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4744   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4745   bool multibytep = coding->src_multibyte;
4746   struct charset *charset_roman, *charset_kanji, *charset_kana;
4747   struct charset *charset_kanji2;
4748   Lisp_Object attrs, charset_list, val;
4749   ptrdiff_t char_offset = coding->produced_char;
4750   ptrdiff_t last_offset = char_offset;
4751   int last_id = charset_ascii;
4752   bool eol_dos
4753     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4754   int byte_after_cr = -1;
4755
4756   CODING_GET_INFO (coding, attrs, charset_list);
4757
4758   val = charset_list;
4759   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4760   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4761   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4762   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4763
4764   while (1)
4765     {
4766       int c, c1;
4767       struct charset *charset;
4768
4769       src_base = src;
4770       consumed_chars_base = consumed_chars;
4771
4772       if (charbuf >= charbuf_end)
4773         {
4774           if (byte_after_cr >= 0)
4775             src_base--;
4776           break;
4777         }
4778
4779       if (byte_after_cr >= 0)
4780         c = byte_after_cr, byte_after_cr = -1;
4781       else
4782         ONE_MORE_BYTE (c);
4783       if (c < 0)
4784         goto invalid_code;
4785       if (c < 0x80)
4786         {
4787           if (eol_dos && c == '\r')
4788             ONE_MORE_BYTE (byte_after_cr);
4789           charset = charset_roman;
4790         }
4791       else if (c == 0x80 || c == 0xA0)
4792         goto invalid_code;
4793       else if (c >= 0xA1 && c <= 0xDF)
4794         {
4795           /* SJIS -> JISX0201-Kana */
4796           c &= 0x7F;
4797           charset = charset_kana;
4798         }
4799       else if (c <= 0xEF)
4800         {
4801           /* SJIS -> JISX0208 */
4802           ONE_MORE_BYTE (c1);
4803           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4804             goto invalid_code;
4805           c = (c << 8) | c1;
4806           SJIS_TO_JIS (c);
4807           charset = charset_kanji;
4808         }
4809       else if (c <= 0xFC && charset_kanji2)
4810         {
4811           /* SJIS -> JISX0213-2 */
4812           ONE_MORE_BYTE (c1);
4813           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4814             goto invalid_code;
4815           c = (c << 8) | c1;
4816           SJIS_TO_JIS2 (c);
4817           charset = charset_kanji2;
4818         }
4819       else
4820         goto invalid_code;
4821       if (charset->id != charset_ascii
4822           && last_id != charset->id)
4823         {
4824           if (last_id != charset_ascii)
4825             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4826           last_id = charset->id;
4827           last_offset = char_offset;
4828         }
4829       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4830       *charbuf++ = c;
4831       char_offset++;
4832       continue;
4833
4834     invalid_code:
4835       src = src_base;
4836       consumed_chars = consumed_chars_base;
4837       ONE_MORE_BYTE (c);
4838       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4839       char_offset++;
4840       coding->errors++;
4841     }
4842
4843  no_more_source:
4844   if (last_id != charset_ascii)
4845     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4846   coding->consumed_char += consumed_chars_base;
4847   coding->consumed = src_base - coding->source;
4848   coding->charbuf_used = charbuf - coding->charbuf;
4849 }
4850
4851 static void
4852 decode_coding_big5 (struct coding_system *coding)
4853 {
4854   const unsigned char *src = coding->source + coding->consumed;
4855   const unsigned char *src_end = coding->source + coding->src_bytes;
4856   const unsigned char *src_base;
4857   int *charbuf = coding->charbuf + coding->charbuf_used;
4858   /* We may produce one charset annotation in one loop and one more at
4859      the end.  */
4860   int *charbuf_end
4861     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4862   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4863   bool multibytep = coding->src_multibyte;
4864   struct charset *charset_roman, *charset_big5;
4865   Lisp_Object attrs, charset_list, val;
4866   ptrdiff_t char_offset = coding->produced_char;
4867   ptrdiff_t last_offset = char_offset;
4868   int last_id = charset_ascii;
4869   bool eol_dos
4870     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4871   int byte_after_cr = -1;
4872
4873   CODING_GET_INFO (coding, attrs, charset_list);
4874   val = charset_list;
4875   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4876   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4877
4878   while (1)
4879     {
4880       int c, c1;
4881       struct charset *charset;
4882
4883       src_base = src;
4884       consumed_chars_base = consumed_chars;
4885
4886       if (charbuf >= charbuf_end)
4887         {
4888           if (byte_after_cr >= 0)
4889             src_base--;
4890           break;
4891         }
4892
4893       if (byte_after_cr >= 0)
4894         c = byte_after_cr, byte_after_cr = -1;
4895       else
4896         ONE_MORE_BYTE (c);
4897
4898       if (c < 0)
4899         goto invalid_code;
4900       if (c < 0x80)
4901         {
4902           if (eol_dos && c == '\r')
4903             ONE_MORE_BYTE (byte_after_cr);
4904           charset = charset_roman;
4905         }
4906       else
4907         {
4908           /* BIG5 -> Big5 */
4909           if (c < 0xA1 || c > 0xFE)
4910             goto invalid_code;
4911           ONE_MORE_BYTE (c1);
4912           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4913             goto invalid_code;
4914           c = c << 8 | c1;
4915           charset = charset_big5;
4916         }
4917       if (charset->id != charset_ascii
4918           && last_id != charset->id)
4919         {
4920           if (last_id != charset_ascii)
4921             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4922           last_id = charset->id;
4923           last_offset = char_offset;
4924         }
4925       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4926       *charbuf++ = c;
4927       char_offset++;
4928       continue;
4929
4930     invalid_code:
4931       src = src_base;
4932       consumed_chars = consumed_chars_base;
4933       ONE_MORE_BYTE (c);
4934       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4935       char_offset++;
4936       coding->errors++;
4937     }
4938
4939  no_more_source:
4940   if (last_id != charset_ascii)
4941     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4942   coding->consumed_char += consumed_chars_base;
4943   coding->consumed = src_base - coding->source;
4944   coding->charbuf_used = charbuf - coding->charbuf;
4945 }
4946
4947 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4948    This function can encode charsets `ascii', `katakana-jisx0201',
4949    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4950    are sure that all these charsets are registered as official charset
4951    (i.e. do not have extended leading-codes).  Characters of other
4952    charsets are produced without any encoding.  */
4953
4954 static bool
4955 encode_coding_sjis (struct coding_system *coding)
4956 {
4957   bool multibytep = coding->dst_multibyte;
4958   int *charbuf = coding->charbuf;
4959   int *charbuf_end = charbuf + coding->charbuf_used;
4960   unsigned char *dst = coding->destination + coding->produced;
4961   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4962   int safe_room = 4;
4963   ptrdiff_t produced_chars = 0;
4964   Lisp_Object attrs, charset_list, val;
4965   bool ascii_compatible;
4966   struct charset *charset_kanji, *charset_kana;
4967   struct charset *charset_kanji2;
4968   int c;
4969
4970   CODING_GET_INFO (coding, attrs, charset_list);
4971   val = XCDR (charset_list);
4972   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4973   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4974   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4975
4976   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4977
4978   while (charbuf < charbuf_end)
4979     {
4980       ASSURE_DESTINATION (safe_room);
4981       c = *charbuf++;
4982       /* Now encode the character C.  */
4983       if (ASCII_CHAR_P (c) && ascii_compatible)
4984         EMIT_ONE_ASCII_BYTE (c);
4985       else if (CHAR_BYTE8_P (c))
4986         {
4987           c = CHAR_TO_BYTE8 (c);
4988           EMIT_ONE_BYTE (c);
4989         }
4990       else
4991         {
4992           unsigned code;
4993           struct charset *charset;
4994           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4995                                &code, charset);
4996
4997           if (!charset)
4998             {
4999               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5000                 {
5001                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5002                   charset = CHARSET_FROM_ID (charset_ascii);
5003                 }
5004               else
5005                 {
5006                   c = coding->default_char;
5007                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5008                                        charset_list, &code, charset);
5009                 }
5010             }
5011           if (code == CHARSET_INVALID_CODE (charset))
5012             emacs_abort ();
5013           if (charset == charset_kanji)
5014             {
5015               int c1, c2;
5016               JIS_TO_SJIS (code);
5017               c1 = code >> 8, c2 = code & 0xFF;
5018               EMIT_TWO_BYTES (c1, c2);
5019             }
5020           else if (charset == charset_kana)
5021             EMIT_ONE_BYTE (code | 0x80);
5022           else if (charset_kanji2 && charset == charset_kanji2)
5023             {
5024               int c1, c2;
5025
5026               c1 = code >> 8;
5027               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5028                   || c1 == 0x28
5029                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5030                 {
5031                   JIS_TO_SJIS2 (code);
5032                   c1 = code >> 8, c2 = code & 0xFF;
5033                   EMIT_TWO_BYTES (c1, c2);
5034                 }
5035               else
5036                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5037             }
5038           else
5039             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5040         }
5041     }
5042   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5043   coding->produced_char += produced_chars;
5044   coding->produced = dst - coding->destination;
5045   return 0;
5046 }
5047
5048 static bool
5049 encode_coding_big5 (struct coding_system *coding)
5050 {
5051   bool multibytep = coding->dst_multibyte;
5052   int *charbuf = coding->charbuf;
5053   int *charbuf_end = charbuf + coding->charbuf_used;
5054   unsigned char *dst = coding->destination + coding->produced;
5055   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5056   int safe_room = 4;
5057   ptrdiff_t produced_chars = 0;
5058   Lisp_Object attrs, charset_list, val;
5059   bool ascii_compatible;
5060   struct charset *charset_big5;
5061   int c;
5062
5063   CODING_GET_INFO (coding, attrs, charset_list);
5064   val = XCDR (charset_list);
5065   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5066   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5067
5068   while (charbuf < charbuf_end)
5069     {
5070       ASSURE_DESTINATION (safe_room);
5071       c = *charbuf++;
5072       /* Now encode the character C.  */
5073       if (ASCII_CHAR_P (c) && ascii_compatible)
5074         EMIT_ONE_ASCII_BYTE (c);
5075       else if (CHAR_BYTE8_P (c))
5076         {
5077           c = CHAR_TO_BYTE8 (c);
5078           EMIT_ONE_BYTE (c);
5079         }
5080       else
5081         {
5082           unsigned code;
5083           struct charset *charset;
5084           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5085                                &code, charset);
5086
5087           if (! charset)
5088             {
5089               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5090                 {
5091                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5092                   charset = CHARSET_FROM_ID (charset_ascii);
5093                 }
5094               else
5095                 {
5096                   c = coding->default_char;
5097                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5098                                        charset_list, &code, charset);
5099                 }
5100             }
5101           if (code == CHARSET_INVALID_CODE (charset))
5102             emacs_abort ();
5103           if (charset == charset_big5)
5104             {
5105               int c1, c2;
5106
5107               c1 = code >> 8, c2 = code & 0xFF;
5108               EMIT_TWO_BYTES (c1, c2);
5109             }
5110           else
5111             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5112         }
5113     }
5114   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5115   coding->produced_char += produced_chars;
5116   coding->produced = dst - coding->destination;
5117   return 0;
5118 }
5119
5120 \f
5121 /*** 10. CCL handlers ***/
5122
5123 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5124    Return true if a text is encoded in a coding system of which
5125    encoder/decoder are written in CCL program.  */
5126
5127 static bool
5128 detect_coding_ccl (struct coding_system *coding,
5129                    struct coding_detection_info *detect_info)
5130 {
5131   const unsigned char *src = coding->source, *src_base;
5132   const unsigned char *src_end = coding->source + coding->src_bytes;
5133   bool multibytep = coding->src_multibyte;
5134   ptrdiff_t consumed_chars = 0;
5135   int found = 0;
5136   unsigned char *valids;
5137   ptrdiff_t head_ascii = coding->head_ascii;
5138   Lisp_Object attrs;
5139
5140   detect_info->checked |= CATEGORY_MASK_CCL;
5141
5142   coding = &coding_categories[coding_category_ccl];
5143   valids = CODING_CCL_VALIDS (coding);
5144   attrs = CODING_ID_ATTRS (coding->id);
5145   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5146     src += head_ascii;
5147
5148   while (1)
5149     {
5150       int c;
5151
5152       src_base = src;
5153       ONE_MORE_BYTE (c);
5154       if (c < 0 || ! valids[c])
5155         break;
5156       if ((valids[c] > 1))
5157         found = CATEGORY_MASK_CCL;
5158     }
5159   detect_info->rejected |= CATEGORY_MASK_CCL;
5160   return 0;
5161
5162  no_more_source:
5163   detect_info->found |= found;
5164   return 1;
5165 }
5166
5167 static void
5168 decode_coding_ccl (struct coding_system *coding)
5169 {
5170   const unsigned char *src = coding->source + coding->consumed;
5171   const unsigned char *src_end = coding->source + coding->src_bytes;
5172   int *charbuf = coding->charbuf + coding->charbuf_used;
5173   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5174   ptrdiff_t consumed_chars = 0;
5175   bool multibytep = coding->src_multibyte;
5176   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5177   int source_charbuf[1024];
5178   int source_byteidx[1025];
5179   Lisp_Object attrs, charset_list;
5180
5181   CODING_GET_INFO (coding, attrs, charset_list);
5182
5183   while (1)
5184     {
5185       const unsigned char *p = src;
5186       ptrdiff_t offset;
5187       int i = 0;
5188
5189       if (multibytep)
5190         {
5191           while (i < 1024 && p < src_end)
5192             {
5193               source_byteidx[i] = p - src;
5194               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5195             }
5196           source_byteidx[i] = p - src;
5197         }
5198       else
5199         while (i < 1024 && p < src_end)
5200           source_charbuf[i++] = *p++;
5201
5202       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5203         ccl->last_block = true;
5204       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5205       charset_map_loaded = 0;
5206       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5207                   charset_list);
5208       if (charset_map_loaded
5209           && (offset = coding_change_source (coding)))
5210         {
5211           p += offset;
5212           src += offset;
5213           src_end += offset;
5214         }
5215       charbuf += ccl->produced;
5216       if (multibytep)
5217         src += source_byteidx[ccl->consumed];
5218       else
5219         src += ccl->consumed;
5220       consumed_chars += ccl->consumed;
5221       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5222         break;
5223     }
5224
5225   switch (ccl->status)
5226     {
5227     case CCL_STAT_SUSPEND_BY_SRC:
5228       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5229       break;
5230     case CCL_STAT_SUSPEND_BY_DST:
5231       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5232       break;
5233     case CCL_STAT_QUIT:
5234     case CCL_STAT_INVALID_CMD:
5235       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5236       break;
5237     default:
5238       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5239       break;
5240     }
5241   coding->consumed_char += consumed_chars;
5242   coding->consumed = src - coding->source;
5243   coding->charbuf_used = charbuf - coding->charbuf;
5244 }
5245
5246 static bool
5247 encode_coding_ccl (struct coding_system *coding)
5248 {
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   bool multibytep = coding->dst_multibyte;
5251   int *charbuf = coding->charbuf;
5252   int *charbuf_end = charbuf + coding->charbuf_used;
5253   unsigned char *dst = coding->destination + coding->produced;
5254   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5255   int destination_charbuf[1024];
5256   ptrdiff_t produced_chars = 0;
5257   int i;
5258   Lisp_Object attrs, charset_list;
5259
5260   CODING_GET_INFO (coding, attrs, charset_list);
5261   if (coding->consumed_char == coding->src_chars
5262       && coding->mode & CODING_MODE_LAST_BLOCK)
5263     ccl->last_block = true;
5264
5265   do
5266     {
5267       ptrdiff_t offset;
5268
5269       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5270       charset_map_loaded = 0;
5271       ccl_driver (ccl, charbuf, destination_charbuf,
5272                   charbuf_end - charbuf, 1024, charset_list);
5273       if (charset_map_loaded
5274           && (offset = coding_change_destination (coding)))
5275         dst += offset;
5276       if (multibytep)
5277         {
5278           ASSURE_DESTINATION (ccl->produced * 2);
5279           for (i = 0; i < ccl->produced; i++)
5280             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5281         }
5282       else
5283         {
5284           ASSURE_DESTINATION (ccl->produced);
5285           for (i = 0; i < ccl->produced; i++)
5286             *dst++ = destination_charbuf[i] & 0xFF;
5287           produced_chars += ccl->produced;
5288         }
5289       charbuf += ccl->consumed;
5290       if (ccl->status == CCL_STAT_QUIT
5291           || ccl->status == CCL_STAT_INVALID_CMD)
5292         break;
5293     }
5294   while (charbuf < charbuf_end);
5295
5296   switch (ccl->status)
5297     {
5298     case CCL_STAT_SUSPEND_BY_SRC:
5299       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5300       break;
5301     case CCL_STAT_SUSPEND_BY_DST:
5302       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5303       break;
5304     case CCL_STAT_QUIT:
5305     case CCL_STAT_INVALID_CMD:
5306       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5307       break;
5308     default:
5309       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5310       break;
5311     }
5312
5313   coding->produced_char += produced_chars;
5314   coding->produced = dst - coding->destination;
5315   return 0;
5316 }
5317
5318 \f
5319 /*** 10, 11. no-conversion handlers ***/
5320
5321 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5322
5323 static void
5324 decode_coding_raw_text (struct coding_system *coding)
5325 {
5326   bool eol_dos
5327     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5328
5329   coding->chars_at_source = 1;
5330   coding->consumed_char = coding->src_chars;
5331   coding->consumed = coding->src_bytes;
5332   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5333     {
5334       coding->consumed_char--;
5335       coding->consumed--;
5336       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5337     }
5338   else
5339     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5340 }
5341
5342 static bool
5343 encode_coding_raw_text (struct coding_system *coding)
5344 {
5345   bool multibytep = coding->dst_multibyte;
5346   int *charbuf = coding->charbuf;
5347   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5348   unsigned char *dst = coding->destination + coding->produced;
5349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5350   ptrdiff_t produced_chars = 0;
5351   int c;
5352
5353   if (multibytep)
5354     {
5355       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5356
5357       if (coding->src_multibyte)
5358         while (charbuf < charbuf_end)
5359           {
5360             ASSURE_DESTINATION (safe_room);
5361             c = *charbuf++;
5362             if (ASCII_CHAR_P (c))
5363               EMIT_ONE_ASCII_BYTE (c);
5364             else if (CHAR_BYTE8_P (c))
5365               {
5366                 c = CHAR_TO_BYTE8 (c);
5367                 EMIT_ONE_BYTE (c);
5368               }
5369             else
5370               {
5371                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5372
5373                 CHAR_STRING_ADVANCE (c, p1);
5374                 do
5375                   {
5376                     EMIT_ONE_BYTE (*p0);
5377                     p0++;
5378                   }
5379                 while (p0 < p1);
5380               }
5381           }
5382       else
5383         while (charbuf < charbuf_end)
5384           {
5385             ASSURE_DESTINATION (safe_room);
5386             c = *charbuf++;
5387             EMIT_ONE_BYTE (c);
5388           }
5389     }
5390   else
5391     {
5392       if (coding->src_multibyte)
5393         {
5394           int safe_room = MAX_MULTIBYTE_LENGTH;
5395
5396           while (charbuf < charbuf_end)
5397             {
5398               ASSURE_DESTINATION (safe_room);
5399               c = *charbuf++;
5400               if (ASCII_CHAR_P (c))
5401                 *dst++ = c;
5402               else if (CHAR_BYTE8_P (c))
5403                 *dst++ = CHAR_TO_BYTE8 (c);
5404               else
5405                 CHAR_STRING_ADVANCE (c, dst);
5406             }
5407         }
5408       else
5409         {
5410           ASSURE_DESTINATION (charbuf_end - charbuf);
5411           while (charbuf < charbuf_end && dst < dst_end)
5412             *dst++ = *charbuf++;
5413         }
5414       produced_chars = dst - (coding->destination + coding->produced);
5415     }
5416   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5417   coding->produced_char += produced_chars;
5418   coding->produced = dst - coding->destination;
5419   return 0;
5420 }
5421
5422 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5423    Return true if a text is encoded in a charset-based coding system.  */
5424
5425 static bool
5426 detect_coding_charset (struct coding_system *coding,
5427                        struct coding_detection_info *detect_info)
5428 {
5429   const unsigned char *src = coding->source, *src_base;
5430   const unsigned char *src_end = coding->source + coding->src_bytes;
5431   bool multibytep = coding->src_multibyte;
5432   ptrdiff_t consumed_chars = 0;
5433   Lisp_Object attrs, valids, name;
5434   int found = 0;
5435   ptrdiff_t head_ascii = coding->head_ascii;
5436   bool check_latin_extra = 0;
5437
5438   detect_info->checked |= CATEGORY_MASK_CHARSET;
5439
5440   coding = &coding_categories[coding_category_charset];
5441   attrs = CODING_ID_ATTRS (coding->id);
5442   valids = AREF (attrs, coding_attr_charset_valids);
5443   name = CODING_ID_NAME (coding->id);
5444   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5445                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5446       || strncmp (SSDATA (SYMBOL_NAME (name)),
5447                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5448     check_latin_extra = 1;
5449
5450   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5451     src += head_ascii;
5452
5453   while (1)
5454     {
5455       int c;
5456       Lisp_Object val;
5457       struct charset *charset;
5458       int dim, idx;
5459
5460       src_base = src;
5461       ONE_MORE_BYTE (c);
5462       if (c < 0)
5463         continue;
5464       val = AREF (valids, c);
5465       if (NILP (val))
5466         break;
5467       if (c >= 0x80)
5468         {
5469           if (c < 0xA0
5470               && check_latin_extra
5471               && (!VECTORP (Vlatin_extra_code_table)
5472                   || NILP (AREF (Vlatin_extra_code_table, c))))
5473             break;
5474           found = CATEGORY_MASK_CHARSET;
5475         }
5476       if (INTEGERP (val))
5477         {
5478           charset = CHARSET_FROM_ID (XFASTINT (val));
5479           dim = CHARSET_DIMENSION (charset);
5480           for (idx = 1; idx < dim; idx++)
5481             {
5482               if (src == src_end)
5483                 goto too_short;
5484               ONE_MORE_BYTE (c);
5485               if (c < charset->code_space[(dim - 1 - idx) * 4]
5486                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5487                 break;
5488             }
5489           if (idx < dim)
5490             break;
5491         }
5492       else
5493         {
5494           idx = 1;
5495           for (; CONSP (val); val = XCDR (val))
5496             {
5497               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5498               dim = CHARSET_DIMENSION (charset);
5499               while (idx < dim)
5500                 {
5501                   if (src == src_end)
5502                     goto too_short;
5503                   ONE_MORE_BYTE (c);
5504                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5505                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5506                     break;
5507                   idx++;
5508                 }
5509               if (idx == dim)
5510                 {
5511                   val = Qnil;
5512                   break;
5513                 }
5514             }
5515           if (CONSP (val))
5516             break;
5517         }
5518     }
5519  too_short:
5520   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5521   return 0;
5522
5523  no_more_source:
5524   detect_info->found |= found;
5525   return 1;
5526 }
5527
5528 static void
5529 decode_coding_charset (struct coding_system *coding)
5530 {
5531   const unsigned char *src = coding->source + coding->consumed;
5532   const unsigned char *src_end = coding->source + coding->src_bytes;
5533   const unsigned char *src_base;
5534   int *charbuf = coding->charbuf + coding->charbuf_used;
5535   /* We may produce one charset annotation in one loop and one more at
5536      the end.  */
5537   int *charbuf_end
5538     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5539   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5540   bool multibytep = coding->src_multibyte;
5541   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5542   Lisp_Object valids;
5543   ptrdiff_t char_offset = coding->produced_char;
5544   ptrdiff_t last_offset = char_offset;
5545   int last_id = charset_ascii;
5546   bool eol_dos
5547     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5548   int byte_after_cr = -1;
5549
5550   valids = AREF (attrs, coding_attr_charset_valids);
5551
5552   while (1)
5553     {
5554       int c;
5555       Lisp_Object val;
5556       struct charset *charset;
5557       int dim;
5558       int len = 1;
5559       unsigned code;
5560
5561       src_base = src;
5562       consumed_chars_base = consumed_chars;
5563
5564       if (charbuf >= charbuf_end)
5565         {
5566           if (byte_after_cr >= 0)
5567             src_base--;
5568           break;
5569         }
5570
5571       if (byte_after_cr >= 0)
5572         {
5573           c = byte_after_cr;
5574           byte_after_cr = -1;
5575         }
5576       else
5577         {
5578           ONE_MORE_BYTE (c);
5579           if (eol_dos && c == '\r')
5580             ONE_MORE_BYTE (byte_after_cr);
5581         }
5582       if (c < 0)
5583         goto invalid_code;
5584       code = c;
5585
5586       val = AREF (valids, c);
5587       if (! INTEGERP (val) && ! CONSP (val))
5588         goto invalid_code;
5589       if (INTEGERP (val))
5590         {
5591           charset = CHARSET_FROM_ID (XFASTINT (val));
5592           dim = CHARSET_DIMENSION (charset);
5593           while (len < dim)
5594             {
5595               ONE_MORE_BYTE (c);
5596               code = (code << 8) | c;
5597               len++;
5598             }
5599           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5600                               charset, code, c);
5601         }
5602       else
5603         {
5604           /* VAL is a list of charset IDs.  It is assured that the
5605              list is sorted by charset dimensions (smaller one
5606              comes first).  */
5607           while (CONSP (val))
5608             {
5609               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5610               dim = CHARSET_DIMENSION (charset);
5611               while (len < dim)
5612                 {
5613                   ONE_MORE_BYTE (c);
5614                   code = (code << 8) | c;
5615                   len++;
5616                 }
5617               CODING_DECODE_CHAR (coding, src, src_base,
5618                                   src_end, charset, code, c);
5619               if (c >= 0)
5620                 break;
5621               val = XCDR (val);
5622             }
5623         }
5624       if (c < 0)
5625         goto invalid_code;
5626       if (charset->id != charset_ascii
5627           && last_id != charset->id)
5628         {
5629           if (last_id != charset_ascii)
5630             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5631           last_id = charset->id;
5632           last_offset = char_offset;
5633         }
5634
5635       *charbuf++ = c;
5636       char_offset++;
5637       continue;
5638
5639     invalid_code:
5640       src = src_base;
5641       consumed_chars = consumed_chars_base;
5642       ONE_MORE_BYTE (c);
5643       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5644       char_offset++;
5645       coding->errors++;
5646     }
5647
5648  no_more_source:
5649   if (last_id != charset_ascii)
5650     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5651   coding->consumed_char += consumed_chars_base;
5652   coding->consumed = src_base - coding->source;
5653   coding->charbuf_used = charbuf - coding->charbuf;
5654 }
5655
5656 static bool
5657 encode_coding_charset (struct coding_system *coding)
5658 {
5659   bool multibytep = coding->dst_multibyte;
5660   int *charbuf = coding->charbuf;
5661   int *charbuf_end = charbuf + coding->charbuf_used;
5662   unsigned char *dst = coding->destination + coding->produced;
5663   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5664   int safe_room = MAX_MULTIBYTE_LENGTH;
5665   ptrdiff_t produced_chars = 0;
5666   Lisp_Object attrs, charset_list;
5667   bool ascii_compatible;
5668   int c;
5669
5670   CODING_GET_INFO (coding, attrs, charset_list);
5671   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5672
5673   while (charbuf < charbuf_end)
5674     {
5675       struct charset *charset;
5676       unsigned code;
5677
5678       ASSURE_DESTINATION (safe_room);
5679       c = *charbuf++;
5680       if (ascii_compatible && ASCII_CHAR_P (c))
5681         EMIT_ONE_ASCII_BYTE (c);
5682       else if (CHAR_BYTE8_P (c))
5683         {
5684           c = CHAR_TO_BYTE8 (c);
5685           EMIT_ONE_BYTE (c);
5686         }
5687       else
5688         {
5689           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5690                                &code, charset);
5691
5692           if (charset)
5693             {
5694               if (CHARSET_DIMENSION (charset) == 1)
5695                 EMIT_ONE_BYTE (code);
5696               else if (CHARSET_DIMENSION (charset) == 2)
5697                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5698               else if (CHARSET_DIMENSION (charset) == 3)
5699                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5700               else
5701                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5702                                  (code >> 8) & 0xFF, code & 0xFF);
5703             }
5704           else
5705             {
5706               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5707                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5708               else
5709                 c = coding->default_char;
5710               EMIT_ONE_BYTE (c);
5711             }
5712         }
5713     }
5714
5715   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5716   coding->produced_char += produced_chars;
5717   coding->produced = dst - coding->destination;
5718   return 0;
5719 }
5720
5721 \f
5722 /*** 7. C library functions ***/
5723
5724 /* Setup coding context CODING from information about CODING_SYSTEM.
5725    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5726    CODING_SYSTEM is invalid, signal an error.  */
5727
5728 void
5729 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5730 {
5731   Lisp_Object attrs;
5732   Lisp_Object eol_type;
5733   Lisp_Object coding_type;
5734   Lisp_Object val;
5735
5736   if (NILP (coding_system))
5737     coding_system = Qundecided;
5738
5739   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5740
5741   attrs = CODING_ID_ATTRS (coding->id);
5742   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5743
5744   coding->mode = 0;
5745   if (VECTORP (eol_type))
5746     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5747                             | CODING_REQUIRE_DETECTION_MASK);
5748   else if (! EQ (eol_type, Qunix))
5749     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5750                             | CODING_REQUIRE_ENCODING_MASK);
5751   else
5752     coding->common_flags = 0;
5753   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5754     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5755   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5756     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5757   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5758     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5759
5760   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5761   coding->max_charset_id = SCHARS (val) - 1;
5762   coding->safe_charsets = SDATA (val);
5763   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5764   coding->carryover_bytes = 0;
5765   coding->raw_destination = 0;
5766
5767   coding_type = CODING_ATTR_TYPE (attrs);
5768   if (EQ (coding_type, Qundecided))
5769     {
5770       coding->detector = NULL;
5771       coding->decoder = decode_coding_raw_text;
5772       coding->encoder = encode_coding_raw_text;
5773       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5774       coding->spec.undecided.inhibit_nbd
5775         = (encode_inhibit_flag
5776            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5777       coding->spec.undecided.inhibit_ied
5778         = (encode_inhibit_flag
5779            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5780       coding->spec.undecided.prefer_utf_8
5781         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5782     }
5783   else if (EQ (coding_type, Qiso_2022))
5784     {
5785       int i;
5786       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5787
5788       /* Invoke graphic register 0 to plane 0.  */
5789       CODING_ISO_INVOCATION (coding, 0) = 0;
5790       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5791       CODING_ISO_INVOCATION (coding, 1)
5792         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5793       /* Setup the initial status of designation.  */
5794       for (i = 0; i < 4; i++)
5795         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5796       /* Not single shifting initially.  */
5797       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5798       /* Beginning of buffer should also be regarded as bol. */
5799       CODING_ISO_BOL (coding) = 1;
5800       coding->detector = detect_coding_iso_2022;
5801       coding->decoder = decode_coding_iso_2022;
5802       coding->encoder = encode_coding_iso_2022;
5803       if (flags & CODING_ISO_FLAG_SAFE)
5804         coding->mode |= CODING_MODE_SAFE_ENCODING;
5805       coding->common_flags
5806         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5807             | CODING_REQUIRE_FLUSHING_MASK);
5808       if (flags & CODING_ISO_FLAG_COMPOSITION)
5809         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5810       if (flags & CODING_ISO_FLAG_DESIGNATION)
5811         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5812       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5813         {
5814           setup_iso_safe_charsets (attrs);
5815           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5816           coding->max_charset_id = SCHARS (val) - 1;
5817           coding->safe_charsets = SDATA (val);
5818         }
5819       CODING_ISO_FLAGS (coding) = flags;
5820       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5821       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5822       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5823       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5824     }
5825   else if (EQ (coding_type, Qcharset))
5826     {
5827       coding->detector = detect_coding_charset;
5828       coding->decoder = decode_coding_charset;
5829       coding->encoder = encode_coding_charset;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832     }
5833   else if (EQ (coding_type, Qutf_8))
5834     {
5835       val = AREF (attrs, coding_attr_utf_bom);
5836       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5837                                    : EQ (val, Qt) ? utf_with_bom
5838                                    : utf_without_bom);
5839       coding->detector = detect_coding_utf_8;
5840       coding->decoder = decode_coding_utf_8;
5841       coding->encoder = encode_coding_utf_8;
5842       coding->common_flags
5843         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5844       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5845         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5846     }
5847   else if (EQ (coding_type, Qutf_16))
5848     {
5849       val = AREF (attrs, coding_attr_utf_bom);
5850       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5851                                     : EQ (val, Qt) ? utf_with_bom
5852                                     : utf_without_bom);
5853       val = AREF (attrs, coding_attr_utf_16_endian);
5854       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5855                                        : utf_16_little_endian);
5856       CODING_UTF_16_SURROGATE (coding) = 0;
5857       coding->detector = detect_coding_utf_16;
5858       coding->decoder = decode_coding_utf_16;
5859       coding->encoder = encode_coding_utf_16;
5860       coding->common_flags
5861         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5862       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5863         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5864     }
5865   else if (EQ (coding_type, Qccl))
5866     {
5867       coding->detector = detect_coding_ccl;
5868       coding->decoder = decode_coding_ccl;
5869       coding->encoder = encode_coding_ccl;
5870       coding->common_flags
5871         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5872             | CODING_REQUIRE_FLUSHING_MASK);
5873     }
5874   else if (EQ (coding_type, Qemacs_mule))
5875     {
5876       coding->detector = detect_coding_emacs_mule;
5877       coding->decoder = decode_coding_emacs_mule;
5878       coding->encoder = encode_coding_emacs_mule;
5879       coding->common_flags
5880         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5881       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5882           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5883         {
5884           Lisp_Object tail, safe_charsets;
5885           int max_charset_id = 0;
5886
5887           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5888                tail = XCDR (tail))
5889             if (max_charset_id < XFASTINT (XCAR (tail)))
5890               max_charset_id = XFASTINT (XCAR (tail));
5891           safe_charsets = make_uninit_string (max_charset_id + 1);
5892           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5893           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5894                tail = XCDR (tail))
5895             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5896           coding->max_charset_id = max_charset_id;
5897           coding->safe_charsets = SDATA (safe_charsets);
5898         }
5899       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5900       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5901     }
5902   else if (EQ (coding_type, Qshift_jis))
5903     {
5904       coding->detector = detect_coding_sjis;
5905       coding->decoder = decode_coding_sjis;
5906       coding->encoder = encode_coding_sjis;
5907       coding->common_flags
5908         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5909     }
5910   else if (EQ (coding_type, Qbig5))
5911     {
5912       coding->detector = detect_coding_big5;
5913       coding->decoder = decode_coding_big5;
5914       coding->encoder = encode_coding_big5;
5915       coding->common_flags
5916         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5917     }
5918   else                          /* EQ (coding_type, Qraw_text) */
5919     {
5920       coding->detector = NULL;
5921       coding->decoder = decode_coding_raw_text;
5922       coding->encoder = encode_coding_raw_text;
5923       if (! EQ (eol_type, Qunix))
5924         {
5925           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5926           if (! VECTORP (eol_type))
5927             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5928         }
5929
5930     }
5931
5932   return;
5933 }
5934
5935 /* Return a list of charsets supported by CODING.  */
5936
5937 Lisp_Object
5938 coding_charset_list (struct coding_system *coding)
5939 {
5940   Lisp_Object attrs, charset_list;
5941
5942   CODING_GET_INFO (coding, attrs, charset_list);
5943   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5944     {
5945       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5946
5947       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5948         charset_list = Viso_2022_charset_list;
5949     }
5950   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5951     {
5952       charset_list = Vemacs_mule_charset_list;
5953     }
5954   return charset_list;
5955 }
5956
5957
5958 /* Return a list of charsets supported by CODING-SYSTEM.  */
5959
5960 Lisp_Object
5961 coding_system_charset_list (Lisp_Object coding_system)
5962 {
5963   ptrdiff_t id;
5964   Lisp_Object attrs, charset_list;
5965
5966   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5967   attrs = CODING_ID_ATTRS (id);
5968
5969   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5970     {
5971       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5972
5973       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5974         charset_list = Viso_2022_charset_list;
5975       else
5976         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5977     }
5978   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5979     {
5980       charset_list = Vemacs_mule_charset_list;
5981     }
5982   else
5983     {
5984       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5985     }
5986   return charset_list;
5987 }
5988
5989
5990 /* Return raw-text or one of its subsidiaries that has the same
5991    eol_type as CODING-SYSTEM.  */
5992
5993 Lisp_Object
5994 raw_text_coding_system (Lisp_Object coding_system)
5995 {
5996   Lisp_Object spec, attrs;
5997   Lisp_Object eol_type, raw_text_eol_type;
5998
5999   if (NILP (coding_system))
6000     return Qraw_text;
6001   spec = CODING_SYSTEM_SPEC (coding_system);
6002   attrs = AREF (spec, 0);
6003
6004   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6005     return coding_system;
6006
6007   eol_type = AREF (spec, 2);
6008   if (VECTORP (eol_type))
6009     return Qraw_text;
6010   spec = CODING_SYSTEM_SPEC (Qraw_text);
6011   raw_text_eol_type = AREF (spec, 2);
6012   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6013           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6014           : AREF (raw_text_eol_type, 2));
6015 }
6016
6017
6018 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6019    the subsidiary that has the same eol-spec as PARENT (if it is not
6020    nil and specifies end-of-line format) or the system's setting
6021    (system_eol_type).  */
6022
6023 Lisp_Object
6024 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6025 {
6026   Lisp_Object spec, eol_type;
6027
6028   if (NILP (coding_system))
6029     coding_system = Qraw_text;
6030   spec = CODING_SYSTEM_SPEC (coding_system);
6031   eol_type = AREF (spec, 2);
6032   if (VECTORP (eol_type))
6033     {
6034       Lisp_Object parent_eol_type;
6035
6036       if (! NILP (parent))
6037         {
6038           Lisp_Object parent_spec;
6039
6040           parent_spec = CODING_SYSTEM_SPEC (parent);
6041           parent_eol_type = AREF (parent_spec, 2);
6042           if (VECTORP (parent_eol_type))
6043             parent_eol_type = system_eol_type;
6044         }
6045       else
6046         parent_eol_type = system_eol_type;
6047       if (EQ (parent_eol_type, Qunix))
6048         coding_system = AREF (eol_type, 0);
6049       else if (EQ (parent_eol_type, Qdos))
6050         coding_system = AREF (eol_type, 1);
6051       else if (EQ (parent_eol_type, Qmac))
6052         coding_system = AREF (eol_type, 2);
6053     }
6054   return coding_system;
6055 }
6056
6057
6058 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6059    decided for writing to a process.  If not, complement them, and
6060    return a new coding system.  */
6061
6062 Lisp_Object
6063 complement_process_encoding_system (Lisp_Object coding_system)
6064 {
6065   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6066   Lisp_Object spec, attrs;
6067   int i;
6068
6069   for (i = 0; i < 3; i++)
6070     {
6071       if (i == 1)
6072         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6073       else if (i == 2)
6074         coding_system = preferred_coding_system ();
6075       spec = CODING_SYSTEM_SPEC (coding_system);
6076       if (NILP (spec))
6077         continue;
6078       attrs = AREF (spec, 0);
6079       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6080         coding_base = CODING_ATTR_BASE_NAME (attrs);
6081       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6082         eol_base = coding_system;
6083       if (! NILP (coding_base) && ! NILP (eol_base))
6084         break;
6085     }
6086
6087   if (i > 0)
6088     /* The original CODING_SYSTEM didn't specify text-conversion or
6089        eol-conversion.  Be sure that we return a fully complemented
6090        coding system.  */
6091     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6092   return coding_system;
6093 }
6094
6095
6096 /* Emacs has a mechanism to automatically detect a coding system if it
6097    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6098    it's impossible to distinguish some coding systems accurately
6099    because they use the same range of codes.  So, at first, coding
6100    systems are categorized into 7, those are:
6101
6102    o coding-category-emacs-mule
6103
6104         The category for a coding system which has the same code range
6105         as Emacs' internal format.  Assigned the coding-system (Lisp
6106         symbol) `emacs-mule' by default.
6107
6108    o coding-category-sjis
6109
6110         The category for a coding system which has the same code range
6111         as SJIS.  Assigned the coding-system (Lisp
6112         symbol) `japanese-shift-jis' by default.
6113
6114    o coding-category-iso-7
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 7-bit environment.  This doesn't use any locking
6118         shift and single shift functions.  This can encode/decode all
6119         charsets.  Assigned the coding-system (Lisp symbol)
6120         `iso-2022-7bit' by default.
6121
6122    o coding-category-iso-7-tight
6123
6124         Same as coding-category-iso-7 except that this can
6125         encode/decode only the specified charsets.
6126
6127    o coding-category-iso-8-1
6128
6129         The category for a coding system which has the same code range
6130         as ISO2022 of 8-bit environment and graphic plane 1 used only
6131         for DIMENSION1 charset.  This doesn't use any locking shift
6132         and single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `iso-latin-1' by default.
6134
6135    o coding-category-iso-8-2
6136
6137         The category for a coding system which has the same code range
6138         as ISO2022 of 8-bit environment and graphic plane 1 used only
6139         for DIMENSION2 charset.  This doesn't use any locking shift
6140         and single shift functions.  Assigned the coding-system (Lisp
6141         symbol) `japanese-iso-8bit' by default.
6142
6143    o coding-category-iso-7-else
6144
6145         The category for a coding system which has the same code range
6146         as ISO2022 of 7-bit environment but uses locking shift or
6147         single shift functions.  Assigned the coding-system (Lisp
6148         symbol) `iso-2022-7bit-lock' by default.
6149
6150    o coding-category-iso-8-else
6151
6152         The category for a coding system which has the same code range
6153         as ISO2022 of 8-bit environment but uses locking shift or
6154         single shift functions.  Assigned the coding-system (Lisp
6155         symbol) `iso-2022-8bit-ss2' by default.
6156
6157    o coding-category-big5
6158
6159         The category for a coding system which has the same code range
6160         as BIG5.  Assigned the coding-system (Lisp symbol)
6161         `cn-big5' by default.
6162
6163    o coding-category-utf-8
6164
6165         The category for a coding system which has the same code range
6166         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6167         symbol) `utf-8' by default.
6168
6169    o coding-category-utf-16-be
6170
6171         The category for a coding system in which a text has an
6172         Unicode signature (cf. Unicode Standard) in the order of BIG
6173         endian at the head.  Assigned the coding-system (Lisp symbol)
6174         `utf-16-be' by default.
6175
6176    o coding-category-utf-16-le
6177
6178         The category for a coding system in which a text has an
6179         Unicode signature (cf. Unicode Standard) in the order of
6180         LITTLE endian at the head.  Assigned the coding-system (Lisp
6181         symbol) `utf-16-le' by default.
6182
6183    o coding-category-ccl
6184
6185         The category for a coding system of which encoder/decoder is
6186         written in CCL programs.  The default value is nil, i.e., no
6187         coding system is assigned.
6188
6189    o coding-category-binary
6190
6191         The category for a coding system not categorized in any of the
6192         above.  Assigned the coding-system (Lisp symbol)
6193         `no-conversion' by default.
6194
6195    Each of them is a Lisp symbol and the value is an actual
6196    `coding-system's (this is also a Lisp symbol) assigned by a user.
6197    What Emacs does actually is to detect a category of coding system.
6198    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6199    decide only one possible category, it selects a category of the
6200    highest priority.  Priorities of categories are also specified by a
6201    user in a Lisp variable `coding-category-list'.
6202
6203 */
6204
6205 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6206                                            int eol_seen);
6207
6208
6209 /* Return the number of ASCII characters at the head of the source.
6210    By side effects, set coding->head_ascii and update
6211    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6212    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6213    reliable only when all the source bytes are ASCII.  */
6214
6215 static ptrdiff_t
6216 check_ascii (struct coding_system *coding)
6217 {
6218   const unsigned char *src, *end;
6219   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6220   int eol_seen = coding->eol_seen;
6221
6222   coding_set_source (coding);
6223   src = coding->source;
6224   end = src + coding->src_bytes;
6225
6226   if (inhibit_eol_conversion
6227       || SYMBOLP (eol_type))
6228     {
6229       /* We don't have to check EOL format.  */
6230       while (src < end && !( *src & 0x80))
6231         {
6232           if (*src++ == '\n')
6233             eol_seen |= EOL_SEEN_LF;
6234         }
6235     }
6236   else
6237     {
6238       end--;                /* We look ahead one byte for "CR LF".  */
6239       while (src < end)
6240         {
6241           int c = *src;
6242
6243           if (c & 0x80)
6244             break;
6245           src++;
6246           if (c == '\r')
6247             {
6248               if (*src == '\n')
6249                 {
6250                   eol_seen |= EOL_SEEN_CRLF;
6251                   src++;
6252                 }
6253               else
6254                 eol_seen |= EOL_SEEN_CR;
6255             }
6256           else if (c == '\n')
6257             eol_seen |= EOL_SEEN_LF;
6258         }
6259       if (src == end)
6260         {
6261           int c = *src;
6262
6263           /* All bytes but the last one C are ASCII.  */
6264           if (! (c & 0x80))
6265             {
6266               if (c == '\r')
6267                 eol_seen |= EOL_SEEN_CR;
6268               else if (c  == '\n')
6269                 eol_seen |= EOL_SEEN_LF;
6270               src++;
6271             }
6272         }
6273     }
6274   coding->head_ascii = src - coding->source;
6275   coding->eol_seen = eol_seen;
6276   return (coding->head_ascii);
6277 }
6278
6279
6280 /* Return the number of characters at the source if all the bytes are
6281    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6282    effects, update coding->eol_seen.  The value of coding->eol_seen is
6283    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6284    the value is reliable only when all the source bytes are valid
6285    UTF-8.  */
6286
6287 static ptrdiff_t
6288 check_utf_8 (struct coding_system *coding)
6289 {
6290   const unsigned char *src, *end;
6291   int eol_seen;
6292   ptrdiff_t nchars = coding->head_ascii;
6293
6294   if (coding->head_ascii < 0)
6295     check_ascii (coding);
6296   else
6297     coding_set_source (coding);
6298   src = coding->source + coding->head_ascii;
6299   /* We look ahead one byte for CR LF.  */
6300   end = coding->source + coding->src_bytes - 1;
6301   eol_seen = coding->eol_seen;
6302   while (src < end)
6303     {
6304       int c = *src;
6305
6306       if (UTF_8_1_OCTET_P (*src))
6307         {
6308           src++;
6309           if (c < 0x20)
6310             {
6311               if (c == '\r')
6312                 {
6313                   if (*src == '\n')
6314                     {
6315                       eol_seen |= EOL_SEEN_CRLF;
6316                       src++;
6317                       nchars++;
6318                     }
6319                   else
6320                     eol_seen |= EOL_SEEN_CR;
6321                 }
6322               else if (c == '\n')
6323                 eol_seen |= EOL_SEEN_LF;
6324             }
6325         }
6326       else if (UTF_8_2_OCTET_LEADING_P (c))
6327         {
6328           if (c < 0xC2          /* overlong sequence */
6329               || src + 1 >= end
6330               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6331             return -1;
6332           src += 2;
6333         }
6334       else if (UTF_8_3_OCTET_LEADING_P (c))
6335         {
6336           if (src + 2 >= end
6337               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6338                     && UTF_8_EXTRA_OCTET_P (src[2])))
6339             return -1;
6340           c = (((c & 0xF) << 12)
6341                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6342           if (c < 0x800                       /* overlong sequence */
6343               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6344             return -1;
6345           src += 3;
6346         }
6347       else if (UTF_8_4_OCTET_LEADING_P (c))
6348         {
6349           if (src + 3 >= end
6350               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6351                     && UTF_8_EXTRA_OCTET_P (src[2])
6352                     && UTF_8_EXTRA_OCTET_P (src[3])))
6353             return -1;
6354           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6355                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6356           if (c < 0x10000       /* overlong sequence */
6357               || c >= 0x110000) /* non-Unicode character  */
6358             return -1;
6359           src += 4;
6360         }
6361       else
6362         return -1;
6363       nchars++;
6364     }
6365
6366   if (src == end)
6367     {
6368       if (! UTF_8_1_OCTET_P (*src))
6369         return -1;
6370       nchars++;
6371       if (*src == '\r')
6372         eol_seen |= EOL_SEEN_CR;
6373       else if (*src  == '\n')
6374         eol_seen |= EOL_SEEN_LF;
6375     }
6376   coding->eol_seen = eol_seen;
6377   return nchars;
6378 }
6379
6380
6381 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6382    SOURCE is encoded.  If CATEGORY is one of
6383    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6384    two-byte, else they are encoded by one-byte.
6385
6386    Return one of EOL_SEEN_XXX.  */
6387
6388 #define MAX_EOL_CHECK_COUNT 3
6389
6390 static int
6391 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6392             enum coding_category category)
6393 {
6394   const unsigned char *src = source, *src_end = src + src_bytes;
6395   unsigned char c;
6396   int total  = 0;
6397   int eol_seen = EOL_SEEN_NONE;
6398
6399   if ((1 << category) & CATEGORY_MASK_UTF_16)
6400     {
6401       bool msb = category == (coding_category_utf_16_le
6402                               | coding_category_utf_16_le_nosig);
6403       bool lsb = !msb;
6404
6405       while (src + 1 < src_end)
6406         {
6407           c = src[lsb];
6408           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6409             {
6410               int this_eol;
6411
6412               if (c == '\n')
6413                 this_eol = EOL_SEEN_LF;
6414               else if (src + 3 >= src_end
6415                        || src[msb + 2] != 0
6416                        || src[lsb + 2] != '\n')
6417                 this_eol = EOL_SEEN_CR;
6418               else
6419                 {
6420                   this_eol = EOL_SEEN_CRLF;
6421                   src += 2;
6422                 }
6423
6424               if (eol_seen == EOL_SEEN_NONE)
6425                 /* This is the first end-of-line.  */
6426                 eol_seen = this_eol;
6427               else if (eol_seen != this_eol)
6428                 {
6429                   /* The found type is different from what found before.
6430                      Allow for stray ^M characters in DOS EOL files.  */
6431                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6432                       || (eol_seen == EOL_SEEN_CRLF
6433                           && this_eol == EOL_SEEN_CR))
6434                     eol_seen = EOL_SEEN_CRLF;
6435                   else
6436                     {
6437                       eol_seen = EOL_SEEN_LF;
6438                       break;
6439                     }
6440                 }
6441               if (++total == MAX_EOL_CHECK_COUNT)
6442                 break;
6443             }
6444           src += 2;
6445         }
6446     }
6447   else
6448     while (src < src_end)
6449       {
6450         c = *src++;
6451         if (c == '\n' || c == '\r')
6452           {
6453             int this_eol;
6454
6455             if (c == '\n')
6456               this_eol = EOL_SEEN_LF;
6457             else if (src >= src_end || *src != '\n')
6458               this_eol = EOL_SEEN_CR;
6459             else
6460               this_eol = EOL_SEEN_CRLF, src++;
6461
6462             if (eol_seen == EOL_SEEN_NONE)
6463               /* This is the first end-of-line.  */
6464               eol_seen = this_eol;
6465             else if (eol_seen != this_eol)
6466               {
6467                 /* The found type is different from what found before.
6468                    Allow for stray ^M characters in DOS EOL files.  */
6469                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6470                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6471                   eol_seen = EOL_SEEN_CRLF;
6472                 else
6473                   {
6474                     eol_seen = EOL_SEEN_LF;
6475                     break;
6476                   }
6477               }
6478             if (++total == MAX_EOL_CHECK_COUNT)
6479               break;
6480           }
6481       }
6482   return eol_seen;
6483 }
6484
6485
6486 static Lisp_Object
6487 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6488 {
6489   Lisp_Object eol_type;
6490
6491   eol_type = CODING_ID_EOL_TYPE (coding->id);
6492   if (! VECTORP (eol_type))
6493     /* Already adjusted.  */
6494     return eol_type;
6495   if (eol_seen & EOL_SEEN_LF)
6496     {
6497       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6498       eol_type = Qunix;
6499     }
6500   else if (eol_seen & EOL_SEEN_CRLF)
6501     {
6502       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6503       eol_type = Qdos;
6504     }
6505   else if (eol_seen & EOL_SEEN_CR)
6506     {
6507       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6508       eol_type = Qmac;
6509     }
6510   return eol_type;
6511 }
6512
6513 /* Detect how a text specified in CODING is encoded.  If a coding
6514    system is detected, update fields of CODING by the detected coding
6515    system.  */
6516
6517 static void
6518 detect_coding (struct coding_system *coding)
6519 {
6520   const unsigned char *src, *src_end;
6521   unsigned int saved_mode = coding->mode;
6522   Lisp_Object found = Qnil;
6523   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6524
6525   coding->consumed = coding->consumed_char = 0;
6526   coding->produced = coding->produced_char = 0;
6527   coding_set_source (coding);
6528
6529   src_end = coding->source + coding->src_bytes;
6530
6531   coding->eol_seen = EOL_SEEN_NONE;
6532   /* If we have not yet decided the text encoding type, detect it
6533      now.  */
6534   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6535     {
6536       int c, i;
6537       struct coding_detection_info detect_info;
6538       bool null_byte_found = 0, eight_bit_found = 0;
6539       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6540                                        inhibit_null_byte_detection);
6541       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6542                                        inhibit_iso_escape_detection);
6543       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6544
6545       coding->head_ascii = 0;
6546       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6547       for (src = coding->source; src < src_end; src++)
6548         {
6549           c = *src;
6550           if (c & 0x80)
6551             {
6552               eight_bit_found = 1;
6553               if (null_byte_found)
6554                 break;
6555             }
6556           else if (c < 0x20)
6557             {
6558               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6559                   && ! inhibit_ied
6560                   && ! detect_info.checked)
6561                 {
6562                   if (detect_coding_iso_2022 (coding, &detect_info))
6563                     {
6564                       /* We have scanned the whole data.  */
6565                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6566                         {
6567                           /* We didn't find an 8-bit code.  We may
6568                              have found a null-byte, but it's very
6569                              rare that a binary file conforms to
6570                              ISO-2022.  */
6571                           src = src_end;
6572                           coding->head_ascii = src - coding->source;
6573                         }
6574                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6575                       break;
6576                     }
6577                 }
6578               else if (! c && !inhibit_nbd)
6579                 {
6580                   null_byte_found = 1;
6581                   if (eight_bit_found)
6582                     break;
6583                 }
6584               else if (! disable_ascii_optimization
6585                        && ! inhibit_eol_conversion)
6586                 {
6587                   if (c == '\r')
6588                     {
6589                       if (src < src_end && src[1] == '\n')
6590                         {
6591                           coding->eol_seen |= EOL_SEEN_CRLF;
6592                           src++;
6593                           if (! eight_bit_found)
6594                             coding->head_ascii++;
6595                         }
6596                       else
6597                         coding->eol_seen |= EOL_SEEN_CR;
6598                     }
6599                   else if (c == '\n')
6600                     {
6601                       coding->eol_seen |= EOL_SEEN_LF;
6602                     }
6603                 }
6604
6605               if (! eight_bit_found)
6606                 coding->head_ascii++;
6607             }
6608           else if (! eight_bit_found)
6609             coding->head_ascii++;
6610         }
6611
6612       if (null_byte_found || eight_bit_found
6613           || coding->head_ascii < coding->src_bytes
6614           || detect_info.found)
6615         {
6616           enum coding_category category;
6617           struct coding_system *this;
6618
6619           if (coding->head_ascii == coding->src_bytes)
6620             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6621             for (i = 0; i < coding_category_raw_text; i++)
6622               {
6623                 category = coding_priorities[i];
6624                 this = coding_categories + category;
6625                 if (detect_info.found & (1 << category))
6626                   break;
6627               }
6628           else
6629             {
6630               if (null_byte_found)
6631                 {
6632                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6633                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6634                 }
6635               else if (prefer_utf_8
6636                        && detect_coding_utf_8 (coding, &detect_info))
6637                 {
6638                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6639                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6640                 }
6641               for (i = 0; i < coding_category_raw_text; i++)
6642                 {
6643                   category = coding_priorities[i];
6644                   this = coding_categories + category;
6645                   /* Some of this->detector (e.g. detect_coding_sjis)
6646                      require this information.  */
6647                   coding->id = this->id;
6648                   if (this->id < 0)
6649                     {
6650                       /* No coding system of this category is defined.  */
6651                       detect_info.rejected |= (1 << category);
6652                     }
6653                   else if (category >= coding_category_raw_text)
6654                     continue;
6655                   else if (detect_info.checked & (1 << category))
6656                     {
6657                       if (detect_info.found & (1 << category))
6658                         break;
6659                     }
6660                   else if ((*(this->detector)) (coding, &detect_info)
6661                            && detect_info.found & (1 << category))
6662                     break;
6663                 }
6664             }
6665
6666           if (i < coding_category_raw_text)
6667             {
6668               if (category == coding_category_utf_8_auto)
6669                 {
6670                   Lisp_Object coding_systems;
6671
6672                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6673                                          coding_attr_utf_bom);
6674                   if (CONSP (coding_systems))
6675                     {
6676                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6677                         found = XCAR (coding_systems);
6678                       else
6679                         found = XCDR (coding_systems);
6680                     }
6681                   else
6682                     found = CODING_ID_NAME (this->id);
6683                 }
6684               else if (category == coding_category_utf_16_auto)
6685                 {
6686                   Lisp_Object coding_systems;
6687
6688                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6689                                          coding_attr_utf_bom);
6690                   if (CONSP (coding_systems))
6691                     {
6692                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6693                         found = XCAR (coding_systems);
6694                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6695                         found = XCDR (coding_systems);
6696                     }
6697                   else
6698                     found = CODING_ID_NAME (this->id);
6699                 }
6700               else
6701                 found = CODING_ID_NAME (this->id);
6702             }
6703           else if (null_byte_found)
6704             found = Qno_conversion;
6705           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6706                    == CATEGORY_MASK_ANY)
6707             found = Qraw_text;
6708           else if (detect_info.rejected)
6709             for (i = 0; i < coding_category_raw_text; i++)
6710               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6711                 {
6712                   this = coding_categories + coding_priorities[i];
6713                   found = CODING_ID_NAME (this->id);
6714                   break;
6715                 }
6716         }
6717     }
6718   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6719            == coding_category_utf_8_auto)
6720     {
6721       Lisp_Object coding_systems;
6722       struct coding_detection_info detect_info;
6723
6724       coding_systems
6725         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6726       detect_info.found = detect_info.rejected = 0;
6727       if (check_ascii (coding) == coding->src_bytes)
6728         {
6729           if (CONSP (coding_systems))
6730             found = XCDR (coding_systems);
6731         }
6732       else
6733         {
6734           if (CONSP (coding_systems)
6735               && detect_coding_utf_8 (coding, &detect_info))
6736             {
6737               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6738                 found = XCAR (coding_systems);
6739               else
6740                 found = XCDR (coding_systems);
6741             }
6742         }
6743     }
6744   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6745            == coding_category_utf_16_auto)
6746     {
6747       Lisp_Object coding_systems;
6748       struct coding_detection_info detect_info;
6749
6750       coding_systems
6751         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6752       detect_info.found = detect_info.rejected = 0;
6753       coding->head_ascii = 0;
6754       if (CONSP (coding_systems)
6755           && detect_coding_utf_16 (coding, &detect_info))
6756         {
6757           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6758             found = XCAR (coding_systems);
6759           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6760             found = XCDR (coding_systems);
6761         }
6762     }
6763
6764   if (! NILP (found))
6765     {
6766       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6767                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6768                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6769                            : EOL_SEEN_LF);
6770
6771       setup_coding_system (found, coding);
6772       if (specified_eol != EOL_SEEN_NONE)
6773         adjust_coding_eol_type (coding, specified_eol);
6774     }
6775
6776   coding->mode = saved_mode;
6777 }
6778
6779
6780 static void
6781 decode_eol (struct coding_system *coding)
6782 {
6783   Lisp_Object eol_type;
6784   unsigned char *p, *pbeg, *pend;
6785
6786   eol_type = CODING_ID_EOL_TYPE (coding->id);
6787   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6788     return;
6789
6790   if (NILP (coding->dst_object))
6791     pbeg = coding->destination;
6792   else
6793     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6794   pend = pbeg + coding->produced;
6795
6796   if (VECTORP (eol_type))
6797     {
6798       int eol_seen = EOL_SEEN_NONE;
6799
6800       for (p = pbeg; p < pend; p++)
6801         {
6802           if (*p == '\n')
6803             eol_seen |= EOL_SEEN_LF;
6804           else if (*p == '\r')
6805             {
6806               if (p + 1 < pend && *(p + 1) == '\n')
6807                 {
6808                   eol_seen |= EOL_SEEN_CRLF;
6809                   p++;
6810                 }
6811               else
6812                 eol_seen |= EOL_SEEN_CR;
6813             }
6814         }
6815       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6816       if ((eol_seen & EOL_SEEN_CRLF) != 0
6817           && (eol_seen & EOL_SEEN_CR) != 0
6818           && (eol_seen & EOL_SEEN_LF) == 0)
6819         eol_seen = EOL_SEEN_CRLF;
6820       else if (eol_seen != EOL_SEEN_NONE
6821           && eol_seen != EOL_SEEN_LF
6822           && eol_seen != EOL_SEEN_CRLF
6823           && eol_seen != EOL_SEEN_CR)
6824         eol_seen = EOL_SEEN_LF;
6825       if (eol_seen != EOL_SEEN_NONE)
6826         eol_type = adjust_coding_eol_type (coding, eol_seen);
6827     }
6828
6829   if (EQ (eol_type, Qmac))
6830     {
6831       for (p = pbeg; p < pend; p++)
6832         if (*p == '\r')
6833           *p = '\n';
6834     }
6835   else if (EQ (eol_type, Qdos))
6836     {
6837       ptrdiff_t n = 0;
6838
6839       if (NILP (coding->dst_object))
6840         {
6841           /* Start deleting '\r' from the tail to minimize the memory
6842              movement.  */
6843           for (p = pend - 2; p >= pbeg; p--)
6844             if (*p == '\r')
6845               {
6846                 memmove (p, p + 1, pend-- - p - 1);
6847                 n++;
6848               }
6849         }
6850       else
6851         {
6852           ptrdiff_t pos_byte = coding->dst_pos_byte;
6853           ptrdiff_t pos = coding->dst_pos;
6854           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6855
6856           while (pos < pos_end)
6857             {
6858               p = BYTE_POS_ADDR (pos_byte);
6859               if (*p == '\r' && p[1] == '\n')
6860                 {
6861                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6862                   n++;
6863                   pos_end--;
6864                 }
6865               pos++;
6866               if (coding->dst_multibyte)
6867                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6868               else
6869                 pos_byte++;
6870             }
6871         }
6872       coding->produced -= n;
6873       coding->produced_char -= n;
6874     }
6875 }
6876
6877
6878 /* Return a translation table (or list of them) from coding system
6879    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6880    not ENCODEP). */
6881
6882 static Lisp_Object
6883 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6884 {
6885   Lisp_Object standard, translation_table;
6886   Lisp_Object val;
6887
6888   if (NILP (Venable_character_translation))
6889     {
6890       if (max_lookup)
6891         *max_lookup = 0;
6892       return Qnil;
6893     }
6894   if (encodep)
6895     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6896       standard = Vstandard_translation_table_for_encode;
6897   else
6898     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6899       standard = Vstandard_translation_table_for_decode;
6900   if (NILP (translation_table))
6901     translation_table = standard;
6902   else
6903     {
6904       if (SYMBOLP (translation_table))
6905         translation_table = Fget (translation_table, Qtranslation_table);
6906       else if (CONSP (translation_table))
6907         {
6908           translation_table = Fcopy_sequence (translation_table);
6909           for (val = translation_table; CONSP (val); val = XCDR (val))
6910             if (SYMBOLP (XCAR (val)))
6911               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6912         }
6913       if (CHAR_TABLE_P (standard))
6914         {
6915           if (CONSP (translation_table))
6916             translation_table = nconc2 (translation_table, list1 (standard));
6917           else
6918             translation_table = list2 (translation_table, standard);
6919         }
6920     }
6921
6922   if (max_lookup)
6923     {
6924       *max_lookup = 1;
6925       if (CHAR_TABLE_P (translation_table)
6926           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6927         {
6928           val = XCHAR_TABLE (translation_table)->extras[1];
6929           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6930             *max_lookup = XFASTINT (val);
6931         }
6932       else if (CONSP (translation_table))
6933         {
6934           Lisp_Object tail;
6935
6936           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6937             if (CHAR_TABLE_P (XCAR (tail))
6938                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6939               {
6940                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6941                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6942                   *max_lookup = XFASTINT (tailval);
6943               }
6944         }
6945     }
6946   return translation_table;
6947 }
6948
6949 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6950   do {                                                          \
6951     trans = Qnil;                                               \
6952     if (CHAR_TABLE_P (table))                                   \
6953       {                                                         \
6954         trans = CHAR_TABLE_REF (table, c);                      \
6955         if (CHARACTERP (trans))                                 \
6956           c = XFASTINT (trans), trans = Qnil;                   \
6957       }                                                         \
6958     else if (CONSP (table))                                     \
6959       {                                                         \
6960         Lisp_Object tail;                                       \
6961                                                                 \
6962         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6963           if (CHAR_TABLE_P (XCAR (tail)))                       \
6964             {                                                   \
6965               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6966               if (CHARACTERP (trans))                           \
6967                 c = XFASTINT (trans), trans = Qnil;             \
6968               else if (! NILP (trans))                          \
6969                 break;                                          \
6970             }                                                   \
6971       }                                                         \
6972   } while (0)
6973
6974
6975 /* Return a translation of character(s) at BUF according to TRANS.
6976    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6977    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6978    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6979    translation is found, and Qnil if not found..
6980    If BUF is too short to lookup characters in FROM, return Qt.  */
6981
6982 static Lisp_Object
6983 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6984 {
6985
6986   if (INTEGERP (trans))
6987     return trans;
6988   for (; CONSP (trans); trans = XCDR (trans))
6989     {
6990       Lisp_Object val = XCAR (trans);
6991       Lisp_Object from = XCAR (val);
6992       ptrdiff_t len = ASIZE (from);
6993       ptrdiff_t i;
6994
6995       for (i = 0; i < len; i++)
6996         {
6997           if (buf + i == buf_end)
6998             return Qt;
6999           if (XINT (AREF (from, i)) != buf[i])
7000             break;
7001         }
7002       if (i == len)
7003         return val;
7004     }
7005   return Qnil;
7006 }
7007
7008
7009 static int
7010 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7011                bool last_block)
7012 {
7013   unsigned char *dst = coding->destination + coding->produced;
7014   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7015   ptrdiff_t produced;
7016   ptrdiff_t produced_chars = 0;
7017   int carryover = 0;
7018
7019   if (! coding->chars_at_source)
7020     {
7021       /* Source characters are in coding->charbuf.  */
7022       int *buf = coding->charbuf;
7023       int *buf_end = buf + coding->charbuf_used;
7024
7025       if (EQ (coding->src_object, coding->dst_object))
7026         {
7027           coding_set_source (coding);
7028           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7029         }
7030
7031       while (buf < buf_end)
7032         {
7033           int c = *buf;
7034           ptrdiff_t i;
7035
7036           if (c >= 0)
7037             {
7038               ptrdiff_t from_nchars = 1, to_nchars = 1;
7039               Lisp_Object trans = Qnil;
7040
7041               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7042               if (! NILP (trans))
7043                 {
7044                   trans = get_translation (trans, buf, buf_end);
7045                   if (INTEGERP (trans))
7046                     c = XINT (trans);
7047                   else if (CONSP (trans))
7048                     {
7049                       from_nchars = ASIZE (XCAR (trans));
7050                       trans = XCDR (trans);
7051                       if (INTEGERP (trans))
7052                         c = XINT (trans);
7053                       else
7054                         {
7055                           to_nchars = ASIZE (trans);
7056                           c = XINT (AREF (trans, 0));
7057                         }
7058                     }
7059                   else if (EQ (trans, Qt) && ! last_block)
7060                     break;
7061                 }
7062
7063               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7064                 {
7065                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7066                        / MAX_MULTIBYTE_LENGTH)
7067                       < to_nchars)
7068                     memory_full (SIZE_MAX);
7069                   dst = alloc_destination (coding,
7070                                            buf_end - buf
7071                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7072                                            dst);
7073                   if (EQ (coding->src_object, coding->dst_object))
7074                     {
7075                       coding_set_source (coding);
7076                       dst_end = (((unsigned char *) coding->source)
7077                                  + coding->consumed);
7078                     }
7079                   else
7080                     dst_end = coding->destination + coding->dst_bytes;
7081                 }
7082
7083               for (i = 0; i < to_nchars; i++)
7084                 {
7085                   if (i > 0)
7086                     c = XINT (AREF (trans, i));
7087                   if (coding->dst_multibyte
7088                       || ! CHAR_BYTE8_P (c))
7089                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7090                   else
7091                     *dst++ = CHAR_TO_BYTE8 (c);
7092                 }
7093               produced_chars += to_nchars;
7094               buf += from_nchars;
7095             }
7096           else
7097             /* This is an annotation datum.  (-C) is the length.  */
7098             buf += -c;
7099         }
7100       carryover = buf_end - buf;
7101     }
7102   else
7103     {
7104       /* Source characters are at coding->source.  */
7105       const unsigned char *src = coding->source;
7106       const unsigned char *src_end = src + coding->consumed;
7107
7108       if (EQ (coding->dst_object, coding->src_object))
7109         dst_end = (unsigned char *) src;
7110       if (coding->src_multibyte != coding->dst_multibyte)
7111         {
7112           if (coding->src_multibyte)
7113             {
7114               bool multibytep = 1;
7115               ptrdiff_t consumed_chars = 0;
7116
7117               while (1)
7118                 {
7119                   const unsigned char *src_base = src;
7120                   int c;
7121
7122                   ONE_MORE_BYTE (c);
7123                   if (dst == dst_end)
7124                     {
7125                       if (EQ (coding->src_object, coding->dst_object))
7126                         dst_end = (unsigned char *) src;
7127                       if (dst == dst_end)
7128                         {
7129                           ptrdiff_t offset = src - coding->source;
7130
7131                           dst = alloc_destination (coding, src_end - src + 1,
7132                                                    dst);
7133                           dst_end = coding->destination + coding->dst_bytes;
7134                           coding_set_source (coding);
7135                           src = coding->source + offset;
7136                           src_end = coding->source + coding->consumed;
7137                           if (EQ (coding->src_object, coding->dst_object))
7138                             dst_end = (unsigned char *) src;
7139                         }
7140                     }
7141                   *dst++ = c;
7142                   produced_chars++;
7143                 }
7144             no_more_source:
7145               ;
7146             }
7147           else
7148             while (src < src_end)
7149               {
7150                 bool multibytep = 1;
7151                 int c = *src++;
7152
7153                 if (dst >= dst_end - 1)
7154                   {
7155                     if (EQ (coding->src_object, coding->dst_object))
7156                       dst_end = (unsigned char *) src;
7157                     if (dst >= dst_end - 1)
7158                       {
7159                         ptrdiff_t offset = src - coding->source;
7160                         ptrdiff_t more_bytes;
7161
7162                         if (EQ (coding->src_object, coding->dst_object))
7163                           more_bytes = ((src_end - src) / 2) + 2;
7164                         else
7165                           more_bytes = src_end - src + 2;
7166                         dst = alloc_destination (coding, more_bytes, dst);
7167                         dst_end = coding->destination + coding->dst_bytes;
7168                         coding_set_source (coding);
7169                         src = coding->source + offset;
7170                         src_end = coding->source + coding->consumed;
7171                         if (EQ (coding->src_object, coding->dst_object))
7172                           dst_end = (unsigned char *) src;
7173                       }
7174                   }
7175                 EMIT_ONE_BYTE (c);
7176               }
7177         }
7178       else
7179         {
7180           if (!EQ (coding->src_object, coding->dst_object))
7181             {
7182               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7183
7184               if (require > 0)
7185                 {
7186                   ptrdiff_t offset = src - coding->source;
7187
7188                   dst = alloc_destination (coding, require, dst);
7189                   coding_set_source (coding);
7190                   src = coding->source + offset;
7191                   src_end = coding->source + coding->consumed;
7192                 }
7193             }
7194           produced_chars = coding->consumed_char;
7195           while (src < src_end)
7196             *dst++ = *src++;
7197         }
7198     }
7199
7200   produced = dst - (coding->destination + coding->produced);
7201   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7202     insert_from_gap (produced_chars, produced, 0);
7203   coding->produced += produced;
7204   coding->produced_char += produced_chars;
7205   return carryover;
7206 }
7207
7208 /* Compose text in CODING->object according to the annotation data at
7209    CHARBUF.  CHARBUF is an array:
7210      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7211  */
7212
7213 static void
7214 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7215 {
7216   int len;
7217   ptrdiff_t to;
7218   enum composition_method method;
7219   Lisp_Object components;
7220
7221   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7222   to = pos + charbuf[2];
7223   method = (enum composition_method) (charbuf[4]);
7224
7225   if (method == COMPOSITION_RELATIVE)
7226     components = Qnil;
7227   else
7228     {
7229       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7230       int i, j;
7231
7232       if (method == COMPOSITION_WITH_RULE)
7233         len = charbuf[2] * 3 - 2;
7234       charbuf += MAX_ANNOTATION_LENGTH;
7235       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7236       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7237         {
7238           if (charbuf[i] >= 0)
7239             args[j] = make_number (charbuf[i]);
7240           else
7241             {
7242               i++;
7243               args[j] = make_number (charbuf[i] % 0x100);
7244             }
7245         }
7246       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7247     }
7248   compose_text (pos, to, components, Qnil, coding->dst_object);
7249 }
7250
7251
7252 /* Put `charset' property on text in CODING->object according to
7253    the annotation data at CHARBUF.  CHARBUF is an array:
7254      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7255  */
7256
7257 static void
7258 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7259 {
7260   ptrdiff_t from = pos - charbuf[2];
7261   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7262
7263   Fput_text_property (make_number (from), make_number (pos),
7264                       Qcharset, CHARSET_NAME (charset),
7265                       coding->dst_object);
7266 }
7267
7268
7269 #define CHARBUF_SIZE 0x4000
7270
7271 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7272   do {                                                                  \
7273     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
7274     coding->charbuf_size = CHARBUF_SIZE;                                \
7275   } while (0)
7276
7277
7278 static void
7279 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7280 {
7281   int *charbuf = coding->charbuf;
7282   int *charbuf_end = charbuf + coding->charbuf_used;
7283
7284   if (NILP (coding->dst_object))
7285     return;
7286
7287   while (charbuf < charbuf_end)
7288     {
7289       if (*charbuf >= 0)
7290         pos++, charbuf++;
7291       else
7292         {
7293           int len = -*charbuf;
7294
7295           if (len > 2)
7296             switch (charbuf[1])
7297               {
7298               case CODING_ANNOTATE_COMPOSITION_MASK:
7299                 produce_composition (coding, charbuf, pos);
7300                 break;
7301               case CODING_ANNOTATE_CHARSET_MASK:
7302                 produce_charset (coding, charbuf, pos);
7303                 break;
7304               }
7305           charbuf += len;
7306         }
7307     }
7308 }
7309
7310 /* Decode the data at CODING->src_object into CODING->dst_object.
7311    CODING->src_object is a buffer, a string, or nil.
7312    CODING->dst_object is a buffer.
7313
7314    If CODING->src_object is a buffer, it must be the current buffer.
7315    In this case, if CODING->src_pos is positive, it is a position of
7316    the source text in the buffer, otherwise, the source text is in the
7317    gap area of the buffer, and CODING->src_pos specifies the offset of
7318    the text from GPT (which must be the same as PT).  If this is the
7319    same buffer as CODING->dst_object, CODING->src_pos must be
7320    negative.
7321
7322    If CODING->src_object is a string, CODING->src_pos is an index to
7323    that string.
7324
7325    If CODING->src_object is nil, CODING->source must already point to
7326    the non-relocatable memory area.  In this case, CODING->src_pos is
7327    an offset from CODING->source.
7328
7329    The decoded data is inserted at the current point of the buffer
7330    CODING->dst_object.
7331 */
7332
7333 static void
7334 decode_coding (struct coding_system *coding)
7335 {
7336   Lisp_Object attrs;
7337   Lisp_Object undo_list;
7338   Lisp_Object translation_table;
7339   struct ccl_spec cclspec;
7340   int carryover;
7341   int i;
7342
7343   USE_SAFE_ALLOCA;
7344
7345   if (BUFFERP (coding->src_object)
7346       && coding->src_pos > 0
7347       && coding->src_pos < GPT
7348       && coding->src_pos + coding->src_chars > GPT)
7349     move_gap_both (coding->src_pos, coding->src_pos_byte);
7350
7351   undo_list = Qt;
7352   if (BUFFERP (coding->dst_object))
7353     {
7354       set_buffer_internal (XBUFFER (coding->dst_object));
7355       if (GPT != PT)
7356         move_gap_both (PT, PT_BYTE);
7357
7358       /* We must disable undo_list in order to record the whole insert
7359          transaction via record_insert at the end.  But doing so also
7360          disables the recording of the first change to the undo_list.
7361          Therefore we check for first change here and record it via
7362          record_first_change if needed.  */
7363       if (MODIFF <= SAVE_MODIFF)
7364         record_first_change ();
7365
7366       undo_list = BVAR (current_buffer, undo_list);
7367       bset_undo_list (current_buffer, Qt);
7368     }
7369
7370   coding->consumed = coding->consumed_char = 0;
7371   coding->produced = coding->produced_char = 0;
7372   coding->chars_at_source = 0;
7373   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7374   coding->errors = 0;
7375
7376   ALLOC_CONVERSION_WORK_AREA (coding);
7377
7378   attrs = CODING_ID_ATTRS (coding->id);
7379   translation_table = get_translation_table (attrs, 0, NULL);
7380
7381   carryover = 0;
7382   if (coding->decoder == decode_coding_ccl)
7383     {
7384       coding->spec.ccl = &cclspec;
7385       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7386     }
7387   do
7388     {
7389       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7390
7391       coding_set_source (coding);
7392       coding->annotated = 0;
7393       coding->charbuf_used = carryover;
7394       (*(coding->decoder)) (coding);
7395       coding_set_destination (coding);
7396       carryover = produce_chars (coding, translation_table, 0);
7397       if (coding->annotated)
7398         produce_annotation (coding, pos);
7399       for (i = 0; i < carryover; i++)
7400         coding->charbuf[i]
7401           = coding->charbuf[coding->charbuf_used - carryover + i];
7402     }
7403   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7404          || (coding->consumed < coding->src_bytes
7405              && (coding->result == CODING_RESULT_SUCCESS
7406                  || coding->result == CODING_RESULT_INVALID_SRC)));
7407
7408   if (carryover > 0)
7409     {
7410       coding_set_destination (coding);
7411       coding->charbuf_used = carryover;
7412       produce_chars (coding, translation_table, 1);
7413     }
7414
7415   coding->carryover_bytes = 0;
7416   if (coding->consumed < coding->src_bytes)
7417     {
7418       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7419       const unsigned char *src;
7420
7421       coding_set_source (coding);
7422       coding_set_destination (coding);
7423       src = coding->source + coding->consumed;
7424
7425       if (coding->mode & CODING_MODE_LAST_BLOCK)
7426         {
7427           /* Flush out unprocessed data as binary chars.  We are sure
7428              that the number of data is less than the size of
7429              coding->charbuf.  */
7430           coding->charbuf_used = 0;
7431           coding->chars_at_source = 0;
7432
7433           while (nbytes-- > 0)
7434             {
7435               int c = *src++;
7436
7437               if (c & 0x80)
7438                 c = BYTE8_TO_CHAR (c);
7439               coding->charbuf[coding->charbuf_used++] = c;
7440             }
7441           produce_chars (coding, Qnil, 1);
7442         }
7443       else
7444         {
7445           /* Record unprocessed bytes in coding->carryover.  We are
7446              sure that the number of data is less than the size of
7447              coding->carryover.  */
7448           unsigned char *p = coding->carryover;
7449
7450           if (nbytes > sizeof coding->carryover)
7451             nbytes = sizeof coding->carryover;
7452           coding->carryover_bytes = nbytes;
7453           while (nbytes-- > 0)
7454             *p++ = *src++;
7455         }
7456       coding->consumed = coding->src_bytes;
7457     }
7458
7459   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7460       && !inhibit_eol_conversion)
7461     decode_eol (coding);
7462   if (BUFFERP (coding->dst_object))
7463     {
7464       bset_undo_list (current_buffer, undo_list);
7465       record_insert (coding->dst_pos, coding->produced_char);
7466     }
7467
7468   SAFE_FREE ();
7469 }
7470
7471
7472 /* Extract an annotation datum from a composition starting at POS and
7473    ending before LIMIT of CODING->src_object (buffer or string), store
7474    the data in BUF, set *STOP to a starting position of the next
7475    composition (if any) or to LIMIT, and return the address of the
7476    next element of BUF.
7477
7478    If such an annotation is not found, set *STOP to a starting
7479    position of a composition after POS (if any) or to LIMIT, and
7480    return BUF.  */
7481
7482 static int *
7483 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7484                                struct coding_system *coding, int *buf,
7485                                ptrdiff_t *stop)
7486 {
7487   ptrdiff_t start, end;
7488   Lisp_Object prop;
7489
7490   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7491       || end > limit)
7492     *stop = limit;
7493   else if (start > pos)
7494     *stop = start;
7495   else
7496     {
7497       if (start == pos)
7498         {
7499           /* We found a composition.  Store the corresponding
7500              annotation data in BUF.  */
7501           int *head = buf;
7502           enum composition_method method = composition_method (prop);
7503           int nchars = COMPOSITION_LENGTH (prop);
7504
7505           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7506           if (method != COMPOSITION_RELATIVE)
7507             {
7508               Lisp_Object components;
7509               ptrdiff_t i, len, i_byte;
7510
7511               components = COMPOSITION_COMPONENTS (prop);
7512               if (VECTORP (components))
7513                 {
7514                   len = ASIZE (components);
7515                   for (i = 0; i < len; i++)
7516                     *buf++ = XINT (AREF (components, i));
7517                 }
7518               else if (STRINGP (components))
7519                 {
7520                   len = SCHARS (components);
7521                   i = i_byte = 0;
7522                   while (i < len)
7523                     {
7524                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7525                       buf++;
7526                     }
7527                 }
7528               else if (INTEGERP (components))
7529                 {
7530                   len = 1;
7531                   *buf++ = XINT (components);
7532                 }
7533               else if (CONSP (components))
7534                 {
7535                   for (len = 0; CONSP (components);
7536                        len++, components = XCDR (components))
7537                     *buf++ = XINT (XCAR (components));
7538                 }
7539               else
7540                 emacs_abort ();
7541               *head -= len;
7542             }
7543         }
7544
7545       if (find_composition (end, limit, &start, &end, &prop,
7546                             coding->src_object)
7547           && end <= limit)
7548         *stop = start;
7549       else
7550         *stop = limit;
7551     }
7552   return buf;
7553 }
7554
7555
7556 /* Extract an annotation datum from a text property `charset' at POS of
7557    CODING->src_object (buffer of string), store the data in BUF, set
7558    *STOP to the position where the value of `charset' property changes
7559    (limiting by LIMIT), and return the address of the next element of
7560    BUF.
7561
7562    If the property value is nil, set *STOP to the position where the
7563    property value is non-nil (limiting by LIMIT), and return BUF.  */
7564
7565 static int *
7566 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7567                            struct coding_system *coding, int *buf,
7568                            ptrdiff_t *stop)
7569 {
7570   Lisp_Object val, next;
7571   int id;
7572
7573   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7574   if (! NILP (val) && CHARSETP (val))
7575     id = XINT (CHARSET_SYMBOL_ID (val));
7576   else
7577     id = -1;
7578   ADD_CHARSET_DATA (buf, 0, id);
7579   next = Fnext_single_property_change (make_number (pos), Qcharset,
7580                                        coding->src_object,
7581                                        make_number (limit));
7582   *stop = XINT (next);
7583   return buf;
7584 }
7585
7586
7587 static void
7588 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7589                int max_lookup)
7590 {
7591   int *buf = coding->charbuf;
7592   int *buf_end = coding->charbuf + coding->charbuf_size;
7593   const unsigned char *src = coding->source + coding->consumed;
7594   const unsigned char *src_end = coding->source + coding->src_bytes;
7595   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7596   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7597   bool multibytep = coding->src_multibyte;
7598   Lisp_Object eol_type;
7599   int c;
7600   ptrdiff_t stop, stop_composition, stop_charset;
7601   int *lookup_buf = NULL;
7602
7603   if (! NILP (translation_table))
7604     lookup_buf = alloca (sizeof (int) * max_lookup);
7605
7606   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7607   if (VECTORP (eol_type))
7608     eol_type = Qunix;
7609
7610   /* Note: composition handling is not yet implemented.  */
7611   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7612
7613   if (NILP (coding->src_object))
7614     stop = stop_composition = stop_charset = end_pos;
7615   else
7616     {
7617       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7618         stop = stop_composition = pos;
7619       else
7620         stop = stop_composition = end_pos;
7621       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7622         stop = stop_charset = pos;
7623       else
7624         stop_charset = end_pos;
7625     }
7626
7627   /* Compensate for CRLF and conversion.  */
7628   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7629   while (buf < buf_end)
7630     {
7631       Lisp_Object trans;
7632
7633       if (pos == stop)
7634         {
7635           if (pos == end_pos)
7636             break;
7637           if (pos == stop_composition)
7638             buf = handle_composition_annotation (pos, end_pos, coding,
7639                                                  buf, &stop_composition);
7640           if (pos == stop_charset)
7641             buf = handle_charset_annotation (pos, end_pos, coding,
7642                                              buf, &stop_charset);
7643           stop = (stop_composition < stop_charset
7644                   ? stop_composition : stop_charset);
7645         }
7646
7647       if (! multibytep)
7648         {
7649           int bytes;
7650
7651           if (coding->encoder == encode_coding_raw_text
7652               || coding->encoder == encode_coding_ccl)
7653             c = *src++, pos++;
7654           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7655             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7656           else
7657             c = BYTE8_TO_CHAR (*src), src++, pos++;
7658         }
7659       else
7660         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7661       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7662         c = '\n';
7663       if (! EQ (eol_type, Qunix))
7664         {
7665           if (c == '\n')
7666             {
7667               if (EQ (eol_type, Qdos))
7668                 *buf++ = '\r';
7669               else
7670                 c = '\r';
7671             }
7672         }
7673
7674       trans = Qnil;
7675       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7676       if (NILP (trans))
7677         *buf++ = c;
7678       else
7679         {
7680           ptrdiff_t from_nchars = 1, to_nchars = 1;
7681           int *lookup_buf_end;
7682           const unsigned char *p = src;
7683           int i;
7684
7685           lookup_buf[0] = c;
7686           for (i = 1; i < max_lookup && p < src_end; i++)
7687             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7688           lookup_buf_end = lookup_buf + i;
7689           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7690           if (INTEGERP (trans))
7691             c = XINT (trans);
7692           else if (CONSP (trans))
7693             {
7694               from_nchars = ASIZE (XCAR (trans));
7695               trans = XCDR (trans);
7696               if (INTEGERP (trans))
7697                 c = XINT (trans);
7698               else
7699                 {
7700                   to_nchars = ASIZE (trans);
7701                   if (buf_end - buf < to_nchars)
7702                     break;
7703                   c = XINT (AREF (trans, 0));
7704                 }
7705             }
7706           else
7707             break;
7708           *buf++ = c;
7709           for (i = 1; i < to_nchars; i++)
7710             *buf++ = XINT (AREF (trans, i));
7711           for (i = 1; i < from_nchars; i++, pos++)
7712             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7713         }
7714     }
7715
7716   coding->consumed = src - coding->source;
7717   coding->consumed_char = pos - coding->src_pos;
7718   coding->charbuf_used = buf - coding->charbuf;
7719   coding->chars_at_source = 0;
7720 }
7721
7722
7723 /* Encode the text at CODING->src_object into CODING->dst_object.
7724    CODING->src_object is a buffer or a string.
7725    CODING->dst_object is a buffer or nil.
7726
7727    If CODING->src_object is a buffer, it must be the current buffer.
7728    In this case, if CODING->src_pos is positive, it is a position of
7729    the source text in the buffer, otherwise. the source text is in the
7730    gap area of the buffer, and coding->src_pos specifies the offset of
7731    the text from GPT (which must be the same as PT).  If this is the
7732    same buffer as CODING->dst_object, CODING->src_pos must be
7733    negative and CODING should not have `pre-write-conversion'.
7734
7735    If CODING->src_object is a string, CODING should not have
7736    `pre-write-conversion'.
7737
7738    If CODING->dst_object is a buffer, the encoded data is inserted at
7739    the current point of that buffer.
7740
7741    If CODING->dst_object is nil, the encoded data is placed at the
7742    memory area specified by CODING->destination.  */
7743
7744 static void
7745 encode_coding (struct coding_system *coding)
7746 {
7747   Lisp_Object attrs;
7748   Lisp_Object translation_table;
7749   int max_lookup;
7750   struct ccl_spec cclspec;
7751
7752   USE_SAFE_ALLOCA;
7753
7754   attrs = CODING_ID_ATTRS (coding->id);
7755   if (coding->encoder == encode_coding_raw_text)
7756     translation_table = Qnil, max_lookup = 0;
7757   else
7758     translation_table = get_translation_table (attrs, 1, &max_lookup);
7759
7760   if (BUFFERP (coding->dst_object))
7761     {
7762       set_buffer_internal (XBUFFER (coding->dst_object));
7763       coding->dst_multibyte
7764         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7765     }
7766
7767   coding->consumed = coding->consumed_char = 0;
7768   coding->produced = coding->produced_char = 0;
7769   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7770   coding->errors = 0;
7771
7772   ALLOC_CONVERSION_WORK_AREA (coding);
7773
7774   if (coding->encoder == encode_coding_ccl)
7775     {
7776       coding->spec.ccl = &cclspec;
7777       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7778     }
7779   do {
7780     coding_set_source (coding);
7781     consume_chars (coding, translation_table, max_lookup);
7782     coding_set_destination (coding);
7783     (*(coding->encoder)) (coding);
7784   } while (coding->consumed_char < coding->src_chars);
7785
7786   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7787     insert_from_gap (coding->produced_char, coding->produced, 0);
7788
7789   SAFE_FREE ();
7790 }
7791
7792
7793 /* Name (or base name) of work buffer for code conversion.  */
7794 static Lisp_Object Vcode_conversion_workbuf_name;
7795
7796 /* A working buffer used by the top level conversion.  Once it is
7797    created, it is never destroyed.  It has the name
7798    Vcode_conversion_workbuf_name.  The other working buffers are
7799    destroyed after the use is finished, and their names are modified
7800    versions of Vcode_conversion_workbuf_name.  */
7801 static Lisp_Object Vcode_conversion_reused_workbuf;
7802
7803 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7804 static bool reused_workbuf_in_use;
7805
7806
7807 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7808    multibyteness of returning buffer.  */
7809
7810 static Lisp_Object
7811 make_conversion_work_buffer (bool multibyte)
7812 {
7813   Lisp_Object name, workbuf;
7814   struct buffer *current;
7815
7816   if (reused_workbuf_in_use)
7817     {
7818       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7819       workbuf = Fget_buffer_create (name);
7820     }
7821   else
7822     {
7823       reused_workbuf_in_use = 1;
7824       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7825         Vcode_conversion_reused_workbuf
7826           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7827       workbuf = Vcode_conversion_reused_workbuf;
7828     }
7829   current = current_buffer;
7830   set_buffer_internal (XBUFFER (workbuf));
7831   /* We can't allow modification hooks to run in the work buffer.  For
7832      instance, directory_files_internal assumes that file decoding
7833      doesn't compile new regexps.  */
7834   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7835   Ferase_buffer ();
7836   bset_undo_list (current_buffer, Qt);
7837   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7838   set_buffer_internal (current);
7839   return workbuf;
7840 }
7841
7842
7843 static void
7844 code_conversion_restore (Lisp_Object arg)
7845 {
7846   Lisp_Object current, workbuf;
7847   struct gcpro gcpro1;
7848
7849   GCPRO1 (arg);
7850   current = XCAR (arg);
7851   workbuf = XCDR (arg);
7852   if (! NILP (workbuf))
7853     {
7854       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7855         reused_workbuf_in_use = 0;
7856       else
7857         Fkill_buffer (workbuf);
7858     }
7859   set_buffer_internal (XBUFFER (current));
7860   UNGCPRO;
7861 }
7862
7863 Lisp_Object
7864 code_conversion_save (bool with_work_buf, bool multibyte)
7865 {
7866   Lisp_Object workbuf = Qnil;
7867
7868   if (with_work_buf)
7869     workbuf = make_conversion_work_buffer (multibyte);
7870   record_unwind_protect (code_conversion_restore,
7871                          Fcons (Fcurrent_buffer (), workbuf));
7872   return workbuf;
7873 }
7874
7875 void
7876 decode_coding_gap (struct coding_system *coding,
7877                    ptrdiff_t chars, ptrdiff_t bytes)
7878 {
7879   ptrdiff_t count = SPECPDL_INDEX ();
7880   Lisp_Object attrs;
7881
7882   coding->src_object = Fcurrent_buffer ();
7883   coding->src_chars = chars;
7884   coding->src_bytes = bytes;
7885   coding->src_pos = -chars;
7886   coding->src_pos_byte = -bytes;
7887   coding->src_multibyte = chars < bytes;
7888   coding->dst_object = coding->src_object;
7889   coding->dst_pos = PT;
7890   coding->dst_pos_byte = PT_BYTE;
7891   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7892
7893   coding->head_ascii = -1;
7894   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7895   coding->eol_seen = EOL_SEEN_NONE;
7896   if (CODING_REQUIRE_DETECTION (coding))
7897     detect_coding (coding);
7898   attrs = CODING_ID_ATTRS (coding->id);
7899   if (! disable_ascii_optimization
7900       && ! coding->src_multibyte
7901       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7902       && NILP (CODING_ATTR_POST_READ (attrs))
7903       && NILP (get_translation_table (attrs, 0, NULL)))
7904     {
7905       chars = coding->head_ascii;
7906       if (chars < 0)
7907         chars = check_ascii (coding);
7908       if (chars != bytes)
7909         {
7910           /* There exists a non-ASCII byte.  */
7911           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7912               && coding->detected_utf8_bytes == coding->src_bytes)
7913             {
7914               if (coding->detected_utf8_chars >= 0)
7915                 chars = coding->detected_utf8_chars;
7916               else
7917                 chars = check_utf_8 (coding);
7918               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7919                   && coding->head_ascii == 0
7920                   && coding->source[0] == UTF_8_BOM_1
7921                   && coding->source[1] == UTF_8_BOM_2
7922                   && coding->source[2] == UTF_8_BOM_3)
7923                 {
7924                   chars--;
7925                   bytes -= 3;
7926                   coding->src_bytes -= 3;
7927                 }
7928             }
7929           else
7930             chars = -1;
7931         }
7932       if (chars >= 0)
7933         {
7934           Lisp_Object eol_type;
7935
7936           eol_type = CODING_ID_EOL_TYPE (coding->id);
7937           if (VECTORP (eol_type))
7938             {
7939               if (coding->eol_seen != EOL_SEEN_NONE)
7940                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7941             }
7942           if (EQ (eol_type, Qmac))
7943             {
7944               unsigned char *src_end = GAP_END_ADDR;
7945               unsigned char *src = src_end - coding->src_bytes;
7946
7947               while (src < src_end)
7948                 {
7949                   if (*src++ == '\r')
7950                     src[-1] = '\n';
7951                 }
7952             }
7953           else if (EQ (eol_type, Qdos))
7954             {
7955               unsigned char *src = GAP_END_ADDR;
7956               unsigned char *src_beg = src - coding->src_bytes;
7957               unsigned char *dst = src;
7958               ptrdiff_t diff;
7959
7960               while (src_beg < src)
7961                 {
7962                   *--dst = *--src;
7963                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7964                     src--;
7965                 }
7966               diff = dst - src;
7967               bytes -= diff;
7968               chars -= diff;
7969             }
7970           coding->produced = bytes;
7971           coding->produced_char = chars;
7972           insert_from_gap (chars, bytes, 1);
7973           return;
7974         }
7975     }
7976   code_conversion_save (0, 0);
7977
7978   coding->mode |= CODING_MODE_LAST_BLOCK;
7979   current_buffer->text->inhibit_shrinking = 1;
7980   decode_coding (coding);
7981   current_buffer->text->inhibit_shrinking = 0;
7982
7983   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7984     {
7985       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7986       Lisp_Object val;
7987
7988       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7989       val = call1 (CODING_ATTR_POST_READ (attrs),
7990                    make_number (coding->produced_char));
7991       CHECK_NATNUM (val);
7992       coding->produced_char += Z - prev_Z;
7993       coding->produced += Z_BYTE - prev_Z_BYTE;
7994     }
7995
7996   unbind_to (count, Qnil);
7997 }
7998
7999
8000 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8001    SRC_OBJECT into DST_OBJECT by coding context CODING.
8002
8003    SRC_OBJECT is a buffer, a string, or Qnil.
8004
8005    If it is a buffer, the text is at point of the buffer.  FROM and TO
8006    are positions in the buffer.
8007
8008    If it is a string, the text is at the beginning of the string.
8009    FROM and TO are indices to the string.
8010
8011    If it is nil, the text is at coding->source.  FROM and TO are
8012    indices to coding->source.
8013
8014    DST_OBJECT is a buffer, Qt, or Qnil.
8015
8016    If it is a buffer, the decoded text is inserted at point of the
8017    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8018    is deleted.
8019
8020    If it is Qt, a string is made from the decoded text, and
8021    set in CODING->dst_object.
8022
8023    If it is Qnil, the decoded text is stored at CODING->destination.
8024    The caller must allocate CODING->dst_bytes bytes at
8025    CODING->destination by xmalloc.  If the decoded text is longer than
8026    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8027  */
8028
8029 void
8030 decode_coding_object (struct coding_system *coding,
8031                       Lisp_Object src_object,
8032                       ptrdiff_t from, ptrdiff_t from_byte,
8033                       ptrdiff_t to, ptrdiff_t to_byte,
8034                       Lisp_Object dst_object)
8035 {
8036   ptrdiff_t count = SPECPDL_INDEX ();
8037   unsigned char *destination IF_LINT (= NULL);
8038   ptrdiff_t dst_bytes IF_LINT (= 0);
8039   ptrdiff_t chars = to - from;
8040   ptrdiff_t bytes = to_byte - from_byte;
8041   Lisp_Object attrs;
8042   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8043   bool need_marker_adjustment = 0;
8044   Lisp_Object old_deactivate_mark;
8045
8046   old_deactivate_mark = Vdeactivate_mark;
8047
8048   if (NILP (dst_object))
8049     {
8050       destination = coding->destination;
8051       dst_bytes = coding->dst_bytes;
8052     }
8053
8054   coding->src_object = src_object;
8055   coding->src_chars = chars;
8056   coding->src_bytes = bytes;
8057   coding->src_multibyte = chars < bytes;
8058
8059   if (STRINGP (src_object))
8060     {
8061       coding->src_pos = from;
8062       coding->src_pos_byte = from_byte;
8063     }
8064   else if (BUFFERP (src_object))
8065     {
8066       set_buffer_internal (XBUFFER (src_object));
8067       if (from != GPT)
8068         move_gap_both (from, from_byte);
8069       if (EQ (src_object, dst_object))
8070         {
8071           struct Lisp_Marker *tail;
8072
8073           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8074             {
8075               tail->need_adjustment
8076                 = tail->charpos == (tail->insertion_type ? from : to);
8077               need_marker_adjustment |= tail->need_adjustment;
8078             }
8079           saved_pt = PT, saved_pt_byte = PT_BYTE;
8080           TEMP_SET_PT_BOTH (from, from_byte);
8081           current_buffer->text->inhibit_shrinking = 1;
8082           del_range_both (from, from_byte, to, to_byte, 1);
8083           coding->src_pos = -chars;
8084           coding->src_pos_byte = -bytes;
8085         }
8086       else
8087         {
8088           coding->src_pos = from;
8089           coding->src_pos_byte = from_byte;
8090         }
8091     }
8092
8093   if (CODING_REQUIRE_DETECTION (coding))
8094     detect_coding (coding);
8095   attrs = CODING_ID_ATTRS (coding->id);
8096
8097   if (EQ (dst_object, Qt)
8098       || (! NILP (CODING_ATTR_POST_READ (attrs))
8099           && NILP (dst_object)))
8100     {
8101       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8102       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8103       coding->dst_pos = BEG;
8104       coding->dst_pos_byte = BEG_BYTE;
8105     }
8106   else if (BUFFERP (dst_object))
8107     {
8108       code_conversion_save (0, 0);
8109       coding->dst_object = dst_object;
8110       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8111       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8112       coding->dst_multibyte
8113         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8114     }
8115   else
8116     {
8117       code_conversion_save (0, 0);
8118       coding->dst_object = Qnil;
8119       /* Most callers presume this will return a multibyte result, and they
8120          won't use `binary' or `raw-text' anyway, so let's not worry about
8121          CODING_FOR_UNIBYTE.  */
8122       coding->dst_multibyte = 1;
8123     }
8124
8125   decode_coding (coding);
8126
8127   if (BUFFERP (coding->dst_object))
8128     set_buffer_internal (XBUFFER (coding->dst_object));
8129
8130   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8131     {
8132       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8133       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8134       Lisp_Object val;
8135
8136       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8137       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8138               old_deactivate_mark);
8139       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8140                         make_number (coding->produced_char));
8141       UNGCPRO;
8142       CHECK_NATNUM (val);
8143       coding->produced_char += Z - prev_Z;
8144       coding->produced += Z_BYTE - prev_Z_BYTE;
8145     }
8146
8147   if (EQ (dst_object, Qt))
8148     {
8149       coding->dst_object = Fbuffer_string ();
8150     }
8151   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8152     {
8153       set_buffer_internal (XBUFFER (coding->dst_object));
8154       if (dst_bytes < coding->produced)
8155         {
8156           eassert (coding->produced > 0);
8157           destination = xrealloc (destination, coding->produced);
8158           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8159             move_gap_both (BEGV, BEGV_BYTE);
8160           memcpy (destination, BEGV_ADDR, coding->produced);
8161           coding->destination = destination;
8162         }
8163     }
8164
8165   if (saved_pt >= 0)
8166     {
8167       /* This is the case of:
8168          (BUFFERP (src_object) && EQ (src_object, dst_object))
8169          As we have moved PT while replacing the original buffer
8170          contents, we must recover it now.  */
8171       set_buffer_internal (XBUFFER (src_object));
8172       current_buffer->text->inhibit_shrinking = 0;
8173       if (saved_pt < from)
8174         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8175       else if (saved_pt < from + chars)
8176         TEMP_SET_PT_BOTH (from, from_byte);
8177       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8178         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8179                           saved_pt_byte + (coding->produced - bytes));
8180       else
8181         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8182                           saved_pt_byte + (coding->produced - bytes));
8183
8184       if (need_marker_adjustment)
8185         {
8186           struct Lisp_Marker *tail;
8187
8188           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8189             if (tail->need_adjustment)
8190               {
8191                 tail->need_adjustment = 0;
8192                 if (tail->insertion_type)
8193                   {
8194                     tail->bytepos = from_byte;
8195                     tail->charpos = from;
8196                   }
8197                 else
8198                   {
8199                     tail->bytepos = from_byte + coding->produced;
8200                     tail->charpos
8201                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8202                          ? tail->bytepos : from + coding->produced_char);
8203                   }
8204               }
8205         }
8206     }
8207
8208   Vdeactivate_mark = old_deactivate_mark;
8209   unbind_to (count, coding->dst_object);
8210 }
8211
8212
8213 void
8214 encode_coding_object (struct coding_system *coding,
8215                       Lisp_Object src_object,
8216                       ptrdiff_t from, ptrdiff_t from_byte,
8217                       ptrdiff_t to, ptrdiff_t to_byte,
8218                       Lisp_Object dst_object)
8219 {
8220   ptrdiff_t count = SPECPDL_INDEX ();
8221   ptrdiff_t chars = to - from;
8222   ptrdiff_t bytes = to_byte - from_byte;
8223   Lisp_Object attrs;
8224   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8225   bool need_marker_adjustment = 0;
8226   bool kill_src_buffer = 0;
8227   Lisp_Object old_deactivate_mark;
8228
8229   old_deactivate_mark = Vdeactivate_mark;
8230
8231   coding->src_object = src_object;
8232   coding->src_chars = chars;
8233   coding->src_bytes = bytes;
8234   coding->src_multibyte = chars < bytes;
8235
8236   attrs = CODING_ID_ATTRS (coding->id);
8237
8238   if (EQ (src_object, dst_object))
8239     {
8240       struct Lisp_Marker *tail;
8241
8242       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8243         {
8244           tail->need_adjustment
8245             = tail->charpos == (tail->insertion_type ? from : to);
8246           need_marker_adjustment |= tail->need_adjustment;
8247         }
8248     }
8249
8250   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8251     {
8252       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8253       set_buffer_internal (XBUFFER (coding->src_object));
8254       if (STRINGP (src_object))
8255         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8256       else if (BUFFERP (src_object))
8257         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8258       else
8259         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8260
8261       if (EQ (src_object, dst_object))
8262         {
8263           set_buffer_internal (XBUFFER (src_object));
8264           saved_pt = PT, saved_pt_byte = PT_BYTE;
8265           del_range_both (from, from_byte, to, to_byte, 1);
8266           set_buffer_internal (XBUFFER (coding->src_object));
8267         }
8268
8269       {
8270         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8271
8272         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8273                 old_deactivate_mark);
8274         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8275                     make_number (BEG), make_number (Z));
8276         UNGCPRO;
8277       }
8278       if (XBUFFER (coding->src_object) != current_buffer)
8279         kill_src_buffer = 1;
8280       coding->src_object = Fcurrent_buffer ();
8281       if (BEG != GPT)
8282         move_gap_both (BEG, BEG_BYTE);
8283       coding->src_chars = Z - BEG;
8284       coding->src_bytes = Z_BYTE - BEG_BYTE;
8285       coding->src_pos = BEG;
8286       coding->src_pos_byte = BEG_BYTE;
8287       coding->src_multibyte = Z < Z_BYTE;
8288     }
8289   else if (STRINGP (src_object))
8290     {
8291       code_conversion_save (0, 0);
8292       coding->src_pos = from;
8293       coding->src_pos_byte = from_byte;
8294     }
8295   else if (BUFFERP (src_object))
8296     {
8297       code_conversion_save (0, 0);
8298       set_buffer_internal (XBUFFER (src_object));
8299       if (EQ (src_object, dst_object))
8300         {
8301           saved_pt = PT, saved_pt_byte = PT_BYTE;
8302           coding->src_object = del_range_1 (from, to, 1, 1);
8303           coding->src_pos = 0;
8304           coding->src_pos_byte = 0;
8305         }
8306       else
8307         {
8308           if (from < GPT && to >= GPT)
8309             move_gap_both (from, from_byte);
8310           coding->src_pos = from;
8311           coding->src_pos_byte = from_byte;
8312         }
8313     }
8314   else
8315     code_conversion_save (0, 0);
8316
8317   if (BUFFERP (dst_object))
8318     {
8319       coding->dst_object = dst_object;
8320       if (EQ (src_object, dst_object))
8321         {
8322           coding->dst_pos = from;
8323           coding->dst_pos_byte = from_byte;
8324         }
8325       else
8326         {
8327           struct buffer *current = current_buffer;
8328
8329           set_buffer_temp (XBUFFER (dst_object));
8330           coding->dst_pos = PT;
8331           coding->dst_pos_byte = PT_BYTE;
8332           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8333           set_buffer_temp (current);
8334         }
8335       coding->dst_multibyte
8336         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8337     }
8338   else if (EQ (dst_object, Qt))
8339     {
8340       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8341       coding->dst_object = Qnil;
8342       coding->destination = xmalloc (dst_bytes);
8343       coding->dst_bytes = dst_bytes;
8344       coding->dst_multibyte = 0;
8345     }
8346   else
8347     {
8348       coding->dst_object = Qnil;
8349       coding->dst_multibyte = 0;
8350     }
8351
8352   encode_coding (coding);
8353
8354   if (EQ (dst_object, Qt))
8355     {
8356       if (BUFFERP (coding->dst_object))
8357         coding->dst_object = Fbuffer_string ();
8358       else if (coding->raw_destination)
8359         /* This is used to avoid creating huge Lisp string.
8360            NOTE: caller who sets `raw_destination' is also
8361            responsible for freeing `destination' buffer.  */
8362         coding->dst_object = Qnil;
8363       else
8364         {
8365           coding->dst_object
8366             = make_unibyte_string ((char *) coding->destination,
8367                                    coding->produced);
8368           xfree (coding->destination);
8369         }
8370     }
8371
8372   if (saved_pt >= 0)
8373     {
8374       /* This is the case of:
8375          (BUFFERP (src_object) && EQ (src_object, dst_object))
8376          As we have moved PT while replacing the original buffer
8377          contents, we must recover it now.  */
8378       set_buffer_internal (XBUFFER (src_object));
8379       if (saved_pt < from)
8380         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8381       else if (saved_pt < from + chars)
8382         TEMP_SET_PT_BOTH (from, from_byte);
8383       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8384         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8385                           saved_pt_byte + (coding->produced - bytes));
8386       else
8387         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8388                           saved_pt_byte + (coding->produced - bytes));
8389
8390       if (need_marker_adjustment)
8391         {
8392           struct Lisp_Marker *tail;
8393
8394           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8395             if (tail->need_adjustment)
8396               {
8397                 tail->need_adjustment = 0;
8398                 if (tail->insertion_type)
8399                   {
8400                     tail->bytepos = from_byte;
8401                     tail->charpos = from;
8402                   }
8403                 else
8404                   {
8405                     tail->bytepos = from_byte + coding->produced;
8406                     tail->charpos
8407                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8408                          ? tail->bytepos : from + coding->produced_char);
8409                   }
8410               }
8411         }
8412     }
8413
8414   if (kill_src_buffer)
8415     Fkill_buffer (coding->src_object);
8416
8417   Vdeactivate_mark = old_deactivate_mark;
8418   unbind_to (count, Qnil);
8419 }
8420
8421
8422 Lisp_Object
8423 preferred_coding_system (void)
8424 {
8425   int id = coding_categories[coding_priorities[0]].id;
8426
8427   return CODING_ID_NAME (id);
8428 }
8429
8430 #if defined (WINDOWSNT) || defined (CYGWIN)
8431
8432 Lisp_Object
8433 from_unicode (Lisp_Object str)
8434 {
8435   CHECK_STRING (str);
8436   if (!STRING_MULTIBYTE (str) &&
8437       SBYTES (str) & 1)
8438     {
8439       str = Fsubstring (str, make_number (0), make_number (-1));
8440     }
8441
8442   return code_convert_string_norecord (str, Qutf_16le, 0);
8443 }
8444
8445 Lisp_Object
8446 from_unicode_buffer (const wchar_t* wstr)
8447 {
8448     return from_unicode (
8449         make_unibyte_string (
8450             (char*) wstr,
8451             /* we get one of the two final 0 bytes for free. */
8452             1 + sizeof (wchar_t) * wcslen (wstr)));
8453 }
8454
8455 wchar_t *
8456 to_unicode (Lisp_Object str, Lisp_Object *buf)
8457 {
8458   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8459   /* We need to make another copy (in addition to the one made by
8460      code_convert_string_norecord) to ensure that the final string is
8461      _doubly_ zero terminated --- that is, that the string is
8462      terminated by two zero bytes and one utf-16le null character.
8463      Because strings are already terminated with a single zero byte,
8464      we just add one additional zero. */
8465   str = make_uninit_string (SBYTES (*buf) + 1);
8466   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8467   SDATA (str) [SBYTES (*buf)] = '\0';
8468   *buf = str;
8469   return WCSDATA (*buf);
8470 }
8471
8472 #endif /* WINDOWSNT || CYGWIN */
8473
8474 \f
8475 #ifdef emacs
8476 /*** 8. Emacs Lisp library functions ***/
8477
8478 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8479        doc: /* Return t if OBJECT is nil or a coding-system.
8480 See the documentation of `define-coding-system' for information
8481 about coding-system objects.  */)
8482   (Lisp_Object object)
8483 {
8484   if (NILP (object)
8485       || CODING_SYSTEM_ID (object) >= 0)
8486     return Qt;
8487   if (! SYMBOLP (object)
8488       || NILP (Fget (object, Qcoding_system_define_form)))
8489     return Qnil;
8490   return Qt;
8491 }
8492
8493 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8494        Sread_non_nil_coding_system, 1, 1, 0,
8495        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8496   (Lisp_Object prompt)
8497 {
8498   Lisp_Object val;
8499   do
8500     {
8501       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8502                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8503     }
8504   while (SCHARS (val) == 0);
8505   return (Fintern (val, Qnil));
8506 }
8507
8508 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8509        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8510 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8511 Ignores case when completing coding systems (all Emacs coding systems
8512 are lower-case).  */)
8513   (Lisp_Object prompt, Lisp_Object default_coding_system)
8514 {
8515   Lisp_Object val;
8516   ptrdiff_t count = SPECPDL_INDEX ();
8517
8518   if (SYMBOLP (default_coding_system))
8519     default_coding_system = SYMBOL_NAME (default_coding_system);
8520   specbind (Qcompletion_ignore_case, Qt);
8521   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8522                           Qt, Qnil, Qcoding_system_history,
8523                           default_coding_system, Qnil);
8524   unbind_to (count, Qnil);
8525   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8526 }
8527
8528 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8529        1, 1, 0,
8530        doc: /* Check validity of CODING-SYSTEM.
8531 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8532 It is valid if it is nil or a symbol defined as a coding system by the
8533 function `define-coding-system'.  */)
8534   (Lisp_Object coding_system)
8535 {
8536   Lisp_Object define_form;
8537
8538   define_form = Fget (coding_system, Qcoding_system_define_form);
8539   if (! NILP (define_form))
8540     {
8541       Fput (coding_system, Qcoding_system_define_form, Qnil);
8542       safe_eval (define_form);
8543     }
8544   if (!NILP (Fcoding_system_p (coding_system)))
8545     return coding_system;
8546   xsignal1 (Qcoding_system_error, coding_system);
8547 }
8548
8549 \f
8550 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8551    HIGHEST, return the coding system of the highest
8552    priority among the detected coding systems.  Otherwise return a
8553    list of detected coding systems sorted by their priorities.  If
8554    MULTIBYTEP, it is assumed that the bytes are in correct
8555    multibyte form but contains only ASCII and eight-bit chars.
8556    Otherwise, the bytes are raw bytes.
8557
8558    CODING-SYSTEM controls the detection as below:
8559
8560    If it is nil, detect both text-format and eol-format.  If the
8561    text-format part of CODING-SYSTEM is already specified
8562    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8563    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8564    detect only text-format.  */
8565
8566 Lisp_Object
8567 detect_coding_system (const unsigned char *src,
8568                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8569                       bool highest, bool multibytep,
8570                       Lisp_Object coding_system)
8571 {
8572   const unsigned char *src_end = src + src_bytes;
8573   Lisp_Object attrs, eol_type;
8574   Lisp_Object val = Qnil;
8575   struct coding_system coding;
8576   ptrdiff_t id;
8577   struct coding_detection_info detect_info;
8578   enum coding_category base_category;
8579   bool null_byte_found = 0, eight_bit_found = 0;
8580
8581   if (NILP (coding_system))
8582     coding_system = Qundecided;
8583   setup_coding_system (coding_system, &coding);
8584   attrs = CODING_ID_ATTRS (coding.id);
8585   eol_type = CODING_ID_EOL_TYPE (coding.id);
8586   coding_system = CODING_ATTR_BASE_NAME (attrs);
8587
8588   coding.source = src;
8589   coding.src_chars = src_chars;
8590   coding.src_bytes = src_bytes;
8591   coding.src_multibyte = multibytep;
8592   coding.consumed = 0;
8593   coding.mode |= CODING_MODE_LAST_BLOCK;
8594   coding.head_ascii = 0;
8595
8596   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8597
8598   /* At first, detect text-format if necessary.  */
8599   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8600   if (base_category == coding_category_undecided)
8601     {
8602       enum coding_category category IF_LINT (= 0);
8603       struct coding_system *this IF_LINT (= NULL);
8604       int c, i;
8605       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8606                                        inhibit_null_byte_detection);
8607       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8608                                        inhibit_iso_escape_detection);
8609       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8610
8611       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8612       for (; src < src_end; src++)
8613         {
8614           c = *src;
8615           if (c & 0x80)
8616             {
8617               eight_bit_found = 1;
8618               if (null_byte_found)
8619                 break;
8620             }
8621           else if (c < 0x20)
8622             {
8623               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8624                   && ! inhibit_ied
8625                   && ! detect_info.checked)
8626                 {
8627                   if (detect_coding_iso_2022 (&coding, &detect_info))
8628                     {
8629                       /* We have scanned the whole data.  */
8630                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8631                         {
8632                           /* We didn't find an 8-bit code.  We may
8633                              have found a null-byte, but it's very
8634                              rare that a binary file confirm to
8635                              ISO-2022.  */
8636                           src = src_end;
8637                           coding.head_ascii = src - coding.source;
8638                         }
8639                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8640                       break;
8641                     }
8642                 }
8643               else if (! c && !inhibit_nbd)
8644                 {
8645                   null_byte_found = 1;
8646                   if (eight_bit_found)
8647                     break;
8648                 }
8649               if (! eight_bit_found)
8650                 coding.head_ascii++;
8651             }
8652           else if (! eight_bit_found)
8653             coding.head_ascii++;
8654         }
8655
8656       if (null_byte_found || eight_bit_found
8657           || coding.head_ascii < coding.src_bytes
8658           || detect_info.found)
8659         {
8660           if (coding.head_ascii == coding.src_bytes)
8661             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8662             for (i = 0; i < coding_category_raw_text; i++)
8663               {
8664                 category = coding_priorities[i];
8665                 this = coding_categories + category;
8666                 if (detect_info.found & (1 << category))
8667                   break;
8668               }
8669           else
8670             {
8671               if (null_byte_found)
8672                 {
8673                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8674                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8675                 }
8676               else if (prefer_utf_8
8677                        && detect_coding_utf_8 (&coding, &detect_info))
8678                 {
8679                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8680                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8681                 }
8682               for (i = 0; i < coding_category_raw_text; i++)
8683                 {
8684                   category = coding_priorities[i];
8685                   this = coding_categories + category;
8686
8687                   if (this->id < 0)
8688                     {
8689                       /* No coding system of this category is defined.  */
8690                       detect_info.rejected |= (1 << category);
8691                     }
8692                   else if (category >= coding_category_raw_text)
8693                     continue;
8694                   else if (detect_info.checked & (1 << category))
8695                     {
8696                       if (highest
8697                           && (detect_info.found & (1 << category)))
8698                         break;
8699                     }
8700                   else if ((*(this->detector)) (&coding, &detect_info)
8701                            && highest
8702                            && (detect_info.found & (1 << category)))
8703                     {
8704                       if (category == coding_category_utf_16_auto)
8705                         {
8706                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8707                             category = coding_category_utf_16_le;
8708                           else
8709                             category = coding_category_utf_16_be;
8710                         }
8711                       break;
8712                     }
8713                 }
8714             }
8715         }
8716
8717       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8718           || null_byte_found)
8719         {
8720           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8721           id = CODING_SYSTEM_ID (Qno_conversion);
8722           val = list1 (make_number (id));
8723         }
8724       else if (! detect_info.rejected && ! detect_info.found)
8725         {
8726           detect_info.found = CATEGORY_MASK_ANY;
8727           id = coding_categories[coding_category_undecided].id;
8728           val = list1 (make_number (id));
8729         }
8730       else if (highest)
8731         {
8732           if (detect_info.found)
8733             {
8734               detect_info.found = 1 << category;
8735               val = list1 (make_number (this->id));
8736             }
8737           else
8738             for (i = 0; i < coding_category_raw_text; i++)
8739               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8740                 {
8741                   detect_info.found = 1 << coding_priorities[i];
8742                   id = coding_categories[coding_priorities[i]].id;
8743                   val = list1 (make_number (id));
8744                   break;
8745                 }
8746         }
8747       else
8748         {
8749           int mask = detect_info.rejected | detect_info.found;
8750           int found = 0;
8751
8752           for (i = coding_category_raw_text - 1; i >= 0; i--)
8753             {
8754               category = coding_priorities[i];
8755               if (! (mask & (1 << category)))
8756                 {
8757                   found |= 1 << category;
8758                   id = coding_categories[category].id;
8759                   if (id >= 0)
8760                     val = list1 (make_number (id));
8761                 }
8762             }
8763           for (i = coding_category_raw_text - 1; i >= 0; i--)
8764             {
8765               category = coding_priorities[i];
8766               if (detect_info.found & (1 << category))
8767                 {
8768                   id = coding_categories[category].id;
8769                   val = Fcons (make_number (id), val);
8770                 }
8771             }
8772           detect_info.found |= found;
8773         }
8774     }
8775   else if (base_category == coding_category_utf_8_auto)
8776     {
8777       if (detect_coding_utf_8 (&coding, &detect_info))
8778         {
8779           struct coding_system *this;
8780
8781           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8782             this = coding_categories + coding_category_utf_8_sig;
8783           else
8784             this = coding_categories + coding_category_utf_8_nosig;
8785           val = list1 (make_number (this->id));
8786         }
8787     }
8788   else if (base_category == coding_category_utf_16_auto)
8789     {
8790       if (detect_coding_utf_16 (&coding, &detect_info))
8791         {
8792           struct coding_system *this;
8793
8794           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8795             this = coding_categories + coding_category_utf_16_le;
8796           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8797             this = coding_categories + coding_category_utf_16_be;
8798           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8799             this = coding_categories + coding_category_utf_16_be_nosig;
8800           else
8801             this = coding_categories + coding_category_utf_16_le_nosig;
8802           val = list1 (make_number (this->id));
8803         }
8804     }
8805   else
8806     {
8807       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8808       val = list1 (make_number (coding.id));
8809     }
8810
8811   /* Then, detect eol-format if necessary.  */
8812   {
8813     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8814     Lisp_Object tail;
8815
8816     if (VECTORP (eol_type))
8817       {
8818         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8819           {
8820             if (null_byte_found)
8821               normal_eol = EOL_SEEN_LF;
8822             else
8823               normal_eol = detect_eol (coding.source, src_bytes,
8824                                        coding_category_raw_text);
8825           }
8826         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8827                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8828           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8829                                       coding_category_utf_16_be);
8830         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8831                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8832           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8833                                       coding_category_utf_16_le);
8834       }
8835     else
8836       {
8837         if (EQ (eol_type, Qunix))
8838           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8839         else if (EQ (eol_type, Qdos))
8840           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8841         else
8842           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8843       }
8844
8845     for (tail = val; CONSP (tail); tail = XCDR (tail))
8846       {
8847         enum coding_category category;
8848         int this_eol;
8849
8850         id = XINT (XCAR (tail));
8851         attrs = CODING_ID_ATTRS (id);
8852         category = XINT (CODING_ATTR_CATEGORY (attrs));
8853         eol_type = CODING_ID_EOL_TYPE (id);
8854         if (VECTORP (eol_type))
8855           {
8856             if (category == coding_category_utf_16_be
8857                 || category == coding_category_utf_16_be_nosig)
8858               this_eol = utf_16_be_eol;
8859             else if (category == coding_category_utf_16_le
8860                      || category == coding_category_utf_16_le_nosig)
8861               this_eol = utf_16_le_eol;
8862             else
8863               this_eol = normal_eol;
8864
8865             if (this_eol == EOL_SEEN_LF)
8866               XSETCAR (tail, AREF (eol_type, 0));
8867             else if (this_eol == EOL_SEEN_CRLF)
8868               XSETCAR (tail, AREF (eol_type, 1));
8869             else if (this_eol == EOL_SEEN_CR)
8870               XSETCAR (tail, AREF (eol_type, 2));
8871             else
8872               XSETCAR (tail, CODING_ID_NAME (id));
8873           }
8874         else
8875           XSETCAR (tail, CODING_ID_NAME (id));
8876       }
8877   }
8878
8879   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8880 }
8881
8882
8883 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8884        2, 3, 0,
8885        doc: /* Detect coding system of the text in the region between START and END.
8886 Return a list of possible coding systems ordered by priority.
8887 The coding systems to try and their priorities follows what
8888 the function `coding-system-priority-list' (which see) returns.
8889
8890 If only ASCII characters are found (except for such ISO-2022 control
8891 characters as ESC), it returns a list of single element `undecided'
8892 or its subsidiary coding system according to a detected end-of-line
8893 format.
8894
8895 If optional argument HIGHEST is non-nil, return the coding system of
8896 highest priority.  */)
8897   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8898 {
8899   ptrdiff_t from, to;
8900   ptrdiff_t from_byte, to_byte;
8901
8902   validate_region (&start, &end);
8903   from = XINT (start), to = XINT (end);
8904   from_byte = CHAR_TO_BYTE (from);
8905   to_byte = CHAR_TO_BYTE (to);
8906
8907   if (from < GPT && to >= GPT)
8908     move_gap_both (to, to_byte);
8909
8910   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8911                                to - from, to_byte - from_byte,
8912                                !NILP (highest),
8913                                !NILP (BVAR (current_buffer
8914                                       , enable_multibyte_characters)),
8915                                Qnil);
8916 }
8917
8918 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8919        1, 2, 0,
8920        doc: /* Detect coding system of the text in STRING.
8921 Return a list of possible coding systems ordered by priority.
8922 The coding systems to try and their priorities follows what
8923 the function `coding-system-priority-list' (which see) returns.
8924
8925 If only ASCII characters are found (except for such ISO-2022 control
8926 characters as ESC), it returns a list of single element `undecided'
8927 or its subsidiary coding system according to a detected end-of-line
8928 format.
8929
8930 If optional argument HIGHEST is non-nil, return the coding system of
8931 highest priority.  */)
8932   (Lisp_Object string, Lisp_Object highest)
8933 {
8934   CHECK_STRING (string);
8935
8936   return detect_coding_system (SDATA (string),
8937                                SCHARS (string), SBYTES (string),
8938                                !NILP (highest), STRING_MULTIBYTE (string),
8939                                Qnil);
8940 }
8941
8942
8943 static bool
8944 char_encodable_p (int c, Lisp_Object attrs)
8945 {
8946   Lisp_Object tail;
8947   struct charset *charset;
8948   Lisp_Object translation_table;
8949
8950   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8951   if (! NILP (translation_table))
8952     c = translate_char (translation_table, c);
8953   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8954        CONSP (tail); tail = XCDR (tail))
8955     {
8956       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8957       if (CHAR_CHARSET_P (c, charset))
8958         break;
8959     }
8960   return (! NILP (tail));
8961 }
8962
8963
8964 /* Return a list of coding systems that safely encode the text between
8965    START and END.  If EXCLUDE is non-nil, it is a list of coding
8966    systems not to check.  The returned list doesn't contain any such
8967    coding systems.  In any case, if the text contains only ASCII or is
8968    unibyte, return t.  */
8969
8970 DEFUN ("find-coding-systems-region-internal",
8971        Ffind_coding_systems_region_internal,
8972        Sfind_coding_systems_region_internal, 2, 3, 0,
8973        doc: /* Internal use only.  */)
8974   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8975 {
8976   Lisp_Object coding_attrs_list, safe_codings;
8977   ptrdiff_t start_byte, end_byte;
8978   const unsigned char *p, *pbeg, *pend;
8979   int c;
8980   Lisp_Object tail, elt, work_table;
8981
8982   if (STRINGP (start))
8983     {
8984       if (!STRING_MULTIBYTE (start)
8985           || SCHARS (start) == SBYTES (start))
8986         return Qt;
8987       start_byte = 0;
8988       end_byte = SBYTES (start);
8989     }
8990   else
8991     {
8992       CHECK_NUMBER_COERCE_MARKER (start);
8993       CHECK_NUMBER_COERCE_MARKER (end);
8994       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8995         args_out_of_range (start, end);
8996       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8997         return Qt;
8998       start_byte = CHAR_TO_BYTE (XINT (start));
8999       end_byte = CHAR_TO_BYTE (XINT (end));
9000       if (XINT (end) - XINT (start) == end_byte - start_byte)
9001         return Qt;
9002
9003       if (XINT (start) < GPT && XINT (end) > GPT)
9004         {
9005           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9006             move_gap_both (XINT (start), start_byte);
9007           else
9008             move_gap_both (XINT (end), end_byte);
9009         }
9010     }
9011
9012   coding_attrs_list = Qnil;
9013   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9014     if (NILP (exclude)
9015         || NILP (Fmemq (XCAR (tail), exclude)))
9016       {
9017         Lisp_Object attrs;
9018
9019         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9020         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9021           {
9022             ASET (attrs, coding_attr_trans_tbl,
9023                   get_translation_table (attrs, 1, NULL));
9024             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9025           }
9026       }
9027
9028   if (STRINGP (start))
9029     p = pbeg = SDATA (start);
9030   else
9031     p = pbeg = BYTE_POS_ADDR (start_byte);
9032   pend = p + (end_byte - start_byte);
9033
9034   while (p < pend && ASCII_BYTE_P (*p)) p++;
9035   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9036
9037   work_table = Fmake_char_table (Qnil, Qnil);
9038   while (p < pend)
9039     {
9040       if (ASCII_BYTE_P (*p))
9041         p++;
9042       else
9043         {
9044           c = STRING_CHAR_ADVANCE (p);
9045           if (!NILP (char_table_ref (work_table, c)))
9046             /* This character was already checked.  Ignore it.  */
9047             continue;
9048
9049           charset_map_loaded = 0;
9050           for (tail = coding_attrs_list; CONSP (tail);)
9051             {
9052               elt = XCAR (tail);
9053               if (NILP (elt))
9054                 tail = XCDR (tail);
9055               else if (char_encodable_p (c, elt))
9056                 tail = XCDR (tail);
9057               else if (CONSP (XCDR (tail)))
9058                 {
9059                   XSETCAR (tail, XCAR (XCDR (tail)));
9060                   XSETCDR (tail, XCDR (XCDR (tail)));
9061                 }
9062               else
9063                 {
9064                   XSETCAR (tail, Qnil);
9065                   tail = XCDR (tail);
9066                 }
9067             }
9068           if (charset_map_loaded)
9069             {
9070               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9071
9072               if (STRINGP (start))
9073                 pbeg = SDATA (start);
9074               else
9075                 pbeg = BYTE_POS_ADDR (start_byte);
9076               p = pbeg + p_offset;
9077               pend = pbeg + pend_offset;
9078             }
9079           char_table_set (work_table, c, Qt);
9080         }
9081     }
9082
9083   safe_codings = list2 (Qraw_text, Qno_conversion);
9084   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9085     if (! NILP (XCAR (tail)))
9086       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9087
9088   return safe_codings;
9089 }
9090
9091
9092 DEFUN ("unencodable-char-position", Funencodable_char_position,
9093        Sunencodable_char_position, 3, 5, 0,
9094        doc: /*
9095 Return position of first un-encodable character in a region.
9096 START and END specify the region and CODING-SYSTEM specifies the
9097 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9098
9099 If optional 4th argument COUNT is non-nil, it specifies at most how
9100 many un-encodable characters to search.  In this case, the value is a
9101 list of positions.
9102
9103 If optional 5th argument STRING is non-nil, it is a string to search
9104 for un-encodable characters.  In that case, START and END are indexes
9105 to the string.  */)
9106   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9107 {
9108   EMACS_INT n;
9109   struct coding_system coding;
9110   Lisp_Object attrs, charset_list, translation_table;
9111   Lisp_Object positions;
9112   ptrdiff_t from, to;
9113   const unsigned char *p, *stop, *pend;
9114   bool ascii_compatible;
9115
9116   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9117   attrs = CODING_ID_ATTRS (coding.id);
9118   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9119     return Qnil;
9120   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9121   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9122   translation_table = get_translation_table (attrs, 1, NULL);
9123
9124   if (NILP (string))
9125     {
9126       validate_region (&start, &end);
9127       from = XINT (start);
9128       to = XINT (end);
9129       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9130           || (ascii_compatible
9131               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9132         return Qnil;
9133       p = CHAR_POS_ADDR (from);
9134       pend = CHAR_POS_ADDR (to);
9135       if (from < GPT && to >= GPT)
9136         stop = GPT_ADDR;
9137       else
9138         stop = pend;
9139     }
9140   else
9141     {
9142       CHECK_STRING (string);
9143       CHECK_NATNUM (start);
9144       CHECK_NATNUM (end);
9145       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9146         args_out_of_range_3 (string, start, end);
9147       from = XINT (start);
9148       to = XINT (end);
9149       if (! STRING_MULTIBYTE (string))
9150         return Qnil;
9151       p = SDATA (string) + string_char_to_byte (string, from);
9152       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9153       if (ascii_compatible && (to - from) == (pend - p))
9154         return Qnil;
9155     }
9156
9157   if (NILP (count))
9158     n = 1;
9159   else
9160     {
9161       CHECK_NATNUM (count);
9162       n = XINT (count);
9163     }
9164
9165   positions = Qnil;
9166   charset_map_loaded = 0;
9167   while (1)
9168     {
9169       int c;
9170
9171       if (ascii_compatible)
9172         while (p < stop && ASCII_BYTE_P (*p))
9173           p++, from++;
9174       if (p >= stop)
9175         {
9176           if (p >= pend)
9177             break;
9178           stop = pend;
9179           p = GAP_END_ADDR;
9180         }
9181
9182       c = STRING_CHAR_ADVANCE (p);
9183       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9184           && ! char_charset (translate_char (translation_table, c),
9185                              charset_list, NULL))
9186         {
9187           positions = Fcons (make_number (from), positions);
9188           n--;
9189           if (n == 0)
9190             break;
9191         }
9192
9193       from++;
9194       if (charset_map_loaded && NILP (string))
9195         {
9196           p = CHAR_POS_ADDR (from);
9197           pend = CHAR_POS_ADDR (to);
9198           if (from < GPT && to >= GPT)
9199             stop = GPT_ADDR;
9200           else
9201             stop = pend;
9202           charset_map_loaded = 0;
9203         }
9204     }
9205
9206   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9207 }
9208
9209
9210 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9211        Scheck_coding_systems_region, 3, 3, 0,
9212        doc: /* Check if the region is encodable by coding systems.
9213
9214 START and END are buffer positions specifying the region.
9215 CODING-SYSTEM-LIST is a list of coding systems to check.
9216
9217 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9218 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9219 whole region, POS0, POS1, ... are buffer positions where non-encodable
9220 characters are found.
9221
9222 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9223 value is nil.
9224
9225 START may be a string.  In that case, check if the string is
9226 encodable, and the value contains indices to the string instead of
9227 buffer positions.  END is ignored.
9228
9229 If the current buffer (or START if it is a string) is unibyte, the value
9230 is nil.  */)
9231   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9232 {
9233   Lisp_Object list;
9234   ptrdiff_t start_byte, end_byte;
9235   ptrdiff_t pos;
9236   const unsigned char *p, *pbeg, *pend;
9237   int c;
9238   Lisp_Object tail, elt, attrs;
9239
9240   if (STRINGP (start))
9241     {
9242       if (!STRING_MULTIBYTE (start)
9243           || SCHARS (start) == SBYTES (start))
9244         return Qnil;
9245       start_byte = 0;
9246       end_byte = SBYTES (start);
9247       pos = 0;
9248     }
9249   else
9250     {
9251       CHECK_NUMBER_COERCE_MARKER (start);
9252       CHECK_NUMBER_COERCE_MARKER (end);
9253       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9254         args_out_of_range (start, end);
9255       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9256         return Qnil;
9257       start_byte = CHAR_TO_BYTE (XINT (start));
9258       end_byte = CHAR_TO_BYTE (XINT (end));
9259       if (XINT (end) - XINT (start) == end_byte - start_byte)
9260         return Qnil;
9261
9262       if (XINT (start) < GPT && XINT (end) > GPT)
9263         {
9264           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9265             move_gap_both (XINT (start), start_byte);
9266           else
9267             move_gap_both (XINT (end), end_byte);
9268         }
9269       pos = XINT (start);
9270     }
9271
9272   list = Qnil;
9273   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9274     {
9275       elt = XCAR (tail);
9276       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9277       ASET (attrs, coding_attr_trans_tbl,
9278             get_translation_table (attrs, 1, NULL));
9279       list = Fcons (list2 (elt, attrs), list);
9280     }
9281
9282   if (STRINGP (start))
9283     p = pbeg = SDATA (start);
9284   else
9285     p = pbeg = BYTE_POS_ADDR (start_byte);
9286   pend = p + (end_byte - start_byte);
9287
9288   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9289   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9290
9291   while (p < pend)
9292     {
9293       if (ASCII_BYTE_P (*p))
9294         p++;
9295       else
9296         {
9297           c = STRING_CHAR_ADVANCE (p);
9298
9299           charset_map_loaded = 0;
9300           for (tail = list; CONSP (tail); tail = XCDR (tail))
9301             {
9302               elt = XCDR (XCAR (tail));
9303               if (! char_encodable_p (c, XCAR (elt)))
9304                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9305             }
9306           if (charset_map_loaded)
9307             {
9308               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9309
9310               if (STRINGP (start))
9311                 pbeg = SDATA (start);
9312               else
9313                 pbeg = BYTE_POS_ADDR (start_byte);
9314               p = pbeg + p_offset;
9315               pend = pbeg + pend_offset;
9316             }
9317         }
9318       pos++;
9319     }
9320
9321   tail = list;
9322   list = Qnil;
9323   for (; CONSP (tail); tail = XCDR (tail))
9324     {
9325       elt = XCAR (tail);
9326       if (CONSP (XCDR (XCDR (elt))))
9327         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9328                       list);
9329     }
9330
9331   return list;
9332 }
9333
9334
9335 static Lisp_Object
9336 code_convert_region (Lisp_Object start, Lisp_Object end,
9337                      Lisp_Object coding_system, Lisp_Object dst_object,
9338                      bool encodep, bool norecord)
9339 {
9340   struct coding_system coding;
9341   ptrdiff_t from, from_byte, to, to_byte;
9342   Lisp_Object src_object;
9343
9344   if (NILP (coding_system))
9345     coding_system = Qno_conversion;
9346   else
9347     CHECK_CODING_SYSTEM (coding_system);
9348   src_object = Fcurrent_buffer ();
9349   if (NILP (dst_object))
9350     dst_object = src_object;
9351   else if (! EQ (dst_object, Qt))
9352     CHECK_BUFFER (dst_object);
9353
9354   validate_region (&start, &end);
9355   from = XFASTINT (start);
9356   from_byte = CHAR_TO_BYTE (from);
9357   to = XFASTINT (end);
9358   to_byte = CHAR_TO_BYTE (to);
9359
9360   setup_coding_system (coding_system, &coding);
9361   coding.mode |= CODING_MODE_LAST_BLOCK;
9362
9363   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9364     {
9365       struct buffer *buf = XBUFFER (dst_object);
9366       ptrdiff_t buf_pt = BUF_PT (buf);
9367
9368       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9369     }
9370
9371   if (encodep)
9372     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9373                           dst_object);
9374   else
9375     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9376                           dst_object);
9377   if (! norecord)
9378     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9379
9380   return (BUFFERP (dst_object)
9381           ? make_number (coding.produced_char)
9382           : coding.dst_object);
9383 }
9384
9385
9386 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9387        3, 4, "r\nzCoding system: ",
9388        doc: /* Decode the current region from the specified coding system.
9389 When called from a program, takes four arguments:
9390         START, END, CODING-SYSTEM, and DESTINATION.
9391 START and END are buffer positions.
9392
9393 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9394 If nil, the region between START and END is replaced by the decoded text.
9395 If buffer, the decoded text is inserted in that buffer after point (point
9396 does not move).
9397 In those cases, the length of the decoded text is returned.
9398 If DESTINATION is t, the decoded text is returned.
9399
9400 This function sets `last-coding-system-used' to the precise coding system
9401 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9402 not fully specified.)  */)
9403   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9404 {
9405   return code_convert_region (start, end, coding_system, destination, 0, 0);
9406 }
9407
9408 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9409        3, 4, "r\nzCoding system: ",
9410        doc: /* Encode the current region by specified coding system.
9411 When called from a program, takes four arguments:
9412         START, END, CODING-SYSTEM and DESTINATION.
9413 START and END are buffer positions.
9414
9415 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9416 If nil, the region between START and END is replace by the encoded text.
9417 If buffer, the encoded text is inserted in that buffer after point (point
9418 does not move).
9419 In those cases, the length of the encoded text is returned.
9420 If DESTINATION is t, the encoded text is returned.
9421
9422 This function sets `last-coding-system-used' to the precise coding system
9423 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9424 not fully specified.)  */)
9425   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9426 {
9427   return code_convert_region (start, end, coding_system, destination, 1, 0);
9428 }
9429
9430 Lisp_Object
9431 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9432                      Lisp_Object dst_object, bool encodep, bool nocopy,
9433                      bool norecord)
9434 {
9435   struct coding_system coding;
9436   ptrdiff_t chars, bytes;
9437
9438   CHECK_STRING (string);
9439   if (NILP (coding_system))
9440     {
9441       if (! norecord)
9442         Vlast_coding_system_used = Qno_conversion;
9443       if (NILP (dst_object))
9444         return (nocopy ? Fcopy_sequence (string) : string);
9445     }
9446
9447   if (NILP (coding_system))
9448     coding_system = Qno_conversion;
9449   else
9450     CHECK_CODING_SYSTEM (coding_system);
9451   if (NILP (dst_object))
9452     dst_object = Qt;
9453   else if (! EQ (dst_object, Qt))
9454     CHECK_BUFFER (dst_object);
9455
9456   setup_coding_system (coding_system, &coding);
9457   coding.mode |= CODING_MODE_LAST_BLOCK;
9458   chars = SCHARS (string);
9459   bytes = SBYTES (string);
9460
9461   if (BUFFERP (dst_object))
9462     {
9463       struct buffer *buf = XBUFFER (dst_object);
9464       ptrdiff_t buf_pt = BUF_PT (buf);
9465
9466       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9467     }
9468
9469   if (encodep)
9470     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9471   else
9472     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9473   if (! norecord)
9474     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9475
9476   return (BUFFERP (dst_object)
9477           ? make_number (coding.produced_char)
9478           : coding.dst_object);
9479 }
9480
9481
9482 /* Encode or decode STRING according to CODING_SYSTEM.
9483    Do not set Vlast_coding_system_used.
9484
9485    This function is called only from macros DECODE_FILE and
9486    ENCODE_FILE, thus we ignore character composition.  */
9487
9488 Lisp_Object
9489 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9490                               bool encodep)
9491 {
9492   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9493 }
9494
9495 /* Encode or decode a file name, to or from a unibyte string suitable
9496    for passing to C library functions.  */
9497 Lisp_Object
9498 decode_file_name (Lisp_Object fname)
9499 {
9500 #ifdef WINDOWSNT
9501   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9502      converts the file names either to UTF-16LE or to the system ANSI
9503      codepage internally, depending on the underlying OS; see w32.c.  */
9504   if (! NILP (Fcoding_system_p (Qutf_8)))
9505     return code_convert_string_norecord (fname, Qutf_8, 0);
9506   return fname;
9507 #else  /* !WINDOWSNT */
9508   if (! NILP (Vfile_name_coding_system))
9509     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9510   else if (! NILP (Vdefault_file_name_coding_system))
9511     return code_convert_string_norecord (fname,
9512                                          Vdefault_file_name_coding_system, 0);
9513   else
9514     return fname;
9515 #endif
9516 }
9517
9518 Lisp_Object
9519 encode_file_name (Lisp_Object fname)
9520 {
9521   /* This is especially important during bootstrap and dumping, when
9522      file-name encoding is not yet known, and therefore any non-ASCII
9523      file names are unibyte strings, and could only be thrashed if we
9524      try to encode them.  */
9525   if (!STRING_MULTIBYTE (fname))
9526     return fname;
9527 #ifdef WINDOWSNT
9528   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9529      converts the file names either to UTF-16LE or to the system ANSI
9530      codepage internally, depending on the underlying OS; see w32.c.  */
9531   if (! NILP (Fcoding_system_p (Qutf_8)))
9532     return code_convert_string_norecord (fname, Qutf_8, 1);
9533   return fname;
9534 #else  /* !WINDOWSNT */
9535   if (! NILP (Vfile_name_coding_system))
9536     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9537   else if (! NILP (Vdefault_file_name_coding_system))
9538     return code_convert_string_norecord (fname,
9539                                          Vdefault_file_name_coding_system, 1);
9540   else
9541     return fname;
9542 #endif
9543 }
9544
9545 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9546        2, 4, 0,
9547        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9548
9549 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9550 if the decoding operation is trivial.
9551
9552 Optional fourth arg BUFFER non-nil means that the decoded text is
9553 inserted in that buffer after point (point does not move).  In this
9554 case, the return value is the length of the decoded text.
9555
9556 This function sets `last-coding-system-used' to the precise coding system
9557 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9558 not fully specified.)  */)
9559   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9560 {
9561   return code_convert_string (string, coding_system, buffer,
9562                               0, ! NILP (nocopy), 0);
9563 }
9564
9565 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9566        2, 4, 0,
9567        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9568
9569 Optional third arg NOCOPY non-nil means it is OK to return STRING
9570 itself if the encoding operation is trivial.
9571
9572 Optional fourth arg BUFFER non-nil means that the encoded text is
9573 inserted in that buffer after point (point does not move).  In this
9574 case, the return value is the length of the encoded text.
9575
9576 This function sets `last-coding-system-used' to the precise coding system
9577 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9578 not fully specified.)  */)
9579   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9580 {
9581   return code_convert_string (string, coding_system, buffer,
9582                               1, ! NILP (nocopy), 0);
9583 }
9584
9585 \f
9586 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9587        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9588 Return the corresponding character.  */)
9589   (Lisp_Object code)
9590 {
9591   Lisp_Object spec, attrs, val;
9592   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9593   EMACS_INT ch;
9594   int c;
9595
9596   CHECK_NATNUM (code);
9597   ch = XFASTINT (code);
9598   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9599   attrs = AREF (spec, 0);
9600
9601   if (ASCII_BYTE_P (ch)
9602       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9603     return code;
9604
9605   val = CODING_ATTR_CHARSET_LIST (attrs);
9606   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9607   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9608   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9609
9610   if (ch <= 0x7F)
9611     {
9612       c = ch;
9613       charset = charset_roman;
9614     }
9615   else if (ch >= 0xA0 && ch < 0xDF)
9616     {
9617       c = ch - 0x80;
9618       charset = charset_kana;
9619     }
9620   else
9621     {
9622       EMACS_INT c1 = ch >> 8;
9623       int c2 = ch & 0xFF;
9624
9625       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9626           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9627         error ("Invalid code: %"pI"d", ch);
9628       c = ch;
9629       SJIS_TO_JIS (c);
9630       charset = charset_kanji;
9631     }
9632   c = DECODE_CHAR (charset, c);
9633   if (c < 0)
9634     error ("Invalid code: %"pI"d", ch);
9635   return make_number (c);
9636 }
9637
9638
9639 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9640        doc: /* Encode a Japanese character CH to shift_jis encoding.
9641 Return the corresponding code in SJIS.  */)
9642   (Lisp_Object ch)
9643 {
9644   Lisp_Object spec, attrs, charset_list;
9645   int c;
9646   struct charset *charset;
9647   unsigned code;
9648
9649   CHECK_CHARACTER (ch);
9650   c = XFASTINT (ch);
9651   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9652   attrs = AREF (spec, 0);
9653
9654   if (ASCII_CHAR_P (c)
9655       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9656     return ch;
9657
9658   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9659   charset = char_charset (c, charset_list, &code);
9660   if (code == CHARSET_INVALID_CODE (charset))
9661     error ("Can't encode by shift_jis encoding: %c", c);
9662   JIS_TO_SJIS (code);
9663
9664   return make_number (code);
9665 }
9666
9667 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9668        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9669 Return the corresponding character.  */)
9670   (Lisp_Object code)
9671 {
9672   Lisp_Object spec, attrs, val;
9673   struct charset *charset_roman, *charset_big5, *charset;
9674   EMACS_INT ch;
9675   int c;
9676
9677   CHECK_NATNUM (code);
9678   ch = XFASTINT (code);
9679   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9680   attrs = AREF (spec, 0);
9681
9682   if (ASCII_BYTE_P (ch)
9683       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9684     return code;
9685
9686   val = CODING_ATTR_CHARSET_LIST (attrs);
9687   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9688   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9689
9690   if (ch <= 0x7F)
9691     {
9692       c = ch;
9693       charset = charset_roman;
9694     }
9695   else
9696     {
9697       EMACS_INT b1 = ch >> 8;
9698       int b2 = ch & 0x7F;
9699       if (b1 < 0xA1 || b1 > 0xFE
9700           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9701         error ("Invalid code: %"pI"d", ch);
9702       c = ch;
9703       charset = charset_big5;
9704     }
9705   c = DECODE_CHAR (charset, c);
9706   if (c < 0)
9707     error ("Invalid code: %"pI"d", ch);
9708   return make_number (c);
9709 }
9710
9711 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9712        doc: /* Encode the Big5 character CH to BIG5 coding system.
9713 Return the corresponding character code in Big5.  */)
9714   (Lisp_Object ch)
9715 {
9716   Lisp_Object spec, attrs, charset_list;
9717   struct charset *charset;
9718   int c;
9719   unsigned code;
9720
9721   CHECK_CHARACTER (ch);
9722   c = XFASTINT (ch);
9723   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9724   attrs = AREF (spec, 0);
9725   if (ASCII_CHAR_P (c)
9726       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9727     return ch;
9728
9729   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9730   charset = char_charset (c, charset_list, &code);
9731   if (code == CHARSET_INVALID_CODE (charset))
9732     error ("Can't encode by Big5 encoding: %c", c);
9733
9734   return make_number (code);
9735 }
9736
9737 \f
9738 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9739        Sset_terminal_coding_system_internal, 1, 2, 0,
9740        doc: /* Internal use only.  */)
9741   (Lisp_Object coding_system, Lisp_Object terminal)
9742 {
9743   struct terminal *term = get_terminal (terminal, 1);
9744   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9745   CHECK_SYMBOL (coding_system);
9746   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9747   /* We had better not send unsafe characters to terminal.  */
9748   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9749   /* Character composition should be disabled.  */
9750   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9751   terminal_coding->src_multibyte = 1;
9752   terminal_coding->dst_multibyte = 0;
9753   tset_charset_list
9754     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9755             ? coding_charset_list (terminal_coding)
9756             : list1 (make_number (charset_ascii))));
9757   return Qnil;
9758 }
9759
9760 DEFUN ("set-safe-terminal-coding-system-internal",
9761        Fset_safe_terminal_coding_system_internal,
9762        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9763        doc: /* Internal use only.  */)
9764   (Lisp_Object coding_system)
9765 {
9766   CHECK_SYMBOL (coding_system);
9767   setup_coding_system (Fcheck_coding_system (coding_system),
9768                        &safe_terminal_coding);
9769   /* Character composition should be disabled.  */
9770   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9771   safe_terminal_coding.src_multibyte = 1;
9772   safe_terminal_coding.dst_multibyte = 0;
9773   return Qnil;
9774 }
9775
9776 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9777        Sterminal_coding_system, 0, 1, 0,
9778        doc: /* Return coding system specified for terminal output on the given terminal.
9779 TERMINAL may be a terminal object, a frame, or nil for the selected
9780 frame's terminal device.  */)
9781   (Lisp_Object terminal)
9782 {
9783   struct coding_system *terminal_coding
9784     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9785   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9786
9787   /* For backward compatibility, return nil if it is `undecided'.  */
9788   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9789 }
9790
9791 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9792        Sset_keyboard_coding_system_internal, 1, 2, 0,
9793        doc: /* Internal use only.  */)
9794   (Lisp_Object coding_system, Lisp_Object terminal)
9795 {
9796   struct terminal *t = get_terminal (terminal, 1);
9797   CHECK_SYMBOL (coding_system);
9798   if (NILP (coding_system))
9799     coding_system = Qno_conversion;
9800   else
9801     Fcheck_coding_system (coding_system);
9802   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9803   /* Character composition should be disabled.  */
9804   TERMINAL_KEYBOARD_CODING (t)->common_flags
9805     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9806   return Qnil;
9807 }
9808
9809 DEFUN ("keyboard-coding-system",
9810        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9811        doc: /* Return coding system specified for decoding keyboard input.  */)
9812   (Lisp_Object terminal)
9813 {
9814   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9815                          (get_terminal (terminal, 1))->id);
9816 }
9817
9818 \f
9819 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9820        Sfind_operation_coding_system,  1, MANY, 0,
9821        doc: /* Choose a coding system for an operation based on the target name.
9822 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9823 DECODING-SYSTEM is the coding system to use for decoding
9824 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9825 for encoding (in case OPERATION does encoding).
9826
9827 The first argument OPERATION specifies an I/O primitive:
9828   For file I/O, `insert-file-contents' or `write-region'.
9829   For process I/O, `call-process', `call-process-region', or `start-process'.
9830   For network I/O, `open-network-stream'.
9831
9832 The remaining arguments should be the same arguments that were passed
9833 to the primitive.  Depending on which primitive, one of those arguments
9834 is selected as the TARGET.  For example, if OPERATION does file I/O,
9835 whichever argument specifies the file name is TARGET.
9836
9837 TARGET has a meaning which depends on OPERATION:
9838   For file I/O, TARGET is a file name (except for the special case below).
9839   For process I/O, TARGET is a process name.
9840   For network I/O, TARGET is a service name or a port number.
9841
9842 This function looks up what is specified for TARGET in
9843 `file-coding-system-alist', `process-coding-system-alist',
9844 or `network-coding-system-alist' depending on OPERATION.
9845 They may specify a coding system, a cons of coding systems,
9846 or a function symbol to call.
9847 In the last case, we call the function with one argument,
9848 which is a list of all the arguments given to this function.
9849 If the function can't decide a coding system, it can return
9850 `undecided' so that the normal code-detection is performed.
9851
9852 If OPERATION is `insert-file-contents', the argument corresponding to
9853 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9854 file name to look up, and BUFFER is a buffer that contains the file's
9855 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9856 function to call for FILENAME, that function should examine the
9857 contents of BUFFER instead of reading the file.
9858
9859 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9860   (ptrdiff_t nargs, Lisp_Object *args)
9861 {
9862   Lisp_Object operation, target_idx, target, val;
9863   register Lisp_Object chain;
9864
9865   if (nargs < 2)
9866     error ("Too few arguments");
9867   operation = args[0];
9868   if (!SYMBOLP (operation)
9869       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9870     error ("Invalid first argument");
9871   if (nargs <= 1 + XFASTINT (target_idx))
9872     error ("Too few arguments for operation `%s'",
9873            SDATA (SYMBOL_NAME (operation)));
9874   target = args[XFASTINT (target_idx) + 1];
9875   if (!(STRINGP (target)
9876         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9877             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9878         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9879     error ("Invalid argument %"pI"d of operation `%s'",
9880            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9881   if (CONSP (target))
9882     target = XCAR (target);
9883
9884   chain = ((EQ (operation, Qinsert_file_contents)
9885             || EQ (operation, Qwrite_region))
9886            ? Vfile_coding_system_alist
9887            : (EQ (operation, Qopen_network_stream)
9888               ? Vnetwork_coding_system_alist
9889               : Vprocess_coding_system_alist));
9890   if (NILP (chain))
9891     return Qnil;
9892
9893   for (; CONSP (chain); chain = XCDR (chain))
9894     {
9895       Lisp_Object elt;
9896
9897       elt = XCAR (chain);
9898       if (CONSP (elt)
9899           && ((STRINGP (target)
9900                && STRINGP (XCAR (elt))
9901                && fast_string_match (XCAR (elt), target) >= 0)
9902               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9903         {
9904           val = XCDR (elt);
9905           /* Here, if VAL is both a valid coding system and a valid
9906              function symbol, we return VAL as a coding system.  */
9907           if (CONSP (val))
9908             return val;
9909           if (! SYMBOLP (val))
9910             return Qnil;
9911           if (! NILP (Fcoding_system_p (val)))
9912             return Fcons (val, val);
9913           if (! NILP (Ffboundp (val)))
9914             {
9915               /* We use call1 rather than safe_call1
9916                  so as to get bug reports about functions called here
9917                  which don't handle the current interface.  */
9918               val = call1 (val, Flist (nargs, args));
9919               if (CONSP (val))
9920                 return val;
9921               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9922                 return Fcons (val, val);
9923             }
9924           return Qnil;
9925         }
9926     }
9927   return Qnil;
9928 }
9929
9930 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9931        Sset_coding_system_priority, 0, MANY, 0,
9932        doc: /* Assign higher priority to the coding systems given as arguments.
9933 If multiple coding systems belong to the same category,
9934 all but the first one are ignored.
9935
9936 usage: (set-coding-system-priority &rest coding-systems)  */)
9937   (ptrdiff_t nargs, Lisp_Object *args)
9938 {
9939   ptrdiff_t i, j;
9940   bool changed[coding_category_max];
9941   enum coding_category priorities[coding_category_max];
9942
9943   memset (changed, 0, sizeof changed);
9944
9945   for (i = j = 0; i < nargs; i++)
9946     {
9947       enum coding_category category;
9948       Lisp_Object spec, attrs;
9949
9950       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9951       attrs = AREF (spec, 0);
9952       category = XINT (CODING_ATTR_CATEGORY (attrs));
9953       if (changed[category])
9954         /* Ignore this coding system because a coding system of the
9955            same category already had a higher priority.  */
9956         continue;
9957       changed[category] = 1;
9958       priorities[j++] = category;
9959       if (coding_categories[category].id >= 0
9960           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9961         setup_coding_system (args[i], &coding_categories[category]);
9962       Fset (AREF (Vcoding_category_table, category), args[i]);
9963     }
9964
9965   /* Now we have decided top J priorities.  Reflect the order of the
9966      original priorities to the remaining priorities.  */
9967
9968   for (i = j, j = 0; i < coding_category_max; i++, j++)
9969     {
9970       while (j < coding_category_max
9971              && changed[coding_priorities[j]])
9972         j++;
9973       if (j == coding_category_max)
9974         emacs_abort ();
9975       priorities[i] = coding_priorities[j];
9976     }
9977
9978   memcpy (coding_priorities, priorities, sizeof priorities);
9979
9980   /* Update `coding-category-list'.  */
9981   Vcoding_category_list = Qnil;
9982   for (i = coding_category_max; i-- > 0; )
9983     Vcoding_category_list
9984       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9985                Vcoding_category_list);
9986
9987   return Qnil;
9988 }
9989
9990 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9991        Scoding_system_priority_list, 0, 1, 0,
9992        doc: /* Return a list of coding systems ordered by their priorities.
9993 The list contains a subset of coding systems; i.e. coding systems
9994 assigned to each coding category (see `coding-category-list').
9995
9996 HIGHESTP non-nil means just return the highest priority one.  */)
9997   (Lisp_Object highestp)
9998 {
9999   int i;
10000   Lisp_Object val;
10001
10002   for (i = 0, val = Qnil; i < coding_category_max; i++)
10003     {
10004       enum coding_category category = coding_priorities[i];
10005       int id = coding_categories[category].id;
10006       Lisp_Object attrs;
10007
10008       if (id < 0)
10009         continue;
10010       attrs = CODING_ID_ATTRS (id);
10011       if (! NILP (highestp))
10012         return CODING_ATTR_BASE_NAME (attrs);
10013       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10014     }
10015   return Fnreverse (val);
10016 }
10017
10018 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10019
10020 static Lisp_Object
10021 make_subsidiaries (Lisp_Object base)
10022 {
10023   Lisp_Object subsidiaries;
10024   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10025   char *buf = alloca (base_name_len + 6);
10026   int i;
10027
10028   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10029   subsidiaries = make_uninit_vector (3);
10030   for (i = 0; i < 3; i++)
10031     {
10032       strcpy (buf + base_name_len, suffixes[i]);
10033       ASET (subsidiaries, i, intern (buf));
10034     }
10035   return subsidiaries;
10036 }
10037
10038
10039 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10040        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10041        doc: /* For internal use only.
10042 usage: (define-coding-system-internal ...)  */)
10043   (ptrdiff_t nargs, Lisp_Object *args)
10044 {
10045   Lisp_Object name;
10046   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10047   Lisp_Object attrs;            /* Vector of attributes.  */
10048   Lisp_Object eol_type;
10049   Lisp_Object aliases;
10050   Lisp_Object coding_type, charset_list, safe_charsets;
10051   enum coding_category category;
10052   Lisp_Object tail, val;
10053   int max_charset_id = 0;
10054   int i;
10055
10056   if (nargs < coding_arg_max)
10057     goto short_args;
10058
10059   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10060
10061   name = args[coding_arg_name];
10062   CHECK_SYMBOL (name);
10063   ASET (attrs, coding_attr_base_name, name);
10064
10065   val = args[coding_arg_mnemonic];
10066   if (! STRINGP (val))
10067     CHECK_CHARACTER (val);
10068   ASET (attrs, coding_attr_mnemonic, val);
10069
10070   coding_type = args[coding_arg_coding_type];
10071   CHECK_SYMBOL (coding_type);
10072   ASET (attrs, coding_attr_type, coding_type);
10073
10074   charset_list = args[coding_arg_charset_list];
10075   if (SYMBOLP (charset_list))
10076     {
10077       if (EQ (charset_list, Qiso_2022))
10078         {
10079           if (! EQ (coding_type, Qiso_2022))
10080             error ("Invalid charset-list");
10081           charset_list = Viso_2022_charset_list;
10082         }
10083       else if (EQ (charset_list, Qemacs_mule))
10084         {
10085           if (! EQ (coding_type, Qemacs_mule))
10086             error ("Invalid charset-list");
10087           charset_list = Vemacs_mule_charset_list;
10088         }
10089       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10090         {
10091           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10092             error ("Invalid charset-list");
10093           if (max_charset_id < XFASTINT (XCAR (tail)))
10094             max_charset_id = XFASTINT (XCAR (tail));
10095         }
10096     }
10097   else
10098     {
10099       charset_list = Fcopy_sequence (charset_list);
10100       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10101         {
10102           struct charset *charset;
10103
10104           val = XCAR (tail);
10105           CHECK_CHARSET_GET_CHARSET (val, charset);
10106           if (EQ (coding_type, Qiso_2022)
10107               ? CHARSET_ISO_FINAL (charset) < 0
10108               : EQ (coding_type, Qemacs_mule)
10109               ? CHARSET_EMACS_MULE_ID (charset) < 0
10110               : 0)
10111             error ("Can't handle charset `%s'",
10112                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10113
10114           XSETCAR (tail, make_number (charset->id));
10115           if (max_charset_id < charset->id)
10116             max_charset_id = charset->id;
10117         }
10118     }
10119   ASET (attrs, coding_attr_charset_list, charset_list);
10120
10121   safe_charsets = make_uninit_string (max_charset_id + 1);
10122   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10123   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10124     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10125   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10126
10127   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10128
10129   val = args[coding_arg_decode_translation_table];
10130   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10131     CHECK_SYMBOL (val);
10132   ASET (attrs, coding_attr_decode_tbl, val);
10133
10134   val = args[coding_arg_encode_translation_table];
10135   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10136     CHECK_SYMBOL (val);
10137   ASET (attrs, coding_attr_encode_tbl, val);
10138
10139   val = args[coding_arg_post_read_conversion];
10140   CHECK_SYMBOL (val);
10141   ASET (attrs, coding_attr_post_read, val);
10142
10143   val = args[coding_arg_pre_write_conversion];
10144   CHECK_SYMBOL (val);
10145   ASET (attrs, coding_attr_pre_write, val);
10146
10147   val = args[coding_arg_default_char];
10148   if (NILP (val))
10149     ASET (attrs, coding_attr_default_char, make_number (' '));
10150   else
10151     {
10152       CHECK_CHARACTER (val);
10153       ASET (attrs, coding_attr_default_char, val);
10154     }
10155
10156   val = args[coding_arg_for_unibyte];
10157   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10158
10159   val = args[coding_arg_plist];
10160   CHECK_LIST (val);
10161   ASET (attrs, coding_attr_plist, val);
10162
10163   if (EQ (coding_type, Qcharset))
10164     {
10165       /* Generate a lisp vector of 256 elements.  Each element is nil,
10166          integer, or a list of charset IDs.
10167
10168          If Nth element is nil, the byte code N is invalid in this
10169          coding system.
10170
10171          If Nth element is a number NUM, N is the first byte of a
10172          charset whose ID is NUM.
10173
10174          If Nth element is a list of charset IDs, N is the first byte
10175          of one of them.  The list is sorted by dimensions of the
10176          charsets.  A charset of smaller dimension comes first. */
10177       val = Fmake_vector (make_number (256), Qnil);
10178
10179       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10180         {
10181           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10182           int dim = CHARSET_DIMENSION (charset);
10183           int idx = (dim - 1) * 4;
10184
10185           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10186             ASET (attrs, coding_attr_ascii_compat, Qt);
10187
10188           for (i = charset->code_space[idx];
10189                i <= charset->code_space[idx + 1]; i++)
10190             {
10191               Lisp_Object tmp, tmp2;
10192               int dim2;
10193
10194               tmp = AREF (val, i);
10195               if (NILP (tmp))
10196                 tmp = XCAR (tail);
10197               else if (NUMBERP (tmp))
10198                 {
10199                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10200                   if (dim < dim2)
10201                     tmp = list2 (XCAR (tail), tmp);
10202                   else
10203                     tmp = list2 (tmp, XCAR (tail));
10204                 }
10205               else
10206                 {
10207                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10208                     {
10209                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10210                       if (dim < dim2)
10211                         break;
10212                     }
10213                   if (NILP (tmp2))
10214                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10215                   else
10216                     {
10217                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10218                       XSETCAR (tmp2, XCAR (tail));
10219                     }
10220                 }
10221               ASET (val, i, tmp);
10222             }
10223         }
10224       ASET (attrs, coding_attr_charset_valids, val);
10225       category = coding_category_charset;
10226     }
10227   else if (EQ (coding_type, Qccl))
10228     {
10229       Lisp_Object valids;
10230
10231       if (nargs < coding_arg_ccl_max)
10232         goto short_args;
10233
10234       val = args[coding_arg_ccl_decoder];
10235       CHECK_CCL_PROGRAM (val);
10236       if (VECTORP (val))
10237         val = Fcopy_sequence (val);
10238       ASET (attrs, coding_attr_ccl_decoder, val);
10239
10240       val = args[coding_arg_ccl_encoder];
10241       CHECK_CCL_PROGRAM (val);
10242       if (VECTORP (val))
10243         val = Fcopy_sequence (val);
10244       ASET (attrs, coding_attr_ccl_encoder, val);
10245
10246       val = args[coding_arg_ccl_valids];
10247       valids = Fmake_string (make_number (256), make_number (0));
10248       for (tail = val; CONSP (tail); tail = XCDR (tail))
10249         {
10250           int from, to;
10251
10252           val = XCAR (tail);
10253           if (INTEGERP (val))
10254             {
10255               if (! (0 <= XINT (val) && XINT (val) <= 255))
10256                 args_out_of_range_3 (val, make_number (0), make_number (255));
10257               from = to = XINT (val);
10258             }
10259           else
10260             {
10261               CHECK_CONS (val);
10262               CHECK_NATNUM_CAR (val);
10263               CHECK_NUMBER_CDR (val);
10264               if (XINT (XCAR (val)) > 255)
10265                 args_out_of_range_3 (XCAR (val),
10266                                      make_number (0), make_number (255));
10267               from = XINT (XCAR (val));
10268               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10269                 args_out_of_range_3 (XCDR (val),
10270                                      XCAR (val), make_number (255));
10271               to = XINT (XCDR (val));
10272             }
10273           for (i = from; i <= to; i++)
10274             SSET (valids, i, 1);
10275         }
10276       ASET (attrs, coding_attr_ccl_valids, valids);
10277
10278       category = coding_category_ccl;
10279     }
10280   else if (EQ (coding_type, Qutf_16))
10281     {
10282       Lisp_Object bom, endian;
10283
10284       ASET (attrs, coding_attr_ascii_compat, Qnil);
10285
10286       if (nargs < coding_arg_utf16_max)
10287         goto short_args;
10288
10289       bom = args[coding_arg_utf16_bom];
10290       if (! NILP (bom) && ! EQ (bom, Qt))
10291         {
10292           CHECK_CONS (bom);
10293           val = XCAR (bom);
10294           CHECK_CODING_SYSTEM (val);
10295           val = XCDR (bom);
10296           CHECK_CODING_SYSTEM (val);
10297         }
10298       ASET (attrs, coding_attr_utf_bom, bom);
10299
10300       endian = args[coding_arg_utf16_endian];
10301       CHECK_SYMBOL (endian);
10302       if (NILP (endian))
10303         endian = Qbig;
10304       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10305         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10306       ASET (attrs, coding_attr_utf_16_endian, endian);
10307
10308       category = (CONSP (bom)
10309                   ? coding_category_utf_16_auto
10310                   : NILP (bom)
10311                   ? (EQ (endian, Qbig)
10312                      ? coding_category_utf_16_be_nosig
10313                      : coding_category_utf_16_le_nosig)
10314                   : (EQ (endian, Qbig)
10315                      ? coding_category_utf_16_be
10316                      : coding_category_utf_16_le));
10317     }
10318   else if (EQ (coding_type, Qiso_2022))
10319     {
10320       Lisp_Object initial, reg_usage, request, flags;
10321
10322       if (nargs < coding_arg_iso2022_max)
10323         goto short_args;
10324
10325       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10326       CHECK_VECTOR (initial);
10327       for (i = 0; i < 4; i++)
10328         {
10329           val = AREF (initial, i);
10330           if (! NILP (val))
10331             {
10332               struct charset *charset;
10333
10334               CHECK_CHARSET_GET_CHARSET (val, charset);
10335               ASET (initial, i, make_number (CHARSET_ID (charset)));
10336               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10337                 ASET (attrs, coding_attr_ascii_compat, Qt);
10338             }
10339           else
10340             ASET (initial, i, make_number (-1));
10341         }
10342
10343       reg_usage = args[coding_arg_iso2022_reg_usage];
10344       CHECK_CONS (reg_usage);
10345       CHECK_NUMBER_CAR (reg_usage);
10346       CHECK_NUMBER_CDR (reg_usage);
10347
10348       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10349       for (tail = request; CONSP (tail); tail = XCDR (tail))
10350         {
10351           int id;
10352           Lisp_Object tmp1;
10353
10354           val = XCAR (tail);
10355           CHECK_CONS (val);
10356           tmp1 = XCAR (val);
10357           CHECK_CHARSET_GET_ID (tmp1, id);
10358           CHECK_NATNUM_CDR (val);
10359           if (XINT (XCDR (val)) >= 4)
10360             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10361           XSETCAR (val, make_number (id));
10362         }
10363
10364       flags = args[coding_arg_iso2022_flags];
10365       CHECK_NATNUM (flags);
10366       i = XINT (flags) & INT_MAX;
10367       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10368         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10369       flags = make_number (i);
10370
10371       ASET (attrs, coding_attr_iso_initial, initial);
10372       ASET (attrs, coding_attr_iso_usage, reg_usage);
10373       ASET (attrs, coding_attr_iso_request, request);
10374       ASET (attrs, coding_attr_iso_flags, flags);
10375       setup_iso_safe_charsets (attrs);
10376
10377       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10378         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10379                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10380                     ? coding_category_iso_7_else
10381                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10382                     ? coding_category_iso_7
10383                     : coding_category_iso_7_tight);
10384       else
10385         {
10386           int id = XINT (AREF (initial, 1));
10387
10388           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10389                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10390                        || id < 0)
10391                       ? coding_category_iso_8_else
10392                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10393                       ? coding_category_iso_8_1
10394                       : coding_category_iso_8_2);
10395         }
10396       if (category != coding_category_iso_8_1
10397           && category != coding_category_iso_8_2)
10398         ASET (attrs, coding_attr_ascii_compat, Qnil);
10399     }
10400   else if (EQ (coding_type, Qemacs_mule))
10401     {
10402       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10403         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10404       ASET (attrs, coding_attr_ascii_compat, Qt);
10405       category = coding_category_emacs_mule;
10406     }
10407   else if (EQ (coding_type, Qshift_jis))
10408     {
10409
10410       struct charset *charset;
10411
10412       if (XINT (Flength (charset_list)) != 3
10413           && XINT (Flength (charset_list)) != 4)
10414         error ("There should be three or four charsets");
10415
10416       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10417       if (CHARSET_DIMENSION (charset) != 1)
10418         error ("Dimension of charset %s is not one",
10419                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10420       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10421         ASET (attrs, coding_attr_ascii_compat, Qt);
10422
10423       charset_list = XCDR (charset_list);
10424       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10425       if (CHARSET_DIMENSION (charset) != 1)
10426         error ("Dimension of charset %s is not one",
10427                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10428
10429       charset_list = XCDR (charset_list);
10430       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10431       if (CHARSET_DIMENSION (charset) != 2)
10432         error ("Dimension of charset %s is not two",
10433                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10434
10435       charset_list = XCDR (charset_list);
10436       if (! NILP (charset_list))
10437         {
10438           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10439           if (CHARSET_DIMENSION (charset) != 2)
10440             error ("Dimension of charset %s is not two",
10441                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10442         }
10443
10444       category = coding_category_sjis;
10445       Vsjis_coding_system = name;
10446     }
10447   else if (EQ (coding_type, Qbig5))
10448     {
10449       struct charset *charset;
10450
10451       if (XINT (Flength (charset_list)) != 2)
10452         error ("There should be just two charsets");
10453
10454       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10455       if (CHARSET_DIMENSION (charset) != 1)
10456         error ("Dimension of charset %s is not one",
10457                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10458       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10459         ASET (attrs, coding_attr_ascii_compat, Qt);
10460
10461       charset_list = XCDR (charset_list);
10462       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10463       if (CHARSET_DIMENSION (charset) != 2)
10464         error ("Dimension of charset %s is not two",
10465                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10466
10467       category = coding_category_big5;
10468       Vbig5_coding_system = name;
10469     }
10470   else if (EQ (coding_type, Qraw_text))
10471     {
10472       category = coding_category_raw_text;
10473       ASET (attrs, coding_attr_ascii_compat, Qt);
10474     }
10475   else if (EQ (coding_type, Qutf_8))
10476     {
10477       Lisp_Object bom;
10478
10479       if (nargs < coding_arg_utf8_max)
10480         goto short_args;
10481
10482       bom = args[coding_arg_utf8_bom];
10483       if (! NILP (bom) && ! EQ (bom, Qt))
10484         {
10485           CHECK_CONS (bom);
10486           val = XCAR (bom);
10487           CHECK_CODING_SYSTEM (val);
10488           val = XCDR (bom);
10489           CHECK_CODING_SYSTEM (val);
10490         }
10491       ASET (attrs, coding_attr_utf_bom, bom);
10492       if (NILP (bom))
10493         ASET (attrs, coding_attr_ascii_compat, Qt);
10494
10495       category = (CONSP (bom) ? coding_category_utf_8_auto
10496                   : NILP (bom) ? coding_category_utf_8_nosig
10497                   : coding_category_utf_8_sig);
10498     }
10499   else if (EQ (coding_type, Qundecided))
10500     {
10501       if (nargs < coding_arg_undecided_max)
10502         goto short_args;
10503       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10504             args[coding_arg_undecided_inhibit_null_byte_detection]);
10505       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10506             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10507       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10508             args[coding_arg_undecided_prefer_utf_8]);
10509       category = coding_category_undecided;
10510     }
10511   else
10512     error ("Invalid coding system type: %s",
10513            SDATA (SYMBOL_NAME (coding_type)));
10514
10515   ASET (attrs, coding_attr_category, make_number (category));
10516   ASET (attrs, coding_attr_plist,
10517         Fcons (QCcategory,
10518                Fcons (AREF (Vcoding_category_table, category),
10519                       CODING_ATTR_PLIST (attrs))));
10520   ASET (attrs, coding_attr_plist,
10521         Fcons (QCascii_compatible_p,
10522                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10523                       CODING_ATTR_PLIST (attrs))));
10524
10525   eol_type = args[coding_arg_eol_type];
10526   if (! NILP (eol_type)
10527       && ! EQ (eol_type, Qunix)
10528       && ! EQ (eol_type, Qdos)
10529       && ! EQ (eol_type, Qmac))
10530     error ("Invalid eol-type");
10531
10532   aliases = list1 (name);
10533
10534   if (NILP (eol_type))
10535     {
10536       eol_type = make_subsidiaries (name);
10537       for (i = 0; i < 3; i++)
10538         {
10539           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10540
10541           this_name = AREF (eol_type, i);
10542           this_aliases = list1 (this_name);
10543           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10544           this_spec = make_uninit_vector (3);
10545           ASET (this_spec, 0, attrs);
10546           ASET (this_spec, 1, this_aliases);
10547           ASET (this_spec, 2, this_eol_type);
10548           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10549           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10550           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10551           if (NILP (val))
10552             Vcoding_system_alist
10553               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10554                        Vcoding_system_alist);
10555         }
10556     }
10557
10558   spec_vec = make_uninit_vector (3);
10559   ASET (spec_vec, 0, attrs);
10560   ASET (spec_vec, 1, aliases);
10561   ASET (spec_vec, 2, eol_type);
10562
10563   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10564   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10565   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10566   if (NILP (val))
10567     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10568                                   Vcoding_system_alist);
10569
10570   {
10571     int id = coding_categories[category].id;
10572
10573     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10574       setup_coding_system (name, &coding_categories[category]);
10575   }
10576
10577   return Qnil;
10578
10579  short_args:
10580   return Fsignal (Qwrong_number_of_arguments,
10581                   Fcons (intern ("define-coding-system-internal"),
10582                          make_number (nargs)));
10583 }
10584
10585
10586 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10587        3, 3, 0,
10588        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10589   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10590 {
10591   Lisp_Object spec, attrs;
10592
10593   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10594   attrs = AREF (spec, 0);
10595   if (EQ (prop, QCmnemonic))
10596     {
10597       if (! STRINGP (val))
10598         CHECK_CHARACTER (val);
10599       ASET (attrs, coding_attr_mnemonic, val);
10600     }
10601   else if (EQ (prop, QCdefault_char))
10602     {
10603       if (NILP (val))
10604         val = make_number (' ');
10605       else
10606         CHECK_CHARACTER (val);
10607       ASET (attrs, coding_attr_default_char, val);
10608     }
10609   else if (EQ (prop, QCdecode_translation_table))
10610     {
10611       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10612         CHECK_SYMBOL (val);
10613       ASET (attrs, coding_attr_decode_tbl, val);
10614     }
10615   else if (EQ (prop, QCencode_translation_table))
10616     {
10617       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10618         CHECK_SYMBOL (val);
10619       ASET (attrs, coding_attr_encode_tbl, val);
10620     }
10621   else if (EQ (prop, QCpost_read_conversion))
10622     {
10623       CHECK_SYMBOL (val);
10624       ASET (attrs, coding_attr_post_read, val);
10625     }
10626   else if (EQ (prop, QCpre_write_conversion))
10627     {
10628       CHECK_SYMBOL (val);
10629       ASET (attrs, coding_attr_pre_write, val);
10630     }
10631   else if (EQ (prop, QCascii_compatible_p))
10632     {
10633       ASET (attrs, coding_attr_ascii_compat, val);
10634     }
10635
10636   ASET (attrs, coding_attr_plist,
10637         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10638   return val;
10639 }
10640
10641
10642 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10643        Sdefine_coding_system_alias, 2, 2, 0,
10644        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10645   (Lisp_Object alias, Lisp_Object coding_system)
10646 {
10647   Lisp_Object spec, aliases, eol_type, val;
10648
10649   CHECK_SYMBOL (alias);
10650   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10651   aliases = AREF (spec, 1);
10652   /* ALIASES should be a list of length more than zero, and the first
10653      element is a base coding system.  Append ALIAS at the tail of the
10654      list.  */
10655   while (!NILP (XCDR (aliases)))
10656     aliases = XCDR (aliases);
10657   XSETCDR (aliases, list1 (alias));
10658
10659   eol_type = AREF (spec, 2);
10660   if (VECTORP (eol_type))
10661     {
10662       Lisp_Object subsidiaries;
10663       int i;
10664
10665       subsidiaries = make_subsidiaries (alias);
10666       for (i = 0; i < 3; i++)
10667         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10668                                      AREF (eol_type, i));
10669     }
10670
10671   Fputhash (alias, spec, Vcoding_system_hash_table);
10672   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10673   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10674   if (NILP (val))
10675     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10676                                   Vcoding_system_alist);
10677
10678   return Qnil;
10679 }
10680
10681 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10682        1, 1, 0,
10683        doc: /* Return the base of CODING-SYSTEM.
10684 Any alias or subsidiary coding system is not a base coding system.  */)
10685   (Lisp_Object coding_system)
10686 {
10687   Lisp_Object spec, attrs;
10688
10689   if (NILP (coding_system))
10690     return (Qno_conversion);
10691   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10692   attrs = AREF (spec, 0);
10693   return CODING_ATTR_BASE_NAME (attrs);
10694 }
10695
10696 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10697        1, 1, 0,
10698        doc: "Return the property list of CODING-SYSTEM.")
10699   (Lisp_Object coding_system)
10700 {
10701   Lisp_Object spec, attrs;
10702
10703   if (NILP (coding_system))
10704     coding_system = Qno_conversion;
10705   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10706   attrs = AREF (spec, 0);
10707   return CODING_ATTR_PLIST (attrs);
10708 }
10709
10710
10711 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10712        1, 1, 0,
10713        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10714   (Lisp_Object coding_system)
10715 {
10716   Lisp_Object spec;
10717
10718   if (NILP (coding_system))
10719     coding_system = Qno_conversion;
10720   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10721   return AREF (spec, 1);
10722 }
10723
10724 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10725        Scoding_system_eol_type, 1, 1, 0,
10726        doc: /* Return eol-type of CODING-SYSTEM.
10727 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10728
10729 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10730 and CR respectively.
10731
10732 A vector value indicates that a format of end-of-line should be
10733 detected automatically.  Nth element of the vector is the subsidiary
10734 coding system whose eol-type is N.  */)
10735   (Lisp_Object coding_system)
10736 {
10737   Lisp_Object spec, eol_type;
10738   int n;
10739
10740   if (NILP (coding_system))
10741     coding_system = Qno_conversion;
10742   if (! CODING_SYSTEM_P (coding_system))
10743     return Qnil;
10744   spec = CODING_SYSTEM_SPEC (coding_system);
10745   eol_type = AREF (spec, 2);
10746   if (VECTORP (eol_type))
10747     return Fcopy_sequence (eol_type);
10748   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10749   return make_number (n);
10750 }
10751
10752 #endif /* emacs */
10753
10754 \f
10755 /*** 9. Post-amble ***/
10756
10757 void
10758 init_coding_once (void)
10759 {
10760   int i;
10761
10762   for (i = 0; i < coding_category_max; i++)
10763     {
10764       coding_categories[i].id = -1;
10765       coding_priorities[i] = i;
10766     }
10767
10768   /* ISO2022 specific initialize routine.  */
10769   for (i = 0; i < 0x20; i++)
10770     iso_code_class[i] = ISO_control_0;
10771   for (i = 0x21; i < 0x7F; i++)
10772     iso_code_class[i] = ISO_graphic_plane_0;
10773   for (i = 0x80; i < 0xA0; i++)
10774     iso_code_class[i] = ISO_control_1;
10775   for (i = 0xA1; i < 0xFF; i++)
10776     iso_code_class[i] = ISO_graphic_plane_1;
10777   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10778   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10779   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10780   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10781   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10782   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10783   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10784   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10785   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10786
10787   for (i = 0; i < 256; i++)
10788     {
10789       emacs_mule_bytes[i] = 1;
10790     }
10791   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10792   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10793   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10794   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10795 }
10796
10797 #ifdef emacs
10798
10799 void
10800 syms_of_coding (void)
10801 {
10802   staticpro (&Vcoding_system_hash_table);
10803   {
10804     Lisp_Object args[2];
10805     args[0] = QCtest;
10806     args[1] = Qeq;
10807     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10808   }
10809
10810   staticpro (&Vsjis_coding_system);
10811   Vsjis_coding_system = Qnil;
10812
10813   staticpro (&Vbig5_coding_system);
10814   Vbig5_coding_system = Qnil;
10815
10816   staticpro (&Vcode_conversion_reused_workbuf);
10817   Vcode_conversion_reused_workbuf = Qnil;
10818
10819   staticpro (&Vcode_conversion_workbuf_name);
10820   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10821
10822   reused_workbuf_in_use = 0;
10823
10824   DEFSYM (Qcharset, "charset");
10825   DEFSYM (Qtarget_idx, "target-idx");
10826   DEFSYM (Qcoding_system_history, "coding-system-history");
10827   Fset (Qcoding_system_history, Qnil);
10828
10829   /* Target FILENAME is the first argument.  */
10830   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10831   /* Target FILENAME is the third argument.  */
10832   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10833
10834   DEFSYM (Qcall_process, "call-process");
10835   /* Target PROGRAM is the first argument.  */
10836   Fput (Qcall_process, Qtarget_idx, make_number (0));
10837
10838   DEFSYM (Qcall_process_region, "call-process-region");
10839   /* Target PROGRAM is the third argument.  */
10840   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10841
10842   DEFSYM (Qstart_process, "start-process");
10843   /* Target PROGRAM is the third argument.  */
10844   Fput (Qstart_process, Qtarget_idx, make_number (2));
10845
10846   DEFSYM (Qopen_network_stream, "open-network-stream");
10847   /* Target SERVICE is the fourth argument.  */
10848   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10849
10850   DEFSYM (Qcoding_system, "coding-system");
10851   DEFSYM (Qcoding_aliases, "coding-aliases");
10852
10853   DEFSYM (Qeol_type, "eol-type");
10854   DEFSYM (Qunix, "unix");
10855   DEFSYM (Qdos, "dos");
10856   DEFSYM (Qmac, "mac");
10857
10858   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10859   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10860   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10861   DEFSYM (Qdefault_char, "default-char");
10862   DEFSYM (Qundecided, "undecided");
10863   DEFSYM (Qno_conversion, "no-conversion");
10864   DEFSYM (Qraw_text, "raw-text");
10865
10866   DEFSYM (Qiso_2022, "iso-2022");
10867
10868   DEFSYM (Qutf_8, "utf-8");
10869   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10870
10871 #if defined (WINDOWSNT) || defined (CYGWIN)
10872   /* No, not utf-16-le: that one has a BOM.  */
10873   DEFSYM (Qutf_16le, "utf-16le");
10874 #endif
10875
10876   DEFSYM (Qutf_16, "utf-16");
10877   DEFSYM (Qbig, "big");
10878   DEFSYM (Qlittle, "little");
10879
10880   DEFSYM (Qshift_jis, "shift-jis");
10881   DEFSYM (Qbig5, "big5");
10882
10883   DEFSYM (Qcoding_system_p, "coding-system-p");
10884
10885   DEFSYM (Qcoding_system_error, "coding-system-error");
10886   Fput (Qcoding_system_error, Qerror_conditions,
10887         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10888   Fput (Qcoding_system_error, Qerror_message,
10889         build_pure_c_string ("Invalid coding system"));
10890
10891   DEFSYM (Qtranslation_table, "translation-table");
10892   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10893   DEFSYM (Qtranslation_table_id, "translation-table-id");
10894   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10895   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10896
10897   DEFSYM (Qvalid_codes, "valid-codes");
10898
10899   DEFSYM (Qemacs_mule, "emacs-mule");
10900
10901   DEFSYM (QCcategory, ":category");
10902   DEFSYM (QCmnemonic, ":mnemonic");
10903   DEFSYM (QCdefault_char, ":default-char");
10904   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10905   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10906   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10907   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10908   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10909
10910   Vcoding_category_table
10911     = Fmake_vector (make_number (coding_category_max), Qnil);
10912   staticpro (&Vcoding_category_table);
10913   /* Followings are target of code detection.  */
10914   ASET (Vcoding_category_table, coding_category_iso_7,
10915         intern_c_string ("coding-category-iso-7"));
10916   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10917         intern_c_string ("coding-category-iso-7-tight"));
10918   ASET (Vcoding_category_table, coding_category_iso_8_1,
10919         intern_c_string ("coding-category-iso-8-1"));
10920   ASET (Vcoding_category_table, coding_category_iso_8_2,
10921         intern_c_string ("coding-category-iso-8-2"));
10922   ASET (Vcoding_category_table, coding_category_iso_7_else,
10923         intern_c_string ("coding-category-iso-7-else"));
10924   ASET (Vcoding_category_table, coding_category_iso_8_else,
10925         intern_c_string ("coding-category-iso-8-else"));
10926   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10927         intern_c_string ("coding-category-utf-8-auto"));
10928   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10929         intern_c_string ("coding-category-utf-8"));
10930   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10931         intern_c_string ("coding-category-utf-8-sig"));
10932   ASET (Vcoding_category_table, coding_category_utf_16_be,
10933         intern_c_string ("coding-category-utf-16-be"));
10934   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10935         intern_c_string ("coding-category-utf-16-auto"));
10936   ASET (Vcoding_category_table, coding_category_utf_16_le,
10937         intern_c_string ("coding-category-utf-16-le"));
10938   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10939         intern_c_string ("coding-category-utf-16-be-nosig"));
10940   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10941         intern_c_string ("coding-category-utf-16-le-nosig"));
10942   ASET (Vcoding_category_table, coding_category_charset,
10943         intern_c_string ("coding-category-charset"));
10944   ASET (Vcoding_category_table, coding_category_sjis,
10945         intern_c_string ("coding-category-sjis"));
10946   ASET (Vcoding_category_table, coding_category_big5,
10947         intern_c_string ("coding-category-big5"));
10948   ASET (Vcoding_category_table, coding_category_ccl,
10949         intern_c_string ("coding-category-ccl"));
10950   ASET (Vcoding_category_table, coding_category_emacs_mule,
10951         intern_c_string ("coding-category-emacs-mule"));
10952   /* Followings are NOT target of code detection.  */
10953   ASET (Vcoding_category_table, coding_category_raw_text,
10954         intern_c_string ("coding-category-raw-text"));
10955   ASET (Vcoding_category_table, coding_category_undecided,
10956         intern_c_string ("coding-category-undecided"));
10957
10958   DEFSYM (Qinsufficient_source, "insufficient-source");
10959   DEFSYM (Qinvalid_source, "invalid-source");
10960   DEFSYM (Qinterrupted, "interrupted");
10961   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10962
10963   defsubr (&Scoding_system_p);
10964   defsubr (&Sread_coding_system);
10965   defsubr (&Sread_non_nil_coding_system);
10966   defsubr (&Scheck_coding_system);
10967   defsubr (&Sdetect_coding_region);
10968   defsubr (&Sdetect_coding_string);
10969   defsubr (&Sfind_coding_systems_region_internal);
10970   defsubr (&Sunencodable_char_position);
10971   defsubr (&Scheck_coding_systems_region);
10972   defsubr (&Sdecode_coding_region);
10973   defsubr (&Sencode_coding_region);
10974   defsubr (&Sdecode_coding_string);
10975   defsubr (&Sencode_coding_string);
10976   defsubr (&Sdecode_sjis_char);
10977   defsubr (&Sencode_sjis_char);
10978   defsubr (&Sdecode_big5_char);
10979   defsubr (&Sencode_big5_char);
10980   defsubr (&Sset_terminal_coding_system_internal);
10981   defsubr (&Sset_safe_terminal_coding_system_internal);
10982   defsubr (&Sterminal_coding_system);
10983   defsubr (&Sset_keyboard_coding_system_internal);
10984   defsubr (&Skeyboard_coding_system);
10985   defsubr (&Sfind_operation_coding_system);
10986   defsubr (&Sset_coding_system_priority);
10987   defsubr (&Sdefine_coding_system_internal);
10988   defsubr (&Sdefine_coding_system_alias);
10989   defsubr (&Scoding_system_put);
10990   defsubr (&Scoding_system_base);
10991   defsubr (&Scoding_system_plist);
10992   defsubr (&Scoding_system_aliases);
10993   defsubr (&Scoding_system_eol_type);
10994   defsubr (&Scoding_system_priority_list);
10995
10996   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10997                doc: /* List of coding systems.
10998
10999 Do not alter the value of this variable manually.  This variable should be
11000 updated by the functions `define-coding-system' and
11001 `define-coding-system-alias'.  */);
11002   Vcoding_system_list = Qnil;
11003
11004   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11005                doc: /* Alist of coding system names.
11006 Each element is one element list of coding system name.
11007 This variable is given to `completing-read' as COLLECTION argument.
11008
11009 Do not alter the value of this variable manually.  This variable should be
11010 updated by the functions `make-coding-system' and
11011 `define-coding-system-alias'.  */);
11012   Vcoding_system_alist = Qnil;
11013
11014   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11015                doc: /* List of coding-categories (symbols) ordered by priority.
11016
11017 On detecting a coding system, Emacs tries code detection algorithms
11018 associated with each coding-category one by one in this order.  When
11019 one algorithm agrees with a byte sequence of source text, the coding
11020 system bound to the corresponding coding-category is selected.
11021
11022 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11023   {
11024     int i;
11025
11026     Vcoding_category_list = Qnil;
11027     for (i = coding_category_max - 1; i >= 0; i--)
11028       Vcoding_category_list
11029         = Fcons (AREF (Vcoding_category_table, i),
11030                  Vcoding_category_list);
11031   }
11032
11033   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11034                doc: /* Specify the coding system for read operations.
11035 It is useful to bind this variable with `let', but do not set it globally.
11036 If the value is a coding system, it is used for decoding on read operation.
11037 If not, an appropriate element is used from one of the coding system alists.
11038 There are three such tables: `file-coding-system-alist',
11039 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11040   Vcoding_system_for_read = Qnil;
11041
11042   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11043                doc: /* Specify the coding system for write operations.
11044 Programs bind this variable with `let', but you should not set it globally.
11045 If the value is a coding system, it is used for encoding of output,
11046 when writing it to a file and when sending it to a file or subprocess.
11047
11048 If this does not specify a coding system, an appropriate element
11049 is used from one of the coding system alists.
11050 There are three such tables: `file-coding-system-alist',
11051 `process-coding-system-alist', and `network-coding-system-alist'.
11052 For output to files, if the above procedure does not specify a coding system,
11053 the value of `buffer-file-coding-system' is used.  */);
11054   Vcoding_system_for_write = Qnil;
11055
11056   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11057                doc: /*
11058 Coding system used in the latest file or process I/O.  */);
11059   Vlast_coding_system_used = Qnil;
11060
11061   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11062                doc: /*
11063 Error status of the last code conversion.
11064
11065 When an error was detected in the last code conversion, this variable
11066 is set to one of the following symbols.
11067   `insufficient-source'
11068   `inconsistent-eol'
11069   `invalid-source'
11070   `interrupted'
11071   `insufficient-memory'
11072 When no error was detected, the value doesn't change.  So, to check
11073 the error status of a code conversion by this variable, you must
11074 explicitly set this variable to nil before performing code
11075 conversion.  */);
11076   Vlast_code_conversion_error = Qnil;
11077
11078   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11079                doc: /*
11080 *Non-nil means always inhibit code conversion of end-of-line format.
11081 See info node `Coding Systems' and info node `Text and Binary' concerning
11082 such conversion.  */);
11083   inhibit_eol_conversion = 0;
11084
11085   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11086                doc: /*
11087 Non-nil means process buffer inherits coding system of process output.
11088 Bind it to t if the process output is to be treated as if it were a file
11089 read from some filesystem.  */);
11090   inherit_process_coding_system = 0;
11091
11092   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11093                doc: /*
11094 Alist to decide a coding system to use for a file I/O operation.
11095 The format is ((PATTERN . VAL) ...),
11096 where PATTERN is a regular expression matching a file name,
11097 VAL is a coding system, a cons of coding systems, or a function symbol.
11098 If VAL is a coding system, it is used for both decoding and encoding
11099 the file contents.
11100 If VAL is a cons of coding systems, the car part is used for decoding,
11101 and the cdr part is used for encoding.
11102 If VAL is a function symbol, the function must return a coding system
11103 or a cons of coding systems which are used as above.  The function is
11104 called with an argument that is a list of the arguments with which
11105 `find-operation-coding-system' was called.  If the function can't decide
11106 a coding system, it can return `undecided' so that the normal
11107 code-detection is performed.
11108
11109 See also the function `find-operation-coding-system'
11110 and the variable `auto-coding-alist'.  */);
11111   Vfile_coding_system_alist = Qnil;
11112
11113   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11114                doc: /*
11115 Alist to decide a coding system to use for a process I/O operation.
11116 The format is ((PATTERN . VAL) ...),
11117 where PATTERN is a regular expression matching a program name,
11118 VAL is a coding system, a cons of coding systems, or a function symbol.
11119 If VAL is a coding system, it is used for both decoding what received
11120 from the program and encoding what sent to the program.
11121 If VAL is a cons of coding systems, the car part is used for decoding,
11122 and the cdr part is used for encoding.
11123 If VAL is a function symbol, the function must return a coding system
11124 or a cons of coding systems which are used as above.
11125
11126 See also the function `find-operation-coding-system'.  */);
11127   Vprocess_coding_system_alist = Qnil;
11128
11129   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11130                doc: /*
11131 Alist to decide a coding system to use for a network I/O operation.
11132 The format is ((PATTERN . VAL) ...),
11133 where PATTERN is a regular expression matching a network service name
11134 or is a port number to connect to,
11135 VAL is a coding system, a cons of coding systems, or a function symbol.
11136 If VAL is a coding system, it is used for both decoding what received
11137 from the network stream and encoding what sent to the network stream.
11138 If VAL is a cons of coding systems, the car part is used for decoding,
11139 and the cdr part is used for encoding.
11140 If VAL is a function symbol, the function must return a coding system
11141 or a cons of coding systems which are used as above.
11142
11143 See also the function `find-operation-coding-system'.  */);
11144   Vnetwork_coding_system_alist = Qnil;
11145
11146   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11147                doc: /* Coding system to use with system messages.
11148 Also used for decoding keyboard input on X Window system.  */);
11149   Vlocale_coding_system = Qnil;
11150
11151   /* The eol mnemonics are reset in startup.el system-dependently.  */
11152   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11153                doc: /*
11154 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11155   eol_mnemonic_unix = build_pure_c_string (":");
11156
11157   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11158                doc: /*
11159 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11160   eol_mnemonic_dos = build_pure_c_string ("\\");
11161
11162   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11163                doc: /*
11164 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11165   eol_mnemonic_mac = build_pure_c_string ("/");
11166
11167   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11168                doc: /*
11169 *String displayed in mode line when end-of-line format is not yet determined.  */);
11170   eol_mnemonic_undecided = build_pure_c_string (":");
11171
11172   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11173                doc: /*
11174 *Non-nil enables character translation while encoding and decoding.  */);
11175   Venable_character_translation = Qt;
11176
11177   DEFVAR_LISP ("standard-translation-table-for-decode",
11178                Vstandard_translation_table_for_decode,
11179                doc: /* Table for translating characters while decoding.  */);
11180   Vstandard_translation_table_for_decode = Qnil;
11181
11182   DEFVAR_LISP ("standard-translation-table-for-encode",
11183                Vstandard_translation_table_for_encode,
11184                doc: /* Table for translating characters while encoding.  */);
11185   Vstandard_translation_table_for_encode = Qnil;
11186
11187   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11188                doc: /* Alist of charsets vs revision numbers.
11189 While encoding, if a charset (car part of an element) is found,
11190 designate it with the escape sequence identifying revision (cdr part
11191 of the element).  */);
11192   Vcharset_revision_table = Qnil;
11193
11194   DEFVAR_LISP ("default-process-coding-system",
11195                Vdefault_process_coding_system,
11196                doc: /* Cons of coding systems used for process I/O by default.
11197 The car part is used for decoding a process output,
11198 the cdr part is used for encoding a text to be sent to a process.  */);
11199   Vdefault_process_coding_system = Qnil;
11200
11201   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11202                doc: /*
11203 Table of extra Latin codes in the range 128..159 (inclusive).
11204 This is a vector of length 256.
11205 If Nth element is non-nil, the existence of code N in a file
11206 \(or output of subprocess) doesn't prevent it to be detected as
11207 a coding system of ISO 2022 variant which has a flag
11208 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11209 or reading output of a subprocess.
11210 Only 128th through 159th elements have a meaning.  */);
11211   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11212
11213   DEFVAR_LISP ("select-safe-coding-system-function",
11214                Vselect_safe_coding_system_function,
11215                doc: /*
11216 Function to call to select safe coding system for encoding a text.
11217
11218 If set, this function is called to force a user to select a proper
11219 coding system which can encode the text in the case that a default
11220 coding system used in each operation can't encode the text.  The
11221 function should take care that the buffer is not modified while
11222 the coding system is being selected.
11223
11224 The default value is `select-safe-coding-system' (which see).  */);
11225   Vselect_safe_coding_system_function = Qnil;
11226
11227   DEFVAR_BOOL ("coding-system-require-warning",
11228                coding_system_require_warning,
11229                doc: /* Internal use only.
11230 If non-nil, on writing a file, `select-safe-coding-system-function' is
11231 called even if `coding-system-for-write' is non-nil.  The command
11232 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11233   coding_system_require_warning = 0;
11234
11235
11236   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11237                inhibit_iso_escape_detection,
11238                doc: /*
11239 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11240
11241 When Emacs reads text, it tries to detect how the text is encoded.
11242 This code detection is sensitive to escape sequences.  If Emacs sees
11243 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11244 of the ISO2022 encodings, and decodes text by the corresponding coding
11245 system (e.g. `iso-2022-7bit').
11246
11247 However, there may be a case that you want to read escape sequences in
11248 a file as is.  In such a case, you can set this variable to non-nil.
11249 Then the code detection will ignore any escape sequences, and no text is
11250 detected as encoded in some ISO-2022 encoding.  The result is that all
11251 escape sequences become visible in a buffer.
11252
11253 The default value is nil, and it is strongly recommended not to change
11254 it.  That is because many Emacs Lisp source files that contain
11255 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11256 in Emacs's distribution, and they won't be decoded correctly on
11257 reading if you suppress escape sequence detection.
11258
11259 The other way to read escape sequences in a file without decoding is
11260 to explicitly specify some coding system that doesn't use ISO-2022
11261 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11262   inhibit_iso_escape_detection = 0;
11263
11264   DEFVAR_BOOL ("inhibit-null-byte-detection",
11265                inhibit_null_byte_detection,
11266                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11267 By default, Emacs treats it as binary data, and does not attempt to
11268 decode it.  The effect is as if you specified `no-conversion' for
11269 reading that text.
11270
11271 Set this to non-nil when a regular text happens to include null bytes.
11272 Examples are Index nodes of Info files and null-byte delimited output
11273 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11274 decode text as usual.  */);
11275   inhibit_null_byte_detection = 0;
11276
11277   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11278                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11279 Internal use only.  Removed after the experimental optimizer gets stable. */);
11280   disable_ascii_optimization = 0;
11281
11282   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11283                doc: /* Char table for translating self-inserting characters.
11284 This is applied to the result of input methods, not their input.
11285 See also `keyboard-translate-table'.
11286
11287 Use of this variable for character code unification was rendered
11288 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11289 internal character representation.  */);
11290     Vtranslation_table_for_input = Qnil;
11291
11292   {
11293     Lisp_Object args[coding_arg_undecided_max];
11294     Lisp_Object plist[16];
11295     int i;
11296
11297     for (i = 0; i < coding_arg_undecided_max; i++)
11298       args[i] = Qnil;
11299
11300     plist[0] = intern_c_string (":name");
11301     plist[1] = args[coding_arg_name] = Qno_conversion;
11302     plist[2] = intern_c_string (":mnemonic");
11303     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11304     plist[4] = intern_c_string (":coding-type");
11305     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11306     plist[6] = intern_c_string (":ascii-compatible-p");
11307     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11308     plist[8] = intern_c_string (":default-char");
11309     plist[9] = args[coding_arg_default_char] = make_number (0);
11310     plist[10] = intern_c_string (":for-unibyte");
11311     plist[11] = args[coding_arg_for_unibyte] = Qt;
11312     plist[12] = intern_c_string (":docstring");
11313     plist[13] = build_pure_c_string ("Do no conversion.\n\
11314 \n\
11315 When you visit a file with this coding, the file is read into a\n\
11316 unibyte buffer as is, thus each byte of a file is treated as a\n\
11317 character.");
11318     plist[14] = intern_c_string (":eol-type");
11319     plist[15] = args[coding_arg_eol_type] = Qunix;
11320     args[coding_arg_plist] = Flist (16, plist);
11321     Fdefine_coding_system_internal (coding_arg_max, args);
11322
11323     plist[1] = args[coding_arg_name] = Qundecided;
11324     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11325     plist[5] = args[coding_arg_coding_type] = Qundecided;
11326     /* This is already set.
11327        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11328     plist[8] = intern_c_string (":charset-list");
11329     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11330     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11331     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11332     plist[15] = args[coding_arg_eol_type] = Qnil;
11333     args[coding_arg_plist] = Flist (16, plist);
11334     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11335     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11336     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11337   }
11338
11339   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11340
11341   {
11342     int i;
11343
11344     for (i = 0; i < coding_category_max; i++)
11345       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11346   }
11347 #if defined (DOS_NT)
11348   system_eol_type = Qdos;
11349 #else
11350   system_eol_type = Qunix;
11351 #endif
11352   staticpro (&system_eol_type);
11353 }
11354
11355 char *
11356 emacs_strerror (int error_number)
11357 {
11358   char *str;
11359
11360   synchronize_system_messages_locale ();
11361   str = strerror (error_number);
11362
11363   if (! NILP (Vlocale_coding_system))
11364     {
11365       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11366                                                       Vlocale_coding_system,
11367                                                       0);
11368       str = SSDATA (dec);
11369     }
11370
11371   return str;
11372 }
11373
11374 #endif /* emacs */