src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  the C level, a coding system is represented by a vector of attributes
  stored in the hash table Vcoding_system_hash_table.  The conversion
  from coding system symbol to attributes vector is done by looking up
  Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
  These functions decode a byte sequence specified as a source by
  CODING.  The decoded characters are stored in the buffer pointed to
  by CODING->charbuf, the length of which must not exceed
  CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  DST_BYTES zero means that the source area and the destination area
  overlap, which means that we can produce encoded text only until it
  reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
/* Hash table of coding systems.  Presumably this is the table in which
   a coding system symbol is looked up to obtain its attribute vector
   (see "CODING SYSTEM" in the general comments) -- confirm against
   coding.h.  */
Lisp_Object Vcoding_system_hash_table;

/* Lisp objects whose names start with Q or QC.  NOTE(review): by their
   names these hold Lisp symbols and keywords; where they are
   initialized is outside this part of the file.  */
static Lisp_Object Qcoding_system, Qeol_type;
static Lisp_Object Qcoding_aliases;
/* Qunix and Qdos are possible values of system_eol_type (see below);
   Qmac is presumably the third end-of-line format -- confirm.  */
Lisp_Object Qunix, Qdos;
static Lisp_Object Qmac;
Lisp_Object Qbuffer_file_coding_system;
static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
static Lisp_Object Qdefault_char;
Lisp_Object Qno_conversion, Qundecided;
Lisp_Object Qcharset, Qutf_8;
static Lisp_Object Qiso_2022;
static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
/* Presumably the two UTF-16 byte orders -- confirm.  */
static Lisp_Object Qbig, Qlittle;
static Lisp_Object Qcoding_system_history;
static Lisp_Object Qvalid_codes;
static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
static Lisp_Object QCascii_compatible_p;

Lisp_Object Qcall_process, Qcall_process_region;
Lisp_Object Qstart_process, Qopen_network_stream;
static Lisp_Object Qtarget_idx;

/* Presumably names for the ways a conversion can end abnormally;
   compare CODING_RESULT_INSUFFICIENT_SRC and CODING_RESULT_INVALID_SRC
   used by ONE_MORE_BYTE -- confirm.  */
static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;

/* If a symbol has this property, evaluate the value to define the
   symbol as a coding system.  */
static Lisp_Object Qcoding_system_define_form;

/* Format of end-of-line decided by system.  This is Qunix on
   Unix and Mac, Qdos on DOS/Windows.
   This has an effect only for external encoding (i.e. for output to
   file and process), not for in-buffer or Lisp string encoding.  */
static Lisp_Object system_eol_type;
 340
/* Declarations needed only when this file is compiled as part of
   Emacs proper (the macro `emacs' is defined).  */
#ifdef emacs

Lisp_Object Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule, Qraw_text;
Lisp_Object Qutf_8_emacs;

/* Only declared on MS-Windows and Cygwin builds.  */
#if defined (WINDOWSNT) || defined (CYGWIN)
static Lisp_Object Qutf_16le;
#endif

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.
   NOTE(review): only one variable follows in this part of the file;
   this sentence looks stale -- confirm and remove if so.  */
/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

#endif /* emacs */
 361
/* Lisp objects related to translation tables.  NOTE(review): by their
   names these hold Lisp symbols; confirm where they are initialized.  */
Lisp_Object Qtranslation_table;
Lisp_Object Qtranslation_table_id;
static Lisp_Object Qtranslation_table_for_decode;
static Lisp_Object Qtranslation_table_for_encode;

/* Two special coding systems.  Presumably the coding systems used for
   SJIS and BIG5 conversion (see section 8) -- confirm.  */
static Lisp_Object Vsjis_coding_system;
static Lisp_Object Vbig5_coding_system;
 370
/* ISO2022 section */

/* Accessors for the ISO2022-specific state of a `struct coding_system'
   pointed to by CODING.  */

/* The integer stored at index REG of the `coding_attr_iso_initial'
   attribute of CODING.  Presumably the charset initially designated
   to graphic register REG -- confirm.  */
#define CODING_ISO_INITIAL(coding, reg)                 \
  (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
                     coding_attr_iso_initial),          \
               reg)))


/* The value of CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID
   is greater than CODING->max_charset_id or the stored value is 255.
   CHARSET_ID is evaluated more than once.  */
#define CODING_ISO_REQUEST(coding, charset_id)          \
  (((charset_id) <= (coding)->max_charset_id            \
    ? ((coding)->safe_charsets[charset_id] != 255       \
       ? (coding)->safe_charsets[charset_id]            \
       : -1)                                            \
    : -1))


/* The members of CODING->spec.iso_2022, each usable as an lvalue.  */
#define CODING_ISO_FLAGS(coding)        \
  ((coding)->spec.iso_2022.flags)
#define CODING_ISO_DESIGNATION(coding, reg)     \
  ((coding)->spec.iso_2022.current_designation[reg])
#define CODING_ISO_INVOCATION(coding, plane)    \
  ((coding)->spec.iso_2022.current_invocation[plane])
#define CODING_ISO_SINGLE_SHIFTING(coding)      \
  ((coding)->spec.iso_2022.single_shifting)
#define CODING_ISO_BOL(coding)  \
  ((coding)->spec.iso_2022.bol)
/* The designation of the register that is currently invoked to
   PLANE.  */
#define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
/* Unlike the other accessors, this one yields the address of the
   member.  */
#define CODING_ISO_CMP_STATUS(coding)   \
  (&(coding)->spec.iso_2022.cmp_status)
#define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  ((coding)->spec.iso_2022.ctext_extended_segment_len)
#define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  ((coding)->spec.iso_2022.embedded_utf_8)
 405
/* Control characters of ISO2022.  Each of them has a class of its own
   in `enum iso_code_class_type'.  */
                        /* code */      /* function */
#define ISO_CODE_SO     0x0E            /* shift-out */
#define ISO_CODE_SI     0x0F            /* shift-in */
#define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
#define ISO_CODE_ESC    0x1B            /* escape */
#define ISO_CODE_SS2    0x8E            /* single-shift-2 */
#define ISO_CODE_SS3    0x8F            /* single-shift-3 */
#define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
/* Every 1-byte code of ISO2022 is classified into one of the
   following classes.  */
enum iso_code_class_type
  {
    ISO_control_0,              /* Control codes in the range
                                   0x00..0x1F and 0x7F, except for the
                                   following 4 codes.
                                   NOTE(review): 0x7F is also listed
                                   under ISO_0x20_or_0x7F below;
                                   confirm which class it gets.  */
    ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
    ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
    ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
    ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
    ISO_control_1,              /* Control codes in the range
                                   0x80..0x9F, except for the
                                   following 3 codes.  */
    ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
    ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
    ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
    ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
    ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
    ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
    ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  };
 438
/** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
    `iso-flags' attribute of an iso2022 coding system.  */

/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
   instead of the correct short-form sequence (e.g. ESC $ A).  */
#define CODING_ISO_FLAG_LONG_FORM       0x0001

/* If set, reset graphic planes and registers at end-of-line to the
   initial state.  */
#define CODING_ISO_FLAG_RESET_AT_EOL    0x0002

/* If set, reset graphic planes and registers before any control
   characters to the initial state.  */
#define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004

/* If set, encode by 7-bit environment.  */
#define CODING_ISO_FLAG_SEVEN_BITS      0x0008

/* If set, use locking-shift function.  */
#define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010

/* If set, use single-shift function.  Overrides
   CODING_ISO_FLAG_LOCKING_SHIFT.  */
#define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020

/* If set, use designation escape sequence.  */
#define CODING_ISO_FLAG_DESIGNATION     0x0040

/* If set, produce revision number sequence.  */
#define CODING_ISO_FLAG_REVISION        0x0080

/* If set, produce ISO6429's direction specifying sequence.  */
#define CODING_ISO_FLAG_DIRECTION       0x0100

/* If set, assume designation states are reset at beginning of line on
   output.  */
#define CODING_ISO_FLAG_INIT_AT_BOL     0x0200

/* If set, designation sequence should be placed at beginning of line
   on output.  */
#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400

/* If set, do not encode unsafe characters on output.  */
#define CODING_ISO_FLAG_SAFE            0x0800

/* If set, extra latin codes (128..159) are accepted as a valid code
   on input.  */
#define CODING_ISO_FLAG_LATIN_EXTRA     0x1000

/* NOTE(review): the remaining flags are not described in this part of
   the file; confirm their meaning against the code that tests them
   before relying on the names.  The value 0x4000 is not used by any
   active definition here, and neither are 0x40000 and 0x80000.  */
#define CODING_ISO_FLAG_COMPOSITION     0x2000

/* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */

#define CODING_ISO_FLAG_USE_ROMAN       0x8000

#define CODING_ISO_FLAG_USE_OLDJIS      0x10000

#define CODING_ISO_FLAG_LEVEL_4         0x20000

#define CODING_ISO_FLAG_FULL_SUPPORT    0x100000

/* A character to be produced on output if encoding of the original
   character is prohibited by CODING_ISO_FLAG_SAFE.  */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
/* UTF-8 section */
/* The `utf_8_bom' member of CODING->spec, usable as an lvalue.  */
#define CODING_UTF_8_BOM(coding)        \
  ((coding)->spec.utf_8_bom)

/* UTF-16 section */
/* The members of CODING->spec.utf_16, each usable as an lvalue.  */
#define CODING_UTF_16_BOM(coding)       \
  ((coding)->spec.utf_16.bom)

#define CODING_UTF_16_ENDIAN(coding)    \
  ((coding)->spec.utf_16.endian)

#define CODING_UTF_16_SURROGATE(coding) \
  ((coding)->spec.utf_16.surrogate)
 517
 518
/* CCL section */
/* The `coding_attr_ccl_decoder' and `coding_attr_ccl_encoder'
   attributes of CODING, fetched from its attribute vector.  */
#define CODING_CCL_DECODER(coding)      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
#define CODING_CCL_ENCODER(coding)      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
/* Pointer to the data of the `coding_attr_ccl_valids' attribute of
   CODING, which is accessed as a Lisp string (SDATA).  */
#define CODING_CCL_VALIDS(coding)                                          \
  (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
/* Index for each coding category in `coding_categories'.  The
   CATEGORY_MASK_XXX flag bits below are derived from these values
   (1 << value), so the order here fixes the bit positions.  */

enum coding_category
  {
    coding_category_iso_7,
    coding_category_iso_7_tight,
    coding_category_iso_8_1,
    coding_category_iso_8_2,
    coding_category_iso_7_else,
    coding_category_iso_8_else,
    coding_category_utf_8_auto,
    coding_category_utf_8_nosig,
    coding_category_utf_8_sig,
    coding_category_utf_16_auto,
    coding_category_utf_16_be,
    coding_category_utf_16_le,
    coding_category_utf_16_be_nosig,
    coding_category_utf_16_le_nosig,
    coding_category_charset,
    coding_category_sjis,
    coding_category_big5,
    coding_category_ccl,
    coding_category_emacs_mule,
    /* All above are targets of code detection.  */
    coding_category_raw_text,
    coding_category_undecided,
    /* Number of categories; used as the size of `coding_priorities'
       and `coding_categories'.  */
    coding_category_max
  };
 555
/* Definitions of flag bits used in detect_coding_XXXX.  Each mask has
   the single bit whose position is the corresponding value of
   `enum coding_category'.  */
#define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
#define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
#define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
#define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
#define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
#define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
#define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
#define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
#define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
#define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
#define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
#define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
#define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
#define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
#define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
#define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)

/* This value is returned if detect_coding_mask () finds nothing other
   than ASCII characters.  It is the union of the masks of all
   categories that are targets of code detection; it does not include
   CATEGORY_MASK_RAW_TEXT.  */
#define CATEGORY_MASK_ANY               \
  (CATEGORY_MASK_ISO_7                  \
   | CATEGORY_MASK_ISO_7_TIGHT          \
   | CATEGORY_MASK_ISO_8_1              \
   | CATEGORY_MASK_ISO_8_2              \
   | CATEGORY_MASK_ISO_7_ELSE           \
   | CATEGORY_MASK_ISO_8_ELSE           \
   | CATEGORY_MASK_UTF_8_AUTO           \
   | CATEGORY_MASK_UTF_8_NOSIG          \
   | CATEGORY_MASK_UTF_8_SIG            \
   | CATEGORY_MASK_UTF_16_AUTO          \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG      \
   | CATEGORY_MASK_CHARSET              \
   | CATEGORY_MASK_SJIS                 \
   | CATEGORY_MASK_BIG5                 \
   | CATEGORY_MASK_CCL                  \
   | CATEGORY_MASK_EMACS_MULE)


/* Unions of the masks of related categories.  */

/* The two 7-bit ISO categories.  */
#define CATEGORY_MASK_ISO_7BIT \
  (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)

/* The two 8-bit ISO categories.  */
#define CATEGORY_MASK_ISO_8BIT \
  (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)

/* The two "else" ISO categories.  */
#define CATEGORY_MASK_ISO_ELSE \
  (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)

/* All ISO categories except ISO_8_1 and ISO_8_2.  Presumably those in
   which escape sequences may appear -- confirm.  */
#define CATEGORY_MASK_ISO_ESCAPE        \
  (CATEGORY_MASK_ISO_7                  \
   | CATEGORY_MASK_ISO_7_TIGHT          \
   | CATEGORY_MASK_ISO_7_ELSE           \
   | CATEGORY_MASK_ISO_8_ELSE)

/* All ISO categories.  */
#define CATEGORY_MASK_ISO       \
  (  CATEGORY_MASK_ISO_7BIT     \
     | CATEGORY_MASK_ISO_8BIT   \
     | CATEGORY_MASK_ISO_ELSE)

/* All UTF-16 categories.  */
#define CATEGORY_MASK_UTF_16            \
  (CATEGORY_MASK_UTF_16_AUTO            \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG)

/* All UTF-8 categories.  */
#define CATEGORY_MASK_UTF_8     \
  (CATEGORY_MASK_UTF_8_AUTO     \
   | CATEGORY_MASK_UTF_8_NOSIG  \
   | CATEGORY_MASK_UTF_8_SIG)
 633
/* Table of coding categories (Lisp symbols).  This variable is for
   internal use only.  */
static Lisp_Object Vcoding_category_table;

/* Table of coding-categories ordered by priority.  It has one slot
   per value of `enum coding_category' below coding_category_max.  */
static enum coding_category coding_priorities[coding_category_max];

/* Nth element is a coding context for the coding system bound to the
   Nth coding category.  N is a value of `enum coding_category'.  */
static struct coding_system coding_categories[coding_category_max];
 644
 645 /*** Commonly used macros and functions ***/
 646
/* The smaller and the larger of A and B.  Defined only if not already
   provided elsewhere.  Each argument is evaluated twice, so do not
   pass expressions with side effects.  */
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
 653
 654 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 655
 656 static int
 657 encode_inhibit_flag (Lisp_Object flag)
 658 {
 659   return NILP (flag) ? -1 : EQ (flag, Qt);
 660 }
 661
 662 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 663    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 664
 665 static bool
 666 inhibit_flag (int encoded_flag, bool var)
 667 {
 668   return 0 < encoded_flag + var;
 669 }
 670
/* Set ATTRS to the attribute vector of the coding system CODING->id
   and CHARSET_LIST to the charset list taken from those attributes.
   ATTRS and CHARSET_LIST must be assignable lvalues.  */
#define CODING_GET_INFO(coding, attrs, charset_list)    \
  do {                                                  \
    (attrs) = CODING_ID_ATTRS ((coding)->id);           \
    (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  } while (0)
 676
 677 static void
 678 CHECK_NATNUM_CAR (Lisp_Object x)
 679 {
 680   Lisp_Object tmp = XCAR (x);
 681   CHECK_NATNUM (tmp);
 682   XSETCAR (x, tmp);
 683 }
 684
 685 static void
 686 CHECK_NATNUM_CDR (Lisp_Object x)
 687 {
 688   Lisp_Object tmp = XCDR (x);
 689   CHECK_NATNUM (tmp);
 690   XSETCDR (x, tmp);
 691 }
 692
 693
/* Get one byte from the source text pointed by SRC which ends at
   SRC_END, set C to that byte, advance SRC, and increment
   CONSUMED_CHARS.

   If SRC has already reached SRC_END, jump to the label
   'no_more_source'; before jumping, record
   CODING_RESULT_INSUFFICIENT_SRC in CODING if SRC_BASE < SRC, i.e. if
   some bytes were read since SRC_BASE was set.

   If MULTIBYTEP and the byte has its high bit set:
     - if the byte is 0xC0 or 0xC1, consume the next byte too and set
       C to ((C & 1) << 6) | next-byte (presumably this recovers a raw
       8-bit byte from its 2-byte form -- confirm).  The next byte is
       read without comparing SRC with SRC_END;
     - otherwise read a whole multibyte character with string_char,
       set C to the negative value of the character code, and record
       CODING_RESULT_INVALID_SRC in CODING.

   The caller should declare and set these variables appropriately in
   advance:
        coding, src, src_base, src_end, multibytep, consumed_chars
   and must provide the label 'no_more_source'.  */

#define ONE_MORE_BYTE(c)                                \
  do {                                                  \
    if (src == src_end)                                 \
      {                                                 \
        if (src_base < src)                             \
          record_conversion_result                      \
            (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
        goto no_more_source;                            \
      }                                                 \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
 726
/* Get two bytes from the source text pointed by SRC which ends at
   SRC_END, and set C1 and C2 to those bytes while skipping the
   heading multibyte characters.  If SRC reaches SRC_END before a
   byte can be fetched, jump to 'no_more_source'.

   If MULTIBYTEP, a 2-byte sequence whose first byte is 0xC0 or 0xC1
   is combined into one value as ((byte & 1) << 6) | next-byte, for
   both C1 and C2.  Any other multibyte character found for C1 is
   skipped (SRC is advanced by BYTES_BY_CHAR_HEAD) and the fetch is
   retried.  Any other multibyte character found for C2 sets C2 to -1;
   in that case SRC is left just after the character's first byte.

   Unlike ONE_MORE_BYTE, this macro neither records a conversion
   result nor counts consumed characters.  The caller should declare
   and set these variables appropriately in advance:
        src, src_end, multibytep
   and must provide the label 'no_more_source'.
   It is intended that this macro is used in detect_coding_utf_16.  */

#define TWO_MORE_BYTES(c1, c2)                          \
  do {                                                  \
    do {                                                \
      if (src == src_end)                               \
        goto no_more_source;                            \
      c1 = *src++;                                      \
      if (multibytep && (c1 & 0x80))                    \
        {                                               \
          if ((c1 & 0xFE) == 0xC0)                      \
            c1 = ((c1 & 1) << 6) | *src++;              \
          else                                          \
            {                                           \
              src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
              c1 = -1;                                  \
            }                                           \
        }                                               \
    } while (c1 < 0);                                   \
    if (src == src_end)                                 \
      goto no_more_source;                              \
    c2 = *src++;                                        \
    if (multibytep && (c2 & 0x80))                      \
      {                                                 \
        if ((c2 & 0xFE) == 0xC0)                        \
          c2 = ((c2 & 1) << 6) | *src++;                \
        else                                            \
          c2 = -1;                                      \
      }                                                 \
  } while (0)
 765
 766
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  The caller should
   assure that C is 0..127, and declare and set the variables `dst'
   and `produced_chars' appropriately in advance.  No check is made
   that there is room at DST.
*/


#define EMIT_ONE_ASCII_BYTE(c)  \
  do {                          \
    produced_chars++;           \
    *dst++ = (c);               \
  } while (0)
 779
 780
 781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 782
 783 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 784   do {                                  \
 785     produced_chars += 2;                \
 786     *dst++ = (c1), *dst++ = (c2);       \
 787   } while (0)
 788
 789
 790 /* Store a byte C in the place pointed by DST and increment DST to the
 791    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 792    store in an appropriate multibyte form.  The caller should
 793    declare and set the variables `dst' and `multibytep' appropriately
 794    in advance.  */
 795
 796 #define EMIT_ONE_BYTE(c)                \
 797   do {                                  \
 798     produced_chars++;                   \
 799     if (multibytep)                     \
 800       {                                 \
 801         unsigned ch = (c);              \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       *dst++ = (c);                     \
 808   } while (0)
 809
 810
 811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 812
 813 #define EMIT_TWO_BYTES(c1, c2)          \
 814   do {                                  \
 815     produced_chars += 2;                \
 816     if (multibytep)                     \
 817       {                                 \
 818         unsigned ch;                    \
 819                                         \
 820         ch = (c1);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824         ch = (c2);                      \
 825         if (ch >= 0x80)                 \
 826           ch = BYTE8_TO_CHAR (ch);      \
 827         CHAR_STRING_ADVANCE (ch, dst);  \
 828       }                                 \
 829     else                                \
 830       {                                 \
 831         *dst++ = (c1);                  \
 832         *dst++ = (c2);                  \
 833       }                                 \
 834   } while (0)
 835
 836
 837 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 838   do {                                  \
 839     EMIT_ONE_BYTE (c1);                 \
 840     EMIT_TWO_BYTES (c2, c3);            \
 841   } while (0)
 842
 843
 844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 845   do {                                          \
 846     EMIT_TWO_BYTES (c1, c2);                    \
 847     EMIT_TWO_BYTES (c3, c4);                    \
 848   } while (0)
 849
 850
 851 static void
 852 record_conversion_result (struct coding_system *coding,
 853                           enum coding_result_code result)
 854 {
 855   coding->result = result;
 856   switch (result)
 857     {
 858     case CODING_RESULT_INSUFFICIENT_SRC:
 859       Vlast_code_conversion_error = Qinsufficient_source;
 860       break;
 861     case CODING_RESULT_INVALID_SRC:
 862       Vlast_code_conversion_error = Qinvalid_source;
 863       break;
 864     case CODING_RESULT_INTERRUPT:
 865       Vlast_code_conversion_error = Qinterrupted;
 866       break;
 867     case CODING_RESULT_INSUFFICIENT_DST:
 868       /* Don't record this error in Vlast_code_conversion_error
 869          because it happens just temporarily and is resolved when the
 870          whole conversion is finished.  */
 871       break;
 872     case CODING_RESULT_SUCCESS:
 873       break;
 874     default:
 875       Vlast_code_conversion_error = intern ("Unknown error");
 876     }
 877 }
 878
 879 /* These wrapper macros are used to preserve validity of pointers into
 880    buffer text across calls to decode_char, encode_char, etc, which
 881    could cause relocation of buffers if it loads a charset map,
 882    because loading a charset map allocates large structures.  */
 883
 884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 885   do {                                                                       \
 886     ptrdiff_t offset;                                                        \
 887                                                                              \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded                                                   \
 891         && (offset = coding_change_source (coding)))                         \
 892       {                                                                      \
 893         src += offset;                                                       \
 894         src_base += offset;                                                  \
 895         src_end += offset;                                                   \
 896       }                                                                      \
 897   } while (0)
 898
 899 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 900   do {                                                                  \
 901     ptrdiff_t offset;                                                   \
 902                                                                         \
 903     charset_map_loaded = 0;                                             \
 904     code = ENCODE_CHAR (charset, c);                                    \
 905     if (charset_map_loaded                                              \
 906         && (offset = coding_change_destination (coding)))               \
 907       {                                                                 \
 908         dst += offset;                                                  \
 909         dst_end += offset;                                              \
 910       }                                                                 \
 911   } while (0)
 912
 913 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 914   do {                                                                  \
 915     ptrdiff_t offset;                                                   \
 916                                                                         \
 917     charset_map_loaded = 0;                                             \
 918     charset = char_charset (c, charset_list, code_return);              \
 919     if (charset_map_loaded                                              \
 920         && (offset = coding_change_destination (coding)))               \
 921       {                                                                 \
 922         dst += offset;                                                  \
 923         dst_end += offset;                                              \
 924       }                                                                 \
 925   } while (0)
 926
 927 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 928   do {                                                                  \
 929     ptrdiff_t offset;                                                   \
 930                                                                         \
 931     charset_map_loaded = 0;                                             \
 932     result = CHAR_CHARSET_P (c, charset);                               \
 933     if (charset_map_loaded                                              \
 934         && (offset = coding_change_destination (coding)))               \
 935       {                                                                 \
 936         dst += offset;                                                  \
 937         dst_end += offset;                                              \
 938       }                                                                 \
 939   } while (0)
 940
 941
 942 /* If there are at least BYTES length of room at dst, allocate memory
 943    for coding->destination and update dst and dst_end.  We don't have
 944    to take care of coding->source which will be relocated.  It is
 945    handled by calling coding_set_source in encode_coding.  */
 946
 947 #define ASSURE_DESTINATION(bytes)                               \
 948   do {                                                          \
 949     if (dst + (bytes) >= dst_end)                               \
 950       {                                                         \
 951         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 952                                                                 \
 953         dst = alloc_destination (coding, more_bytes, dst);      \
 954         dst_end = coding->destination + coding->dst_bytes;      \
 955       }                                                         \
 956   } while (0)
 957
 958
 959 /* Store multibyte form of the character C in P, and advance P to the
 960    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 961    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 962    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 963
 964 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 965
 966 /* Return the character code of character whose multibyte form is at
 967    P, and advance P to the end of the multibyte form.  This used to be
 968    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 969    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 970
 971 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 972
 973 /* Set coding->source from coding->src_object.  */
 974
 975 static void
 976 coding_set_source (struct coding_system *coding)
 977 {
 978   if (BUFFERP (coding->src_object))
 979     {
 980       struct buffer *buf = XBUFFER (coding->src_object);
 981
 982       if (coding->src_pos < 0)
 983         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 984       else
 985         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 986     }
 987   else if (STRINGP (coding->src_object))
 988     {
 989       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 990     }
 991   else
 992     {
 993       /* Otherwise, the source is C string and is never relocated
 994          automatically.  Thus we don't have to update anything.  */
 995     }
 996 }
 997
 998
 999 /* Set coding->source from coding->src_object, and return how many
1000    bytes coding->source was changed.  */
1001
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1004 {
1005   const unsigned char *orig = coding->source;
1006   coding_set_source (coding);
1007   return coding->source - orig;
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object.  */
1012
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1015 {
1016   if (BUFFERP (coding->dst_object))
1017     {
1018       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019         {
1020           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021           coding->dst_bytes = (GAP_END_ADDR
1022                                - (coding->src_bytes - coding->consumed)
1023                                - coding->destination);
1024         }
1025       else
1026         {
1027           /* We are sure that coding->dst_pos_byte is before the gap
1028              of the buffer. */
1029           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030                                  + coding->dst_pos_byte - BEG_BYTE);
1031           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032                                - coding->destination);
1033         }
1034     }
1035   else
1036     {
1037       /* Otherwise, the destination is C string and is never relocated
1038          automatically.  Thus we don't have to update anything.  */
1039     }
1040 }
1041
1042
1043 /* Set coding->destination from coding->dst_object, and return how
1044    many bytes coding->destination was changed.  */
1045
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1048 {
1049   const unsigned char *orig = coding->destination;
1050   coding_set_destination (coding);
1051   return coding->destination - orig;
1052 }
1053
1054
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 {
1058   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059     string_overflow ();
1060   coding->destination = xrealloc (coding->destination,
1061                                   coding->dst_bytes + bytes);
1062   coding->dst_bytes += bytes;
1063 }
1064
1065 static void
1066 coding_alloc_by_making_gap (struct coding_system *coding,
1067                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1068 {
1069   if (EQ (coding->src_object, coding->dst_object))
1070     {
1071       /* The gap may contain the produced data at the head and not-yet
1072          consumed data at the tail.  To preserve those data, we at
1073          first make the gap size to zero, then increase the gap
1074          size.  */
1075       ptrdiff_t add = GAP_SIZE;
1076
1077       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1078       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1079       make_gap (bytes);
1080       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1081       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1082     }
1083   else
1084     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1085 }
1086
1087
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090                    unsigned char *dst)
1091 {
1092   ptrdiff_t offset = dst - coding->destination;
1093
1094   if (BUFFERP (coding->dst_object))
1095     {
1096       struct buffer *buf = XBUFFER (coding->dst_object);
1097
1098       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099     }
1100   else
1101     coding_alloc_by_realloc (coding, nbytes);
1102   coding_set_destination (coding);
1103   dst = coding->destination + offset;
1104   return dst;
1105 }
1106
1107 /** Macros for annotations.  */
1108
1109 /* An annotation data is stored in the array coding->charbuf in this
1110    format:
1111      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1112    LENGTH is the number of elements in the annotation.
1113    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1114    NCHARS is the number of characters in the text annotated.
1115
   The format of the following elements depends on ANNOTATION_MASK.
1117
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1120      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1121
1122    NBYTES is the number of bytes specified in the header part of
1123    old-style emacs-mule encoding, or 0 for the other kind of
1124    composition.
1125
1126    METHOD is one of enum composition_method.
1127
1128    Optional COMPOSITION-COMPONENTS are characters and composition
1129    rules.
1130
1131    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1132    follows.
1133
   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1137
/* Maximum length of the header of annotation data, counted in
   charbuf elements.  The longest header written by the macros below
   is the composition one (ADD_COMPOSITION_DATA), which stores five
   elements: -LENGTH, ANNOTATION_MASK, NCHARS, NBYTES and METHOD.  */
#define MAX_ANNOTATION_LENGTH 5
1140
/* Store the three common header elements of an annotation at BUF
   (-LEN, MASK and NCHARS, in that order), advance BUF past them, and
   mark the coding system as annotated.  The caller must have a
   variable `coding' pointing to the coding system in scope.

   The definition deliberately does not end with a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' expands to exactly one statement and
   can be used as the body of an unbraced `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1148
/* Store a composition annotation header at BUF and advance BUF past
   it: the common header (with LENGTH 5 and
   CODING_ANNOTATE_COMPOSITION_MASK) followed by NBYTES and METHOD.
   Every argument is parenthesized in the expansion so that BUF may be
   any lvalue expression, as it may for ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1155
1156
/* Store a charset annotation header at BUF and advance BUF past it:
   the common header (with LENGTH 4 and CODING_ANNOTATE_CHARSET_MASK)
   followed by the charset ID.  Every argument is parenthesized in the
   expansion so that BUF may be any lvalue expression, as it may for
   ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1162
1163
/* Bitmasks for coding->eol_seen.  Each end-of-line style has its own
   bit, so the styles met in one text can be accumulated by OR-ing
   the masks together.  */

#define EOL_SEEN_NONE   0       /* No bit set.  */
#define EOL_SEEN_LF     1       /* A LF ('\n') was seen.  */
#define EOL_SEEN_CR     2       /* A CR ('\r') not followed by LF.  */
#define EOL_SEEN_CRLF   4       /* A CR immediately followed by LF.  */
1170
1171 \f
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1173
1174
1175
1176 \f
1177 /*** 3. UTF-8 ***/
1178
1179 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1180    Return true if a text is encoded in UTF-8.  */
1181
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits.  The bit pattern each one accepts is shown on the right.
   Note that UTF_8_1_OCTET_P is also true for a negative C, so callers
   that can see negative values test for them first.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)             /* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)   /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)   /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)   /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)   /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)   /* 111110xx */

/* The three octets of the UTF-8 byte order mark (signature).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1192
/* Unlike the other detect_coding_XXX, this function counts the number
   of characters and checks the EOL format.

   Scan the source of CODING for well-formed UTF-8 sequences of one to
   five octets.  If a malformed sequence is met, or if the last block
   ends in the middle of a sequence, add CATEGORY_MASK_UTF_8 to
   DETECT_INFO->rejected and return 0.  Otherwise, once the source is
   exhausted, update DETECT_INFO->found and DETECT_INFO->rejected
   according to whether a BOM or any multi-octet sequence was seen,
   store the character count in CODING->detected_utf8_chars, and
   return 1.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  /* Not read directly in this function; presumably maintained by
     ONE_MORE_BYTE, whose definition is elsewhere in this file.  */
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  /* Start counting from the ASCII prefix that was already scanned.  */
  int nchars = coding->head_ascii;
  /* NOTE(review): eol_seen is accumulated below but is never written
     back to CODING within this function.  */
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += nchars;

  if (src == coding->source     /* BOM should be at the head.  */
      && src + 3 < src_end      /* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      /* The BOM counts as one character.  */
      nchars++;
    }

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that a truncated
         sequence can be recognized at `no_more_source'.  */
      src_base = src;
      /* ONE_MORE_BYTE is defined elsewhere in this file; presumably it
         jumps to `no_more_source' when the source is exhausted.  */
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        {
          nchars++;
          if (c == '\r')
            {
              if (src < src_end && *src == '\n')
                {
                  /* Consume the LF of a CR LF pair as well.  */
                  eol_seen |= EOL_SEEN_CRLF;
                  src++;
                  nchars++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
          else if (c == '\n')
            eol_seen |= EOL_SEEN_LF;
          continue;
        }
      /* C is not a one-octet character.  Read trailing octets one at a
         time; after each one, accept the sequence if C is the leading
         octet for the length read so far.  Leaving the loop with
         `break' rejects the text.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      break;
    }
  /* A malformed sequence was found.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ran out.  If that happened in the middle of a sequence
     and no more data will follow, the text is not valid UTF-8.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
        /* Fewer characters than source bytes were counted, which
           means that at least one valid non-ASCII character was
           found.  */
        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  coding->detected_utf8_chars = nchars;
  return 1;
}
1306
1307
/* Decode the UTF-8 bytes of CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf.
   An invalid or overlong sequence is not dropped: its first byte is
   stored as a single character (as is if ASCII, through BYTE8_TO_CHAR
   otherwise), CODING->errors is incremented, and decoding resumes at
   the following byte.  On return CODING->consumed, CODING->consumed_char
   and CODING->charbuf_used describe the progress made; an incomplete
   sequence at the end of the source is left unconsumed.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the character being decoded; this is what gets reported
     as consumed when decoding stops.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte that was read ahead after a CR, or -1 if there is none.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Look for a BOM at the current position.  If the next three
         bytes are not exactly the BOM, rewind SRC so that they are
         decoded as ordinary text; otherwise leave SRC after them.
         ONE_MORE_BYTE is defined elsewhere in this file; presumably it
         jumps to `no_more_source' when the source is exhausted.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever the outcome, do not look for a BOM again on later calls
     with this coding system.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left in charbuf.  A byte that was read ahead after
             a CR has not been decoded yet, so do not report it as
             consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* Copy ASCII bytes straight into charbuf, four per
             iteration, stopping at the first byte with the high bit
             set or when fewer than six slots or bytes remain.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* Take the byte read ahead after a CR, if any, before reading a
         new one.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is stored negated.  Presumably
             ONE_MORE_BYTE returns the negated code of a multibyte
             character that is not a raw byte; confirm against its
             definition.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-line, make sure the byte following a CR is
             available before the CR itself is stored.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          /* Multi-octet sequence: read trailing octets one at a time
             and assemble the code once C1 matches the leading-octet
             pattern for the length read so far.  */
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and store just its
         first byte; the bytes after it are decoded afresh on the
         following iterations.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report progress up to the start of the last character that was
     not completely decoded.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1496
1497
1498 static bool
1499 encode_coding_utf_8 (struct coding_system *coding)
1500 {
1501   bool multibytep = coding->dst_multibyte;
1502   int *charbuf = coding->charbuf;
1503   int *charbuf_end = charbuf + coding->charbuf_used;
1504   unsigned char *dst = coding->destination + coding->produced;
1505   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1506   ptrdiff_t produced_chars = 0;
1507   int c;
1508
1509   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1510     {
1511       ASSURE_DESTINATION (3);
1512       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1513       CODING_UTF_8_BOM (coding) = utf_without_bom;
1514     }
1515
1516   if (multibytep)
1517     {
1518       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1519
1520       while (charbuf < charbuf_end)
1521         {
1522           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1523
1524           ASSURE_DESTINATION (safe_room);
1525           c = *charbuf++;
1526           if (CHAR_BYTE8_P (c))
1527             {
1528               c = CHAR_TO_BYTE8 (c);
1529               EMIT_ONE_BYTE (c);
1530             }
1531           else
1532             {
1533               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1534               for (p = str; p < pend; p++)
1535                 EMIT_ONE_BYTE (*p);
1536             }
1537         }
1538     }
1539   else
1540     {
1541       int safe_room = MAX_MULTIBYTE_LENGTH;
1542
1543       while (charbuf < charbuf_end)
1544         {
1545           ASSURE_DESTINATION (safe_room);
1546           c = *charbuf++;
1547           if (CHAR_BYTE8_P (c))
1548             *dst++ = CHAR_TO_BYTE8 (c);
1549           else
1550             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1551           produced_chars++;
1552         }
1553     }
1554   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1555   coding->produced_char += produced_chars;
1556   coding->produced = dst - coding->destination;
1557   return 0;
1558 }
1559
1560
1561 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1562    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1563
/* True if the top six of the low sixteen bits of VAL are 110110,
   i.e. those bits lie in 0xD800..0xDBFF (a high surrogate).  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the top six of the low sixteen bits of VAL are 110111,
   i.e. those bits lie in 0xDC00..0xDFFF (a low surrogate).  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1569
1570
/* Check whether the source of CODING may be UTF-16 and record the
   verdict in DETECT_INFO.  A leading FF FE or FE FF pair is taken as
   a byte order mark and selects the little- or big-endian categories
   respectively.  Without one, the signature-less categories are
   judged by how many distinct byte values occur at even and at odd
   positions.  Return 1 if a BOM was found or if the source has fewer
   than two bytes to inspect; return 0 in all other cases.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A complete text with an odd coding->src_chars cannot be a
     sequence of two-byte units.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  /* Jumps to `no_more_source' if two bytes are not available.  */
  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      /* The second unit is a multibyte character, not a byte.  */
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of the bytes at even positions (E)
         and at odd positions (O).  If both are high, we assume binary
         data.  */
      /* e[B] / o[B] is 1 once byte value B has been seen at an even /
         odd position; e_num / o_num count the distinct values.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No BOM, so the categories that require one are out.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Keep scanning until every UTF-16 category has been rejected,
         a non-byte unit is met, or the source runs out (in which case
         TWO_MORE_BYTES jumps to `no_more_source').  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 or more distinct values at even positions.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* 128 or more distinct values at odd positions.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1653
     /* Decode UTF-16 bytes from CODING's source, starting at offset
        CODING->consumed, into CODING->charbuf starting at index
        CODING->charbuf_used.  The byte order is taken from
        CODING_UTF_16_ENDIAN (coding); a pending high surrogate is read from
        and written back to CODING_UTF_16_SURROGATE (coding).

        Decoding stops when charbuf is nearly full or at the
        `no_more_source' label (presumably the target of ONE_MORE_BYTE when
        the input runs out; that macro is defined elsewhere -- confirm).
        On exit CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  CODING->consumed is computed from
        SRC_BASE, the position at the start of the latest loop iteration, so
        a unit that was only partially read is not counted as consumed.  */
1654 static void
1655 decode_coding_utf_16 (struct coding_system *coding)
1656 {
1657   const unsigned char *src = coding->source + coding->consumed;
1658   const unsigned char *src_end = coding->source + coding->src_bytes;
1659   const unsigned char *src_base;
1660   int *charbuf = coding->charbuf + coding->charbuf_used;
1661   /* We may produce at most 3 chars in one loop.  */
1662   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1663   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1664   bool multibytep = coding->src_multibyte;
1665   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1666   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1667   int surrogate = CODING_UTF_16_SURROGATE (coding);
1668   bool eol_dos
1669     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR under DOS EOL conversion, or
          -1 when nothing is pending.  */
1670   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1671
1672   if (bom == utf_with_bom)
1673     {
1674       int c, c1, c2;
1675
           /* A BOM is expected: read two bytes, combine them with the
              first byte as the high byte, and compare the result with the
              value expected for ENDIAN.  */
1676       src_base = src;
1677       ONE_MORE_BYTE (c1);
1678       ONE_MORE_BYTE (c2);
1679       c = (c1 << 8) | c2;
1680
1681       if (endian == utf_16_big_endian
1682           ? c != 0xFEFF : c != 0xFFFE)
1683         {
1684           /* The first two bytes are not BOM.  Treat them as bytes
1685              for a normal character.  */
1686           src = src_base;
1687           coding->errors++;
1688         }
           /* The BOM check is done only once per coding system.  */
1689       CODING_UTF_16_BOM (coding) = utf_without_bom;
1690     }
1691   else if (bom == utf_detect_bom)
1692     {
1693       /* We have already tried to detect BOM and failed in
1694          detect_coding.  */
1695       CODING_UTF_16_BOM (coding) = utf_without_bom;
1696     }
1697
1698   while (1)
1699     {
1700       int c, c1, c2;
1701
1702       src_base = src;
1703       consumed_chars_base = consumed_chars;
1704
1705       if (charbuf >= charbuf_end)
1706         {
               /* The two bytes read ahead after a CR have not been decoded
                  yet; back up so that they are not reported as consumed.  */
1707           if (byte_after_cr1 >= 0)
1708             src_base -= 2;
1709           break;
1710         }
1711
1712       if (byte_after_cr1 >= 0)
1713         c1 = byte_after_cr1, byte_after_cr1 = -1;
1714       else
1715         ONE_MORE_BYTE (c1);
           /* A negative value is stored negated as a single element.  */
1716       if (c1 < 0)
1717         {
1718           *charbuf++ = -c1;
1719           continue;
1720         }
1721       if (byte_after_cr2 >= 0)
1722         c2 = byte_after_cr2, byte_after_cr2 = -1;
1723       else
1724         ONE_MORE_BYTE (c2);
           /* A negative second value: emit the first byte on its own (as
              ASCII or as an eight-bit character), then the negated C2.  */
1725       if (c2 < 0)
1726         {
1727           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1728           *charbuf++ = -c2;
1729           continue;
1730         }
1731       c = (endian == utf_16_big_endian
1732            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1733
           /* A nonzero SURROGATE is a pending high surrogate.  */
1734       if (surrogate)
1735         {
1736           if (! UTF_16_LOW_SURROGATE_P (c))
1737             {
                   /* The pending high surrogate is not followed by a low
                      one: emit its two bytes, in source byte order, as
                      separate elements and count an error.  */
1738               if (endian == utf_16_big_endian)
1739                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1740               else
1741                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1742               *charbuf++ = c1;
1743               *charbuf++ = c2;
1744               coding->errors++;
1745               if (UTF_16_HIGH_SURROGATE_P (c))
1746                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1747               else
1748                 *charbuf++ = c;
1749             }
1750           else
1751             {
                   /* Combine the surrogate pair into one code point at or
                      above 0x10000 and clear the pending state.  */
1752               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1753               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1754               *charbuf++ = 0x10000 + c;
1755             }
1756         }
1757       else
1758         {
1759           if (UTF_16_HIGH_SURROGATE_P (c))
1760             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1761           else
1762             {
                   /* With DOS EOL conversion, read ahead the two bytes that
                      follow a CR; they are used as C1 and C2 at the top of
                      the next iteration.  */
1763               if (eol_dos && c == '\r')
1764                 {
1765                   ONE_MORE_BYTE (byte_after_cr1);
1766                   ONE_MORE_BYTE (byte_after_cr2);
1767                 }
1768               *charbuf++ = c;
1769             }
1770         }
1771     }
1772
1773  no_more_source:
1774   coding->consumed_char += consumed_chars_base;
1775   coding->consumed = src_base - coding->source;
1776   coding->charbuf_used = charbuf - coding->charbuf;
1777 }
1778
     /* Encode the CODING->charbuf_used characters in CODING->charbuf as
        UTF-16 into CODING's destination, appending at offset
        CODING->produced.  The byte order is taken from
        CODING_UTF_16_ENDIAN (coding).

        Unless the BOM setting is utf_without_bom, a BOM is written first
        and the setting is changed to utf_without_bom.  Characters above
        MAX_UNICODE_CHAR are replaced by CODING->default_char; characters
        at or above 0x10000 are written as a surrogate pair.

        DST and PRODUCED_CHARS are not modified directly in this function;
        presumably the EMIT_* macros advance them and ASSURE_DESTINATION
        makes sure SAFE_ROOM bytes are available (those macros are defined
        elsewhere -- confirm).  The visible code always returns 0.  */
1779 static bool
1780 encode_coding_utf_16 (struct coding_system *coding)
1781 {
1782   bool multibytep = coding->dst_multibyte;
1783   int *charbuf = coding->charbuf;
1784   int *charbuf_end = charbuf + coding->charbuf_used;
1785   unsigned char *dst = coding->destination + coding->produced;
1786   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1787   int safe_room = 8;
1788   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1789   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1790   ptrdiff_t produced_chars = 0;
1791   int c;
1792
1793   if (bom != utf_without_bom)
1794     {
1795       ASSURE_DESTINATION (safe_room);
1796       if (big_endian)
1797         EMIT_TWO_BYTES (0xFE, 0xFF);
1798       else
1799         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Make sure the BOM is written only once.  */
1800       CODING_UTF_16_BOM (coding) = utf_without_bom;
1801     }
1802
1803   while (charbuf < charbuf_end)
1804     {
1805       ASSURE_DESTINATION (safe_room);
1806       c = *charbuf++;
1807       if (c > MAX_UNICODE_CHAR)
1808         c = coding->default_char;
1809
1810       if (c < 0x10000)
1811         {
               /* One 16-bit unit, written in the selected byte order.  */
1812           if (big_endian)
1813             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1814           else
1815             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1816         }
1817       else
1818         {
1819           int c1, c2;
1820
               /* Split into a surrogate pair: C1 carries the bits above the
                  low ten of (C - 0x10000), C2 the low ten bits.  */
1821           c -= 0x10000;
1822           c1 = (c >> 10) + 0xD800;
1823           c2 = (c & 0x3FF) + 0xDC00;
1824           if (big_endian)
1825             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1826           else
1827             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1828         }
1829     }
1830   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1831   coding->produced = dst - coding->destination;
1832   coding->produced_char += produced_chars;
1833   return 0;
1834 }
1835
1836 \f
1837 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1838
1839 /* Emacs' internal format for representation of multiple character
1840    sets is a kind of multi-byte encoding, i.e. characters are
1841    represented by variable-length sequences of one-byte codes.
1842
1843    ASCII characters and control characters (e.g. `tab', `newline') are
1844    represented by one-byte sequences which are their ASCII codes, in
1845    the range 0x00 through 0x7F.
1846
1847    8-bit characters of the range 0x80..0x9F are represented by
1848    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1849    code + 0x20).
1850
1851    8-bit characters of the range 0xA0..0xFF are represented by
1852    one-byte sequences which are their 8-bit code.
1853
1854    The other characters are represented by a sequence of `base
1855    leading-code', optional `extended leading-code', and one or two
1856    `position-code's.  The length of the sequence is determined by the
1857    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1858    whereas extended leading-code and position-code take the range 0xA0
1859    through 0xFF.  See `charset.h' for more details about leading-code
1860    and position-code.
1861
1862    --- CODE RANGE of Emacs' internal format ---
1863    character set        range
1864    -------------        -----
1865    ascii                0x00..0x7F
1866    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1867    eight-bit-graphic    0xA0..0xFF
1868    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1869    ---------------------------------------------
1870
1871    As this is the internal character representation, the format is
1872    usually not used externally (i.e. in a file or in a data sent to a
1873    process).  But, it is possible to have a text externally in this
1874    format (i.e. by encoding by the coding system `emacs-mule').
1875
1876    In that case, a sequence of one-byte codes has a slightly different
1877    form.
1878
1879    At first, all characters in eight-bit-control are represented by
1880    one-byte sequences which are their 8-bit code.
1881
1882    Next, character composition data are represented by the byte
1883    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1884    where,
1885         METHOD is 0xF2 plus one of composition method (enum
1886         composition_method),
1887
1888         BYTES is 0xA0 plus a byte length of this composition data,
1889
1890         CHARS is 0xA0 plus a number of characters composed by this
1891         data,
1892
1893         COMPONENTs are characters of multibyte form or composition
1894         rules encoded by two-byte of ASCII codes.
1895
1896    In addition, for backward compatibility, the following formats are
1897    also recognized as composition data on decoding.
1898
1899    0x80 MSEQ ...
1900    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1901
1902    Here,
1903         MSEQ is a multibyte form but in one of these special formats:
1904           ASCII: 0xA0 ASCII_CODE+0x80,
1905           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1906         RULE is a one byte code of the range 0xA0..0xF0 that
1907         represents a composition rule.
1908   */
1909
     /* Table indexed by a byte value.  The code in this file reads
        emacs_mule_bytes[C] as the total byte length (1 through 4) of the
        emacs-mule sequence that starts with byte C.  It is not filled in
        within this part of the file.  */
1910 char emacs_mule_bytes[256];
1911
1912
1913 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1914    Return true if a text is encoded in 'emacs-mule'.

        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        Scanning starts after the first CODING->head_ascii bytes.  The
        category is rejected (return 0) when the scan loop is left by
        `break', or when the source ends in the middle of a sequence and
        this is the last block.  Otherwise, when the source is exhausted
        (`no_more_source', presumably reached from ONE_MORE_BYTE, which is
        defined elsewhere -- confirm), return 1 and add the category to
        DETECT_INFO->found if at least one multibyte or composition
        sequence was seen.  */
1915
1916 static bool
1917 detect_coding_emacs_mule (struct coding_system *coding,
1918                           struct coding_detection_info *detect_info)
1919 {
1920   const unsigned char *src = coding->source, *src_base;
1921   const unsigned char *src_end = coding->source + coding->src_bytes;
1922   bool multibytep = coding->src_multibyte;
1923   ptrdiff_t consumed_chars = 0;
1924   int c;
1925   int found = 0;
1926
1927   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1928   /* A coding system of this category is always ASCII compatible.  */
1929   src += coding->head_ascii;
1930
1931   while (1)
1932     {
1933       src_base = src;
1934       ONE_MORE_BYTE (c);
1935       if (c < 0)
1936         continue;
1937       if (c == 0x80)
1938         {
1939           /* Perhaps the start of composite character.  We simply skip
1940              it because analyzing it is too heavy for detecting.  But,
1941              at least, we check that the composite character
1942              consists of more than 4 bytes.  */
1943           const unsigned char *src_start;
1944
1945         repeat:
1946           src_start = src;
               /* Skip every byte >= 0xA0; C ends up holding the first byte
                  below 0xA0, which is examined after this block.  */
1947           do
1948             {
1949               ONE_MORE_BYTE (c);
1950             }
1951           while (c >= 0xA0);
1952
1953           if (src - src_start <= 4)
1954             break;
1955           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition starts right after this one.  */
1956           if (c == 0x80)
1957             goto repeat;
1958         }
1959
1960       if (c < 0x80)
1961         {
               /* ESC, SI and SO reject this category.  */
1962           if (c < 0x20
1963               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1964             break;
1965         }
1966       else
1967         {
               /* C is a leading byte; emacs_mule_bytes[C] - 1 more bytes,
                  each >= 0xA0, must follow.  */
1968           int more_bytes = emacs_mule_bytes[c] - 1;
1969
1970           while (more_bytes > 0)
1971             {
1972               ONE_MORE_BYTE (c);
1973               if (c < 0xA0)
1974                 {
1975                   src--;        /* Unread the last byte.  */
1976                   break;
1977                 }
1978               more_bytes--;
1979             }
               /* The sequence was cut short by a byte below 0xA0.  */
1980           if (more_bytes != 0)
1981             break;
1982           found = CATEGORY_MASK_EMACS_MULE;
1983         }
1984     }
1985   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1986   return 0;
1987
1988  no_more_source:
       /* SRC_BASE < SRC means the source ended inside a sequence.  */
1989   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1990     {
1991       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1992       return 0;
1993     }
1994   detect_info->found |= found;
1995   return 1;
1996 }
1997
1998
1999 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2000    character.  If CMP_STATUS indicates that we must expect MSEQ or
2001    RULE described above, decode it and return the negative value of
2002    the decoded character or rule.  If an invalid byte is found, return
2003    -1.  If SRC is too short, return -2.

        On a successful return *NBYTES is set to the number of bytes read
        from SRC and *NCHARS to the local consumed_chars count (presumably
        advanced by ONE_MORE_BYTE, which is defined elsewhere -- confirm).
        If ID is non-null, *ID is set to the charset id of the decoded
        character; it is not set when a RULE byte is returned.  None of the
        outputs is written when -1 or -2 is returned.  */
2004
2005 static int
2006 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2007                  int *nbytes, int *nchars, int *id,
2008                  struct composition_status *cmp_status)
2009 {
2010   const unsigned char *src_end = coding->source + coding->src_bytes;
2011   const unsigned char *src_base = src;
2012   bool multibytep = coding->src_multibyte;
2013   int charset_ID;
2014   unsigned code;
2015   int c;
2016   int consumed_chars = 0;
2017   bool mseq_found = 0;
2018
2019   ONE_MORE_BYTE (c);
2020   if (c < 0)
2021     {
           /* A negative value is returned negated as the character, with
              the charset id stored at index 0 of emacs_mule_charset.  */
2022       c = -c;
2023       charset_ID = emacs_mule_charset[0];
2024     }
2025   else
2026     {
           /* A first byte >= 0xA0 is accepted only while an old-form
              composition is being read.  */
2027       if (c >= 0xA0)
2028         {
2029           if (cmp_status->state != COMPOSING_NO
2030               && cmp_status->old_form)
2031             {
2032               if (cmp_status->state == COMPOSING_CHAR)
2033                 {
                       /* MSEQ: ASCII is 0xA0 followed by ASCII_CODE + 0x80;
                          any other character has 0x20 added to its leading
                          code.  */
2034                   if (c == 0xA0)
2035                     {
2036                       ONE_MORE_BYTE (c);
2037                       c -= 0x80;
2038                       if (c < 0)
2039                         goto invalid_code;
2040                     }
2041                   else
2042                     c -= 0x20;
2043                   mseq_found = 1;
2044                 }
2045               else
2046                 {
                       /* A character is not expected in this state, so the
                          byte itself is returned negated.  */
2047                   *nbytes = src - src_base;
2048                   *nchars = consumed_chars;
2049                   return -c;
2050                 }
2051             }
2052           else
2053             goto invalid_code;
2054         }
2055
           /* Dispatch on the total length of the sequence led by C.  */
2056       switch (emacs_mule_bytes[c])
2057         {
2058         case 2:
               /* Leading code, then one position code.  */
2059           if ((charset_ID = emacs_mule_charset[c]) < 0)
2060             goto invalid_code;
2061           ONE_MORE_BYTE (c);
2062           if (c < 0xA0)
2063             goto invalid_code;
2064           code = c & 0x7F;
2065           break;
2066
2067         case 3:
2068           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2069               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2070             {
                   /* Private leading code: the next byte selects the
                      charset and one position code follows.  */
2071               ONE_MORE_BYTE (c);
2072               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2073                 goto invalid_code;
2074               ONE_MORE_BYTE (c);
2075               if (c < 0xA0)
2076                 goto invalid_code;
2077               code = c & 0x7F;
2078             }
2079           else
2080             {
                   /* Leading code, then two position codes.  */
2081               if ((charset_ID = emacs_mule_charset[c]) < 0)
2082                 goto invalid_code;
2083               ONE_MORE_BYTE (c);
2084               if (c < 0xA0)
2085                 goto invalid_code;
2086               code = (c & 0x7F) << 8;
2087               ONE_MORE_BYTE (c);
2088               if (c < 0xA0)
2089                 goto invalid_code;
2090               code |= c & 0x7F;
2091             }
2092           break;
2093
2094         case 4:
               /* The byte after the leading code selects the charset; two
                  position codes follow.  */
2095           ONE_MORE_BYTE (c);
2096           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2097             goto invalid_code;
2098           ONE_MORE_BYTE (c);
2099           if (c < 0xA0)
2100             goto invalid_code;
2101           code = (c & 0x7F) << 8;
2102           ONE_MORE_BYTE (c);
2103           if (c < 0xA0)
2104             goto invalid_code;
2105           code |= c & 0x7F;
2106           break;
2107
2108         case 1:
               /* Single byte: ASCII or an eight-bit byte.  */
2109           code = c;
2110           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2111           break;
2112
2113         default:
2114           emacs_abort ();
2115         }
2116       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2117                           CHARSET_FROM_ID (charset_ID), code, c);
2118       if (c < 0)
2119         goto invalid_code;
2120     }
2121   *nbytes = src - src_base;
2122   *nchars = consumed_chars;
2123   if (id)
2124     *id = charset_ID;
       /* A character read in MSEQ form is returned negated.  */
2125   return (mseq_found ? -c : c);
2126
2127  no_more_source:
2128   return -2;
2129
2130  invalid_code:
2131   return -1;
2132 }
2133
2134
2135 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2136
2137 /* Handle these composition sequences ('|': the end of header elements,
2138    BYTES and CHARS >= 0xA0):
2139
2140    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2141    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2142    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2143
2144    and these old forms:
2145
2146    (4) relative composition: 0x80 | MSEQ ... MSEQ
2147    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2148
2149    When the starter 0x80 and the following header elements are found,
2150    this annotation header is produced.
2151
2152         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2153
2154    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2155    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2156
2157    Then, upon reading the following elements, these codes are produced
2158    until the composition end is found:
2159
2160    (1) CHAR ... CHAR
2161    (2) ALT ... ALT CHAR ... CHAR
2162    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2163    (4) CHAR ... CHAR
2164    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2165
2166    When the composition end is found, LENGTH and NCHARS in the
2167    annotation header are updated as below:
2168
2169    (1) LENGTH: unchanged, NCHARS: unchanged
2170    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2171    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2172    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2173    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2174
2175    If an error is found while composing, the annotation header is
2176    changed to the original composition header (plus filler -1s) as
2177    below:
2178
2179    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2180    (5)          [ 0x80 0xFF -1 -1 -1 ]
2181
2182    and the sequence [ -2 DECODED-RULE ] is changed to the original
2183    byte sequence as below:
2184         o the original byte sequence is B: [ B -1 ]
2185         o the original byte sequence is B1 B2: [ B1 B2 ]
2186
2187    Most of the routines are implemented by macros because many
2188    variables and labels in the caller decode_coding_emacs_mule must be
2189    accessible, and they are usually called just once (thus they don't
2190    increase the size of the compiled object).  */
2191
2192 /* Decode a composition rule represented by C as a component of
2193    composition sequence of Emacs 20 style.  Set RULE to the decoded
2194    rule.

        C is modified: 0xA0 is subtracted from it, and the result must be
        in the range 0..80, otherwise control jumps to the `invalid_code'
        label of the enclosing function.  The result is split into
        GREF = C / 9 and NREF = C % 9; a value of 4 in either is replaced
        by 10 before the two are combined by COMPOSITION_ENCODE_RULE.  */
2195
2196 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2197   do {                                                  \
2198     int gref, nref;                                     \
2199                                                         \
2200     c -= 0xA0;                                          \
2201     if (c < 0 || c >= 81)                               \
2202       goto invalid_code;                                \
2203     gref = c / 9, nref = c % 9;                         \
2204     if (gref == 4) gref = 10;                           \
2205     if (nref == 4) nref = 10;                           \
2206     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2207   } while (0)
2208
2209
2210 /* Decode a composition rule represented by C and the following byte
2211    at SRC as a component of composition sequence of Emacs 21 style.
2212    Set RULE to the decoded rule.

        GREF is C - 0x20.  The next byte is then read into C with
        ONE_MORE_BYTE and NREF is that byte minus 0x20.  Each must be in
        the range 0..80, otherwise control jumps to the `invalid_code'
        label of the enclosing function.  */
2213
2214 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2215   do {                                                  \
2216     int gref, nref;                                     \
2217                                                         \
2218     gref = c - 0x20;                                    \
2219     if (gref < 0 || gref >= 81)                         \
2220       goto invalid_code;                                \
2221     ONE_MORE_BYTE (c);                                  \
2222     nref = c - 0x20;                                    \
2223     if (nref < 0 || nref >= 81)                         \
2224       goto invalid_code;                                \
2225     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2226   } while (0)
2227
2228
2229 /* Start of Emacs 21 style format.  On entry C holds the method byte
2230    (0xF2 + METHOD); the next two bytes read from SRC are (0xA0 + BYTES)
2231    and (0xA0 + CHARS), where BYTES is the byte length of this composition
2232    information and CHARS is the number of characters composed by it.

        Control jumps to the `invalid_code' label of the enclosing function
        if the BYTES byte is negative, if BYTES < 3, if METHOD is
        COMPOSITION_RELATIVE and BYTES != 4, or if CHARS is not in the
        range 1..MAX_COMPOSITION_COMPONENTS - 1.  Otherwise CMP_STATUS is
        set up (state COMPOSING_CHAR for a relative composition,
        COMPOSING_COMPONENT_CHAR otherwise; ncomps = BYTES - 4) and the
        annotation header is added with ADD_COMPOSITION_DATA.  Uses C,
        CMP_STATUS and CHARBUF from the enclosing function.  */
2233
2234 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2235   do {                                                                  \
2236     enum composition_method method = c - 0xF2;                          \
2237     int nbytes, nchars;                                                 \
2238                                                                         \
2239     ONE_MORE_BYTE (c);                                                  \
2240     if (c < 0)                                                          \
2241       goto invalid_code;                                                \
2242     nbytes = c - 0xA0;                                                  \
2243     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2244       goto invalid_code;                                                \
2245     ONE_MORE_BYTE (c);                                                  \
2246     nchars = c - 0xA0;                                                  \
2247     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2248       goto invalid_code;                                                \
2249     cmp_status->old_form = 0;                                           \
2250     cmp_status->method = method;                                        \
2251     if (method == COMPOSITION_RELATIVE)                                 \
2252       cmp_status->state = COMPOSING_CHAR;                               \
2253     else                                                                \
2254       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2255     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2256     cmp_status->nchars = nchars;                                        \
2257     cmp_status->ncomps = nbytes - 4;                                    \
2258     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2259   } while (0)
2260
2261
2262 /* Start of Emacs 20 style format for relative composition.  Mark
        CMP_STATUS as an old-form COMPOSITION_RELATIVE composition in state
        COMPOSING_CHAR, with length MAX_ANNOTATION_LENGTH and zero nchars
        and ncomps, and add an annotation header whose NCHARS and NBYTES
        are 0 with ADD_COMPOSITION_DATA.  Uses CMP_STATUS and CHARBUF from
        the enclosing function.  */
2263
2264 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2265   do {                                                          \
2266     cmp_status->old_form = 1;                                   \
2267     cmp_status->method = COMPOSITION_RELATIVE;                  \
2268     cmp_status->state = COMPOSING_CHAR;                         \
2269     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2270     cmp_status->nchars = cmp_status->ncomps = 0;                \
2271     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2272   } while (0)
2273
2274
2275 /* Start of Emacs 20 style format for rule-base composition.  Same as
        the relative-composition variant except that the method recorded in
        CMP_STATUS, and passed to ADD_COMPOSITION_DATA, is
        COMPOSITION_WITH_RULE.  */
2276
2277 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2278   do {                                                          \
2279     cmp_status->old_form = 1;                                   \
2280     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2281     cmp_status->state = COMPOSING_CHAR;                         \
2282     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2283     cmp_status->nchars = cmp_status->ncomps = 0;                \
2284     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2285   } while (0)
2286
2287
     /* Start a composition.  The byte following the starter is read into C
        and selects the format:

          0xF2 + COMPOSITION_RELATIVE .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS:
            Emacs 21 style (DECODE_EMACS_MULE_21_COMPOSITION);
          0xA0 .. 0xBF: Emacs 20 style relative composition; SRC is reset
            so that this byte is read again as a composition component;
          0xFF: Emacs 20 style rule-base composition.

        The Emacs 21 range is tested first.  A negative byte, or any other
        value, makes control jump to the `invalid_code' label of the
        enclosing function.  Uses C and SRC from the enclosing function.  */
2288 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2289   do {                                                  \
2290     const unsigned char *current_src = src;             \
2291                                                         \
2292     ONE_MORE_BYTE (c);                                  \
2293     if (c < 0)                                          \
2294       goto invalid_code;                                \
2295     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2296         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2297       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2298     else if (c < 0xA0)                                  \
2299       goto invalid_code;                                \
2300     else if (c < 0xC0)                                  \
2301       {                                                 \
2302         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2303         /* Re-read C as a composition component.  */    \
2304         src = current_src;                              \
2305       }                                                 \
2306     else if (c == 0xFF)                                 \
2307       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2308     else                                                \
2309       goto invalid_code;                                \
2310   } while (0)
2311
     /* End the current composition.  The annotation header is addressed as
        CHARBUF[-CMP_STATUS->length].  For an old-form composition, header
        slot 2 is set to CMP_STATUS->nchars.  Otherwise, if the method is
        greater than COMPOSITION_RELATIVE, header slot 0 is set to
        (header slot 2 - CMP_STATUS->length).  In all cases the state is
        reset to COMPOSING_NO.  Uses CHARBUF and CMP_STATUS from the
        enclosing function.  */
2312 #define EMACS_MULE_COMPOSITION_END()                            \
2313   do {                                                          \
2314     int idx = - cmp_status->length;                             \
2315                                                                 \
2316     if (cmp_status->old_form)                                   \
2317       charbuf[idx + 2] = cmp_status->nchars;                    \
2318     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2319       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2320     cmp_status->state = COMPOSING_NO;                           \
2321   } while (0)
2322
2323
     /* Close the composition described by CMP_STATUS.  CHARBUF points just
        past the elements produced so far; the annotation header is
        addressed as CHARBUF[-CMP_STATUS->length].

        If the composition is old-form and has at least one character, it
        is kept: header slot 2 is set to CMP_STATUS->nchars.  In addition,
        for COMPOSITION_WITH_RULE in state COMPOSING_CHAR, CHARBUF[-2] and
        CHARBUF[-1] are replaced by the eight-bit character for
        (CHARBUF[-1] + 0xA0) and the filler -1.

        Otherwise the header is overwritten starting with the eight-bit
        character for 0x80: followed by the eight-bit character for 0xFF,
        -3 and 0 for COMPOSITION_WITH_RULE, or by the eight-bit characters
        for 0xF2 + method, BYTES and CHARS and the filler -1 for the other
        methods.

        Set the state to COMPOSING_NO and return a count (0, 1 or 4) that
        EMACS_MULE_MAYBE_FINISH_COMPOSITION adds to the caller's
        char_offset.  */
2324 static int
2325 emacs_mule_finish_composition (int *charbuf,
2326                                struct composition_status *cmp_status)
2327 {
2328   int idx = - cmp_status->length;
2329   int new_chars;
2330
2331   if (cmp_status->old_form && cmp_status->nchars > 0)
2332     {
2333       charbuf[idx + 2] = cmp_status->nchars;
2334       new_chars = 0;
2335       if (cmp_status->method == COMPOSITION_WITH_RULE
2336           && cmp_status->state == COMPOSING_CHAR)
2337         {
2338           /* The last rule was invalid.  */
2339           int rule = charbuf[-1] + 0xA0;
2340
2341           charbuf[-2] = BYTE8_TO_CHAR (rule);
2342           charbuf[-1] = -1;
2343           new_chars = 1;
2344         }
2345     }
2346   else
2347     {
2348       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2349
2350       if (cmp_status->method == COMPOSITION_WITH_RULE)
2351         {
2352           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2353           charbuf[idx++] = -3;
2354           charbuf[idx++] = 0;
2355           new_chars = 1;
2356         }
2357       else
2358         {
               /* IDX now addresses header slot 1, so these read slots 2
                  and 3 before they are overwritten below.  */
2359           int nchars = charbuf[idx + 1] + 0xA0;
2360           int nbytes = charbuf[idx + 2] + 0xA0;
2361
2362           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2363           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2364           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2365           charbuf[idx++] = -1;
2366           new_chars = 4;
2367         }
2368     }
2369   cmp_status->state = COMPOSING_NO;
2370   return new_chars;
2371 }
2372
     /* If a composition is in progress (CMP_STATUS->state is not
        COMPOSING_NO), close it with emacs_mule_finish_composition and add
        the returned count to CHAR_OFFSET.  Uses CHARBUF, CMP_STATUS and
        CHAR_OFFSET from the enclosing function.  */
2373 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2374   do {                                                                    \
2375     if (cmp_status->state != COMPOSING_NO)                                \
2376       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2377   } while (0)
2378
2379
2380 static void
2381 decode_coding_emacs_mule (struct coding_system *coding)
2382 {
2383   const unsigned char *src = coding->source + coding->consumed;
2384   const unsigned char *src_end = coding->source + coding->src_bytes;
2385   const unsigned char *src_base;
2386   int *charbuf = coding->charbuf + coding->charbuf_used;
2387   /* We may produce two annotations (charset and composition) in one
2388      loop and one more charset annotation at the end.  */
2389   int *charbuf_end
2390     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2391       /* We can produce up to 2 characters in a loop.  */
2392       - 1;
2393   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2394   bool multibytep = coding->src_multibyte;
2395   ptrdiff_t char_offset = coding->produced_char;
2396   ptrdiff_t last_offset = char_offset;
2397   int last_id = charset_ascii;
2398   bool eol_dos
2399     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2400   int byte_after_cr = -1;
2401   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2402
2403   if (cmp_status->state != COMPOSING_NO)
2404     {
2405       int i;
2406
2407       if (charbuf_end - charbuf < cmp_status->length)
2408         emacs_abort ();
2409       for (i = 0; i < cmp_status->length; i++)
2410         *charbuf++ = cmp_status->carryover[i];
2411       coding->annotated = 1;
2412     }
2413
2414   while (1)
2415     {
2416       int c, id IF_LINT (= 0);
2417
2418       src_base = src;
2419       consumed_chars_base = consumed_chars;
2420
2421       if (charbuf >= charbuf_end)
2422         {
2423           if (byte_after_cr >= 0)
2424             src_base--;
2425           break;
2426         }
2427
2428       if (byte_after_cr >= 0)
2429         c = byte_after_cr, byte_after_cr = -1;
2430       else
2431         ONE_MORE_BYTE (c);
2432
2433       if (c < 0 || c == 0x80)
2434         {
2435           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2436           if (c < 0)
2437             {
2438               *charbuf++ = -c;
2439               char_offset++;
2440             }
2441           else
2442             DECODE_EMACS_MULE_COMPOSITION_START ();
2443           continue;
2444         }
2445
2446       if (c < 0x80)
2447         {
2448           if (eol_dos && c == '\r')
2449             ONE_MORE_BYTE (byte_after_cr);
2450           id = charset_ascii;
2451           if (cmp_status->state != COMPOSING_NO)
2452             {
2453               if (cmp_status->old_form)
2454                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2455               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2456                 cmp_status->ncomps--;
2457             }
2458         }
2459       else
2460         {
2461           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2462           /* emacs_mule_char can load a charset map from a file, which
2463              allocates a large structure and might cause buffer text
2464              to be relocated as result.  Thus, we need to remember the
2465              original pointer to buffer text, and fix up all related
2466              pointers after the call.  */
2467           const unsigned char *orig = coding->source;
2468           ptrdiff_t offset;
2469
2470           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2471                                cmp_status);
2472           offset = coding->source - orig;
2473           if (offset)
2474             {
2475               src += offset;
2476               src_base += offset;
2477               src_end += offset;
2478             }
2479           if (c < 0)
2480             {
2481               if (c == -1)
2482                 goto invalid_code;
2483               if (c == -2)
2484                 break;
2485             }
2486           src = src_base + nbytes;
2487           consumed_chars = consumed_chars_base + nchars;
2488           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2489             cmp_status->ncomps -= nchars;
2490         }
2491
2492       /* Now if C >= 0, we found a normally encoded character, if C <
2493          0, we found an old-style composition component character or
2494          rule.  */
2495
2496       if (cmp_status->state == COMPOSING_NO)
2497         {
2498           if (last_id != id)
2499             {
2500               if (last_id != charset_ascii)
2501                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2502                                   last_id);
2503               last_id = id;
2504               last_offset = char_offset;
2505             }
2506           *charbuf++ = c;
2507           char_offset++;
2508         }
2509       else if (cmp_status->state == COMPOSING_CHAR)
2510         {
2511           if (cmp_status->old_form)
2512             {
2513               if (c >= 0)
2514                 {
2515                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2516                   *charbuf++ = c;
2517                   char_offset++;
2518                 }
2519               else
2520                 {
2521                   *charbuf++ = -c;
2522                   cmp_status->nchars++;
2523                   cmp_status->length++;
2524                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2525                     EMACS_MULE_COMPOSITION_END ();
2526                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2527                     cmp_status->state = COMPOSING_RULE;
2528                 }
2529             }
2530           else
2531             {
2532               *charbuf++ = c;
2533               cmp_status->length++;
2534               cmp_status->nchars--;
2535               if (cmp_status->nchars == 0)
2536                 EMACS_MULE_COMPOSITION_END ();
2537             }
2538         }
2539       else if (cmp_status->state == COMPOSING_RULE)
2540         {
2541           int rule;
2542
2543           if (c >= 0)
2544             {
2545               EMACS_MULE_COMPOSITION_END ();
2546               *charbuf++ = c;
2547               char_offset++;
2548             }
2549           else
2550             {
2551               c = -c;
2552               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2553               if (rule < 0)
2554                 goto invalid_code;
2555               *charbuf++ = -2;
2556               *charbuf++ = rule;
2557               cmp_status->length += 2;
2558               cmp_status->state = COMPOSING_CHAR;
2559             }
2560         }
2561       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2562         {
2563           *charbuf++ = c;
2564           cmp_status->length++;
2565           if (cmp_status->ncomps == 0)
2566             cmp_status->state = COMPOSING_CHAR;
2567           else if (cmp_status->ncomps > 0)
2568             {
2569               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2570                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2571             }
2572           else
2573             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2574         }
2575       else                      /* COMPOSING_COMPONENT_RULE */
2576         {
2577           int rule;
2578
2579           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2580           if (rule < 0)
2581             goto invalid_code;
2582           *charbuf++ = -2;
2583           *charbuf++ = rule;
2584           cmp_status->length += 2;
2585           cmp_status->ncomps--;
2586           if (cmp_status->ncomps > 0)
2587             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2588           else
2589             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2590         }
2591       continue;
2592
2593     invalid_code:
2594       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2595       src = src_base;
2596       consumed_chars = consumed_chars_base;
2597       ONE_MORE_BYTE (c);
2598       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2599       char_offset++;
2600       coding->errors++;
2601     }
2602
2603  no_more_source:
2604   if (cmp_status->state != COMPOSING_NO)
2605     {
2606       if (coding->mode & CODING_MODE_LAST_BLOCK)
2607         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2608       else
2609         {
2610           int i;
2611
2612           charbuf -= cmp_status->length;
2613           for (i = 0; i < cmp_status->length; i++)
2614             cmp_status->carryover[i] = charbuf[i];
2615         }
2616     }
2617   if (last_id != charset_ascii)
2618     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2619   coding->consumed_char += consumed_chars_base;
2620   coding->consumed = src_base - coding->source;
2621   coding->charbuf_used = charbuf - coding->charbuf;
2622 }
2623
2624
/* Store in CODES the emacs-mule leading code(s) for the charset whose
   emacs-mule id is ID.  CODES must designate storage for at least two
   bytes.

   An ID below 0xA0 is itself the only leading code; CODES[1] is then
   set to 0.  Any other ID is stored in CODES[1] and preceded in
   CODES[0] by a prefix byte chosen by the range ID falls in:

        0xA0..0xDF -> 0x9A      0xE0..0xEF -> 0x9B
        0xF0..0xF4 -> 0x9C      0xF5 and up -> 0x9D

   ID is evaluated exactly once.  The expansion is a single statement
   with no trailing semicolon (the caller writes the `;'), so the
   macro can be used as the body of an unbraced `if ... else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)                     \
  do {                                                          \
    int leading_codes_id_ = (id);                               \
    if (leading_codes_id_ < 0xA0)                               \
      (codes)[0] = leading_codes_id_, (codes)[1] = 0;           \
    else if (leading_codes_id_ < 0xE0)                          \
      (codes)[0] = 0x9A, (codes)[1] = leading_codes_id_;        \
    else if (leading_codes_id_ < 0xF0)                          \
      (codes)[0] = 0x9B, (codes)[1] = leading_codes_id_;        \
    else if (leading_codes_id_ < 0xF5)                          \
      (codes)[0] = 0x9C, (codes)[1] = leading_codes_id_;        \
    else                                                        \
      (codes)[0] = 0x9D, (codes)[1] = leading_codes_id_;        \
  } while (0)
2638
2639
     /* Encode the CODING->charbuf_used elements of CODING->charbuf
        (characters interleaved with annotations) as emacs-mule bytes,
        writing them to CODING->destination starting at offset
        CODING->produced.

        On reaching the end of the loop the function records
        CODING_RESULT_SUCCESS, adds PRODUCED_CHARS to
        CODING->produced_char, sets CODING->produced to the new
        destination offset, and returns 0.

        NOTE(review): MULTIBYTEP and PRODUCED_CHARS are never referenced
        directly below.  Presumably ASSURE_DESTINATION, EMIT_ONE_BYTE and
        EMIT_ONE_ASCII_BYTE (defined outside this section) use them,
        together with DST and DST_END, by name, and ASSURE_DESTINATION
        may leave the function early when room runs out -- confirm
        before renaming locals or reordering statements.  */
2640 static bool
2641 encode_coding_emacs_mule (struct coding_system *coding)
2642 {
2643   bool multibytep = coding->dst_multibyte;
2644   int *charbuf = coding->charbuf;
2645   int *charbuf_end = charbuf + coding->charbuf_used;
2646   unsigned char *dst = coding->destination + coding->produced;
2647   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each element.
          One iteration issues at most four EMIT_ONE_BYTE calls (two
          leading codes plus two code bytes); presumably 8 allows each
          of them to take two bytes in a multibyte destination --
          TODO confirm against EMIT_ONE_BYTE.  */
2648   int safe_room = 8;
2649   ptrdiff_t produced_chars = 0;
2650   Lisp_Object attrs, charset_list;
2651   int c;
       /* Id of the charset named by the most recent charset annotation,
          or -1 when there is no usable preference.  */
2652   int preferred_charset_id = -1;
2653
2654   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against Vemacs_mule_charset_list, and store it in
          ATTRS when the coding system's own list is a different object.  */
2655   if (! EQ (charset_list, Vemacs_mule_charset_list))
2656     {
2657       charset_list = Vemacs_mule_charset_list;
2658       ASET (attrs, coding_attr_charset_list, charset_list);
2659     }
2660
2661   while (charbuf < charbuf_end)
2662     {
2663       ASSURE_DESTINATION (safe_room);
2664       c = *charbuf++;
2665
           /* A negative element starts an annotation.  -C is the number
              of charbuf elements the annotation occupies, this one
              included; the element that follows holds its type.  */
2666       if (c < 0)
2667         {
2668           /* Handle an annotation.  */
2669           switch (*charbuf)
2670             {
2671             case CODING_ANNOTATE_COMPOSITION_MASK:
2672               /* Not yet implemented.  */
2673               break;
2674             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF already points past the
                      length element, so the annotation spans
                      charbuf[0] .. charbuf[-c - 2].  Index 3 lies inside
                      it only if the annotation is at least five
                      elements long; otherwise this reads the element
                      that follows the annotation.  Verify the index
                      against ADD_CHARSET_DATA.  */
2675               preferred_charset_id = charbuf[3];
                   /* Drop the preference when the charset is not a
                      member of CHARSET_LIST.  */
2676               if (preferred_charset_id >= 0
2677                   && NILP (Fmemq (make_number (preferred_charset_id),
2678                                   charset_list)))
2679                 preferred_charset_id = -1;
2680               break;
2681             default:
2682               emacs_abort ();
2683             }
               /* Skip the remaining elements of the annotation.  */
2684           charbuf += -c - 1;
2685           continue;
2686         }
2687
2688       if (ASCII_CHAR_P (c))
2689         EMIT_ONE_ASCII_BYTE (c);
2690       else if (CHAR_BYTE8_P (c))
2691         {
               /* Convert the character back to its byte value with
                  CHAR_TO_BYTE8 and emit that byte.  */
2692           c = CHAR_TO_BYTE8 (c);
2693           EMIT_ONE_BYTE (c);
2694         }
2695       else
2696         {
2697           struct charset *charset;
2698           unsigned code;
2699           int dimension;
2700           int emacs_mule_id;
2701           unsigned char leading_codes[2];
2702
               /* Use the preferred charset if it can encode C; otherwise
                  look for a charset in CHARSET_LIST.  */
2703           if (preferred_charset_id >= 0)
2704             {
2705               bool result;
2706
2707               charset = CHARSET_FROM_ID (preferred_charset_id);
2708               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2709               if (result)
2710                 code = ENCODE_CHAR (charset, c);
2711               else
2712                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2713                                      &code, charset);
2714             }
2715           else
2716             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2717                                  &code, charset);
               /* No charset was found for C: encode CODING->default_char
                  in its place.  */
2718           if (! charset)
2719             {
2720               c = coding->default_char;
2721               if (ASCII_CHAR_P (c))
2722                 {
2723                   EMIT_ONE_ASCII_BYTE (c);
2724                   continue;
2725                 }
2726               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2727                                    &code, charset);
                   /* NOTE(review): CHARSET is used below without being
                      checked again; this assumes the default character
                      is always encodable by some charset in
                      CHARSET_LIST -- confirm.  */
2728             }
2729           dimension = CHARSET_DIMENSION (charset);
2730           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
               /* Emit one or two leading codes, then the code point with
                  bit 0x80 set in each byte: one byte for a charset of
                  dimension 1, otherwise two bytes, high byte first.  */
2731           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2732           EMIT_ONE_BYTE (leading_codes[0]);
2733           if (leading_codes[1])
2734             EMIT_ONE_BYTE (leading_codes[1]);
2735           if (dimension == 1)
2736             EMIT_ONE_BYTE (code | 0x80);
2737           else
2738             {
2739               code |= 0x8080;
2740               EMIT_ONE_BYTE (code >> 8);
2741               EMIT_ONE_BYTE (code & 0xFF);
2742             }
2743         }
2744     }
2745   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2746   coding->produced_char += produced_chars;
2747   coding->produced = dst - coding->destination;
2748   return 0;
2749 }
2750
2751 \f
2752 /*** 7. ISO2022 handlers ***/
2753
2754 /* The following note describes the coding system ISO2022 briefly.
2755    Since the intention of this note is to help understand the
2756    functions in this file, some parts are NOT ACCURATE or are OVERLY
2757    SIMPLIFIED.  For thorough understanding, please refer to the
2758    original document of ISO2022.  This is equivalent to the standard
2759    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2760
2761    ISO2022 provides many mechanisms to encode several character sets
2762    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2763    is encoded using bytes less than 128.  This may make the encoded
2764    text a little bit longer, but the text passes more easily through
2765    several types of gateway, some of which strip off the MSB (Most
2766    Significant Bit).
2767
2768    There are two kinds of character sets: control character sets and
2769    graphic character sets.  The former contain control characters such
2770    as `newline' and `escape' to provide control functions (control
2771    functions are also provided by escape sequences).  The latter
2772    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2773    two control character sets and many graphic character sets.
2774
2775    Graphic character sets are classified into one of the following
2776    four classes, according to the number of bytes (DIMENSION) and
2777    number of characters in one dimension (CHARS) of the set:
2778    - DIMENSION1_CHARS94
2779    - DIMENSION1_CHARS96
2780    - DIMENSION2_CHARS94
2781    - DIMENSION2_CHARS96
2782
2783    In addition, each character set is assigned an identification tag,
2784    unique for each set, called the "final character" (denoted as <F>
2785    hereafter).  The <F> of each character set is decided by ECMA(*)
2786    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2787    (0x30..0x3F are for private use only).
2788
2789    Note (*): ECMA = European Computer Manufacturers Association
2790
2791    Here are examples of graphic character sets [NAME(<F>)]:
2792         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2793         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2794         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2795         o DIMENSION2_CHARS96 -- none for the moment
2796
2797    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2798         C0 [0x00..0x1F] -- control character plane 0
2799         GL [0x20..0x7F] -- graphic character plane 0
2800         C1 [0x80..0x9F] -- control character plane 1
2801         GR [0xA0..0xFF] -- graphic character plane 1
2802
2803    A control character set is directly designated and invoked to C0 or
2804    C1 by an escape sequence.  The most common case is that:
2805    - ISO646's  control character set is designated/invoked to C0, and
2806    - ISO6429's control character set is designated/invoked to C1,
2807    and usually these designations/invocations are omitted in encoded
2808    text.  In a 7-bit environment, only C0 can be used, and a control
2809    character for C1 is encoded by an appropriate escape sequence to
2810    fit into the environment.  All control characters for C1 are
2811    defined to have corresponding escape sequences.
2812
2813    A graphic character set is at first designated to one of four
2814    graphic registers (G0 through G3), then these graphic registers are
2815    invoked to GL or GR.  These designations and invocations can be
2816    done independently.  The most common case is that G0 is invoked to
2817    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2818    these invocations and designations are omitted in encoded text.
2819    In a 7-bit environment, only GL can be used.
2820
2821    When a graphic character set of CHARS94 is invoked to GL, codes
2822    0x20 and 0x7F of the GL area work as control characters SPACE and
2823    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2824    be used.
2825
2826    There are two ways of invocation: locking-shift and single-shift.
2827    With locking-shift, the invocation lasts until the next different
2828    invocation, whereas with single-shift, the invocation affects the
2829    following character only and doesn't affect the locking-shift
2830    state.  Invocations are done by the following control characters or
2831    escape sequences:
2832
2833    ----------------------------------------------------------------------
2834    abbrev  function                  cntrl escape seq   description
2835    ----------------------------------------------------------------------
2836    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2837    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2838    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2839    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2840    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2841    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2842    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2843    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2844    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2845    ----------------------------------------------------------------------
2846    (*) These are not used by any known coding system.
2847
2848    Control characters for these functions are defined by macros
2849    ISO_CODE_XXX in `coding.h'.
2850
2851    Designations are done by the following escape sequences:
2852    ----------------------------------------------------------------------
2853    escape sequence      description
2854    ----------------------------------------------------------------------
2855    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2856    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2857    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2858    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2859    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2860    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2861    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2862    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2863    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2864    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2865    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2866    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2867    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2868    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2869    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2870    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2871    ----------------------------------------------------------------------
2872
2873    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2874    of dimension 1, chars 94, and final character <F>, etc...
2875
2876    Note (*): Although these designations are not allowed in ISO2022,
2877    Emacs accepts them on decoding, and produces them on encoding
2878    CHARS96 character sets in a coding system which is characterized as
2879    7-bit environment, non-locking-shift, and non-single-shift.
2880
2881    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2882    '(' must be omitted.  We refer to this as "short-form" hereafter.
2883
2884    Now you may notice that there are a lot of ways of encoding the
2885    same multilingual text in ISO2022.  Actually, there exist many
2886    coding systems such as Compound Text (used in X11's inter client
2887    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2888    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2889    localized platforms), and all of these are variants of ISO2022.
2890
2891    In addition to the above, Emacs handles two more kinds of escape
2892    sequences: ISO6429's direction specification and Emacs' private
2893    sequence for specifying character composition.
2894
2895    ISO6429's direction specification takes the following form:
2896         o CSI ']'      -- end of the current direction
2897         o CSI '0' ']'  -- end of the current direction
2898         o CSI '1' ']'  -- start of left-to-right text
2899         o CSI '2' ']'  -- start of right-to-left text
2900    The control character CSI (0x9B: control sequence introducer) is
2901    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2902
2903    Character composition specification takes the following form:
2904         o ESC '0' -- start relative composition
2905         o ESC '1' -- end composition
2906         o ESC '2' -- start rule-base composition (*)
2907         o ESC '3' -- start relative composition with alternate chars  (**)
2908         o ESC '4' -- start rule-base composition with alternate chars  (**)
2909   Since these are not standard escape sequences of any ISO standard,
2910   the use of them with these meanings is restricted to Emacs only.
2911
2912   (*) This form is used only in Emacs 20.7 and older versions,
2913   but newer versions can safely decode it.
2914   (**) This form is used only in Emacs 21.1 and newer versions,
2915   and older versions can't decode it.
2916
2917   Here's a list of example usages of these composition escape
2918   sequences (categorized by `enum composition_method').
2919
2920   COMPOSITION_RELATIVE:
2921         ESC 0 CHAR [ CHAR ] ESC 1
2922   COMPOSITION_WITH_RULE:
2923         ESC 2 CHAR [ RULE CHAR ] ESC 1
2924   COMPOSITION_WITH_ALTCHARS:
2925         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2926   COMPOSITION_WITH_RULE_ALTCHARS:
2927         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2928
     /* Table of 256 `enum iso_code_class_type' values.
        NOTE(review): presumably indexed by byte value and giving the
        ISO-2022 class of that byte; it is filled in outside this
        section -- confirm.  */
2929 static enum iso_code_class_type iso_code_class[256];
2930
/* Return nonzero if the charset whose id is ID is usable with CODING,
   i.e. ID lies in 0 .. CODING->max_charset_id and its byte in
   CODING->safe_charsets is not 255 (the value setup_iso_safe_charsets
   fills the table with before recording the supported charsets).

   A negative ID yields zero instead of indexing before the start of
   the table; detect_coding_iso_2022 can pass the unchecked result of
   an iso_charset_table lookup, whose entries may be negative.

   CODING and ID may each be evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  (0 <= (id)                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2934
2935 static void
2936 setup_iso_safe_charsets (Lisp_Object attrs)
2937 {
2938   Lisp_Object charset_list, safe_charsets;
2939   Lisp_Object request;
2940   Lisp_Object reg_usage;
2941   Lisp_Object tail;
2942   EMACS_INT reg94, reg96;
2943   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2944   int max_charset_id;
2945
2946   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2947   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2948       && ! EQ (charset_list, Viso_2022_charset_list))
2949     {
2950       charset_list = Viso_2022_charset_list;
2951       ASET (attrs, coding_attr_charset_list, charset_list);
2952       ASET (attrs, coding_attr_safe_charsets, Qnil);
2953     }
2954
2955   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2956     return;
2957
2958   max_charset_id = 0;
2959   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2960     {
2961       int id = XINT (XCAR (tail));
2962       if (max_charset_id < id)
2963         max_charset_id = id;
2964     }
2965
2966   safe_charsets = make_uninit_string (max_charset_id + 1);
2967   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2968   request = AREF (attrs, coding_attr_iso_request);
2969   reg_usage = AREF (attrs, coding_attr_iso_usage);
2970   reg94 = XINT (XCAR (reg_usage));
2971   reg96 = XINT (XCDR (reg_usage));
2972
2973   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2974     {
2975       Lisp_Object id;
2976       Lisp_Object reg;
2977       struct charset *charset;
2978
2979       id = XCAR (tail);
2980       charset = CHARSET_FROM_ID (XINT (id));
2981       reg = Fcdr (Fassq (id, request));
2982       if (! NILP (reg))
2983         SSET (safe_charsets, XINT (id), XINT (reg));
2984       else if (charset->iso_chars_96)
2985         {
2986           if (reg96 < 4)
2987             SSET (safe_charsets, XINT (id), reg96);
2988         }
2989       else
2990         {
2991           if (reg94 < 4)
2992             SSET (safe_charsets, XINT (id), reg94);
2993         }
2994     }
2995   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2996 }
2997
2998
2999 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3000    Return true if a text is encoded in one of ISO-2022 based coding
3001    systems.

        The bytes of CODING->source are scanned, starting after the first
        CODING->head_ascii bytes, while two masks of ISO categories are
        accumulated: FOUND (positive evidence) and REJECTED (ruled out).
        CATEGORY_MASK_ISO is always added to DETECT_INFO->checked.

        If the scan reaches the `no_more_source' label, REJECTED is added
        to DETECT_INFO->rejected, the categories found and not rejected
        are added to DETECT_INFO->found, and 1 is returned.  If every ISO
        category gets rejected first, all of CATEGORY_MASK_ISO is added
        to DETECT_INFO->rejected and 0 is returned.

        NOTE(review): ONE_MORE_BYTE is defined outside this section.  It
        presumably reads through SRC, SRC_END, MULTIBYTEP and
        CONSUMED_CHARS by name, jumps to `no_more_source' at the end of
        the input, and yields a negative value for a character that is
        not a single byte -- confirm before renaming or restructuring.  */
3002
3003 static bool
3004 detect_coding_iso_2022 (struct coding_system *coding,
3005                         struct coding_detection_info *detect_info)
3006 {
3007   const unsigned char *src = coding->source, *src_base = src;
3008   const unsigned char *src_end = coding->source + coding->src_bytes;
3009   bool multibytep = coding->src_multibyte;
       /* True right after a single-shift code (SS2/SS3 or ESC 'N'/'O').  */
3010   bool single_shifting = 0;
3011   int id;
3012   int c, c1;
3013   ptrdiff_t consumed_chars = 0;
3014   int i;
       /* Masks of ISO categories ruled out / positively identified.  */
3015   int rejected = 0;
3016   int found = 0;
       /* -1 outside a composition sequence; otherwise the number of
          characters counted since the composition started.  */
3017   int composition_count = -1;
3018
3019   detect_info->checked |= CATEGORY_MASK_ISO;
3020
       /* Cache, in each ISO category's coding system, the safe-charsets
          table that SAFE_CHARSET_P consults below.  */
3021   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3022     {
3023       struct coding_system *this = &(coding_categories[i]);
3024       Lisp_Object attrs, val;
3025
           /* A negative id: skip this category (presumably no coding
              system is assigned to it).  */
3026       if (this->id < 0)
3027         continue;
3028       attrs = CODING_ID_ATTRS (this->id);
3029       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3030           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3031         setup_iso_safe_charsets (attrs);
3032       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3033       this->max_charset_id = SCHARS (val) - 1;
3034       this->safe_charsets = SDATA (val);
3035     }
3036
3037   /* A coding system of this category is always ASCII compatible.  */
3038   src += coding->head_ascii;
3039
3040   while (rejected != CATEGORY_MASK_ISO)
3041     {
3042       src_base = src;
3043       ONE_MORE_BYTE (c);
3044       switch (c)
3045         {
3046         case ISO_CODE_ESC:
               /* With inhibit_iso_escape_detection set, escape sequences
                  are not used as evidence.  */
3047           if (inhibit_iso_escape_detection)
3048             break;
3049           single_shifting = 0;
3050           ONE_MORE_BYTE (c);
3051           if (c == 'N' || c == 'O')
3052             {
3053               /* ESC <Fe> for SS2 or SS3.  */
3054               single_shifting = 1;
3055               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3056             }
3057           else if (c == '1')
3058             {
3059               /* End of composition.  It is valid only if a composition
                      was started and has not collected more than
                      MAX_COMPOSITION_COMPONENTS characters.  */
3060               if (composition_count < 0
3061                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3062                 /* Invalid */
3063                 break;
3064               composition_count = -1;
3065               found |= CATEGORY_MASK_ISO;
3066             }
3067           else if (c >= '0' && c <= '4')
3068             {
3069               /* ESC <Fp> for start of composition ('1', the end of
                      composition, is handled above).  */
3070               composition_count = 0;
3071             }
3072           else
3073             {
3074               if (c >= '(' && c <= '/')
3075                 {
3076                   /* Designation sequence for a charset of dimension 1.
                          Intermediates ',' .. '/' select the second table
                          (the 96-character sets); C1 is the final
                          character.  */
3077                   ONE_MORE_BYTE (c1);
3078                   if (c1 < ' ' || c1 >= 0x80
3079                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3080                     /* Invalid designation sequence.  Just ignore.  */
3081                     break;
3082                 }
3083               else if (c == '$')
3084                 {
3085                   /* Designation sequence for a charset of dimension 2.  */
3086                   ONE_MORE_BYTE (c);
3087                   if (c >= '@' && c <= 'B')
3088                     /* Designation for JISX0208.1978, GB2312, or JISX0208.
                            NOTE(review): unlike the other lookups, this
                            result is not tested for being negative before
                            it reaches SAFE_CHARSET_P below -- confirm that
                            these three table entries are always valid.  */
3089                     id = iso_charset_table[1][0][c];
3090                   else if (c >= '(' && c <= '/')
3091                     {
3092                       ONE_MORE_BYTE (c1);
3093                       if (c1 < ' ' || c1 >= 0x80
3094                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3095                         /* Invalid designation sequence.  Just ignore.  */
3096                         break;
3097                     }
3098                   else
3099                     /* Invalid designation sequence.  Just ignore it.  */
3100                     break;
3101                 }
3102               else
3103                 {
3104                   /* Invalid escape sequence.  Just ignore it.  */
3105                   break;
3106                 }
3107
3108               /* We found a valid designation sequence for CHARSET.
                      It rules out the categories in
                      CATEGORY_MASK_ISO_8BIT; each category tested below
                      is found if its coding system handles the designated
                      charset, and rejected otherwise.  */
3109               rejected |= CATEGORY_MASK_ISO_8BIT;
3110               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3111                                   id))
3112                 found |= CATEGORY_MASK_ISO_7;
3113               else
3114                 rejected |= CATEGORY_MASK_ISO_7;
3115               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3116                                   id))
3117                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3118               else
3119                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3120               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3121                                   id))
3122                 found |= CATEGORY_MASK_ISO_7_ELSE;
3123               else
3124                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3125               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3126                                   id))
3127                 found |= CATEGORY_MASK_ISO_8_ELSE;
3128               else
3129                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3130             }
3131           break;
3132
3133         case ISO_CODE_SO:
3134         case ISO_CODE_SI:
3135           /* Locking shift out/in.  */
3136           if (inhibit_iso_escape_detection)
3137             break;
3138           single_shifting = 0;
3139           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3140           break;
3141
3142         case ISO_CODE_CSI:
3143           /* Control sequence introducer.  */
3144           single_shifting = 0;
3145           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3146           found |= CATEGORY_MASK_ISO_8_ELSE;
3147           goto check_extra_latin;
3148
3149         case ISO_CODE_SS2:
3150         case ISO_CODE_SS3:
3151           /* Single shift.  It counts as evidence for iso-8-1 and
                  iso-8-2 only if the corresponding coding system has
                  CODING_ISO_FLAG_SINGLE_SHIFT; if neither has it, the
                  byte is treated like any other code below 0xA0.  */
3152           if (inhibit_iso_escape_detection)
3153             break;
3154           single_shifting = 0;
3155           rejected |= CATEGORY_MASK_ISO_7BIT;
3156           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3157               & CODING_ISO_FLAG_SINGLE_SHIFT)
3158             {
3159               found |= CATEGORY_MASK_ISO_8_1;
3160               single_shifting = 1;
3161             }
3162           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3163               & CODING_ISO_FLAG_SINGLE_SHIFT)
3164             {
3165               found |= CATEGORY_MASK_ISO_8_2;
3166               single_shifting = 1;
3167             }
3168           if (single_shifting)
3169             break;
3170           goto check_extra_latin;
3171
3172         default:
               /* A negative C is skipped without affecting the state.  */
3173           if (c < 0)
3174             continue;
3175           if (c < 0x80)
3176             {
                   /* A 7-bit byte: count it as one character of a
                      composition in progress.  */
3177               if (composition_count >= 0)
3178                 composition_count++;
3179               single_shifting = 0;
3180               break;
3181             }
3182           if (c >= 0xA0)
3183             {
3184               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3185               found |= CATEGORY_MASK_ISO_8_1;
3186               /* Check the length of succeeding codes of the range
3187                  0xA0..0xFF.  If the byte length is even, we include
3188                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3189                  only when we are not single shifting.  */
3190               if (! single_shifting
3191                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3192                 {
3193                   int len = 1;
                       /* Count the run of codes >= 0xA0, leaving SRC at
                          the first code that does not belong to it.  */
3194                   while (src < src_end)
3195                     {
3196                       src_base = src;
3197                       ONE_MORE_BYTE (c);
3198                       if (c < 0xA0)
3199                         {
3200                           src = src_base;
3201                           break;
3202                         }
3203                       len++;
3204                     }
3205
                       /* An odd-length run that ended before the end of
                          the source rejects iso-8-2.  An even-length run,
                          or one that reaches the end of the source,
                          counts as evidence for it.  */
3206                   if (len & 1 && src < src_end)
3207                     {
3208                       rejected |= CATEGORY_MASK_ISO_8_2;
3209                       if (composition_count >= 0)
3210                         composition_count += len;
3211                     }
3212                   else
3213                     {
3214                       found |= CATEGORY_MASK_ISO_8_2;
3215                       if (composition_count >= 0)
3216                         composition_count += len / 2;
3217                     }
3218                 }
3219               break;
3220             }
               /* Reached by falling through with a code of 0x80..0x9F
                  not handled by a case above, or by `goto' from the CSI
                  and SS2/SS3 cases.  The code is acceptable only if
                  Vlatin_extra_code_table is a vector with a non-nil
                  entry for it; otherwise every ISO category is rejected,
                  which also ends the scan.  */
3221         check_extra_latin:
3222           if (! VECTORP (Vlatin_extra_code_table)
3223               || NILP (AREF (Vlatin_extra_code_table, c)))
3224             {
3225               rejected = CATEGORY_MASK_ISO;
3226               break;
3227             }
3228           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3229               & CODING_ISO_FLAG_LATIN_EXTRA)
3230             found |= CATEGORY_MASK_ISO_8_1;
3231           else
3232             rejected |= CATEGORY_MASK_ISO_8_1;
3233           rejected |= CATEGORY_MASK_ISO_8_2;
3234           break;
3235         }
3236     }
       /* Every ISO category was rejected before the source ran out.  */
3237   detect_info->rejected |= CATEGORY_MASK_ISO;
3238   return 0;
3239
       /* The source ran out: report what was rejected, and what was found
          and not rejected.  */
3240  no_more_source:
3241   detect_info->rejected |= rejected;
3242   detect_info->found |= (found & ~rejected);
3243   return 1;
3244 }
3245
3246
3247 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3248    escape sequence should be kept.

        CODING is not a parameter: it is taken from the expanding scope.
        CHARS_96 is assigned to, so it must be an int lvalue.

        The charset ID is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is outside '0'..127, the lookup yields a
        negative ID, or SAFE_CHARSET_P rejects the ID for CODING, the
        designation of REG becomes -2, CHARS_96 becomes -1 and nothing
        else is done.  Otherwise the ID is stored as the designation of
        REG, with JISX0201-Roman replaced by ASCII under
        CODING_ISO_FLAG_USE_ROMAN and JISX0208.1978 replaced by JISX0208
        under CODING_ISO_FLAG_USE_OLDJIS.

        The `ESC $' handler of decode_coding_iso_2022 jumps to
        invalid_code when CHARS_96 is negative after the expansion.  */
3249 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3250   do {                                                                  \
3251     int id, prev;                                                       \
3252                                                                         \
3253     if (final < '0' || final >= 128                                     \
3254         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3255         || !SAFE_CHARSET_P (coding, id))                                \
3256       {                                                                 \
3257         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3258         chars_96 = -1;                                                  \
3259         break;                                                          \
3260       }                                                                 \
3261     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3262     if (id == charset_jisx0201_roman)                                   \
3263       {                                                                 \
3264         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3265           id = charset_ascii;                                           \
3266       }                                                                 \
3267     else if (id == charset_jisx0208_1978)                               \
3268       {                                                                 \
3269         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3270           id = charset_jisx0208;                                        \
3271       }                                                                 \
3272     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3273     /* If there was an invalid designation to REG previously, and this  \
3274        designation is ASCII to REG, we should keep this designation     \
3275        sequence.  */                                                    \
3276     if (prev == -2 && id == charset_ascii)                              \
3277       chars_96 = -1;                                                    \
3278   } while (0)
3279
3280
3281 /* Handle these composition sequences (ALT: alternate char):
3282
3283    (1) relative composition: ESC 0 CHAR ... ESC 1
3284    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3285    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3286    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3287
3288    When the start sequence (ESC 0/2/3/4) is found, this annotation
3289    header is produced.
3290
3291         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3292
3293    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3294    produced until the end sequence (ESC 1) is found:
3295
3296    (1) CHAR ... CHAR
3297    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3298    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3299    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3300
3301    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3302    annotation header are updated as below:
3303
3304    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3305    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3306    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3307    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3308
3309    If an error is found while composing, the annotation header is
3310    changed to:
3311
3312         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3313
3314    and the sequence [ -2 DECODED-RULE ] is changed to the original
3315    byte sequence as below:
3316         o the original byte sequence is B: [ B -1 ]
3317         o the original byte sequence is B1 B2: [ B1 B2 ]
3318    and the sequence [ -1 -1 ] is changed to the original byte
3319    sequence:
3320         [ ESC '0' ]
3321 */
3322
3323 /* Decode a composition rule C1 and maybe one more byte from the
3324    source, and set RULE to the encoded composition rule.  If the rule
3325    is invalid, goto invalid_code.

        C1 is not a parameter: it is read from the expanding scope, which
        must also provide the label invalid_code.  RULE is assigned to and
        read back several times, so it must be a plain int variable.

        When C1 - 32 is below 81 the rule is in the one-byte old format:
        C1 - 32 is split into quotient and remainder by 9, a value of 4
        in either is translated to 10, and the pair is packed with
        COMPOSITION_ENCODE_RULE.  Otherwise the rule is in the two-byte
        new format: C1 - 32 - 81 and a second byte (fetched with
        ONE_MORE_BYTE) minus 32 are checked with
        COMPOSITION_ENCODE_RULE_VALID, packed with COMPOSITION_ENCODE_RULE,
        and 0x100 is added so that ENCODE_COMPOSITION_RULE can tell the two
        formats apart.  */
3326
3327 #define DECODE_COMPOSITION_RULE(rule)                                   \
3328   do {                                                                  \
3329     rule = c1 - 32;                                                     \
3330     if (rule < 0)                                                       \
3331       goto invalid_code;                                                \
3332     if (rule < 81)              /* old format (before ver.21) */        \
3333       {                                                                 \
3334         int gref = (rule) / 9;                                          \
3335         int nref = (rule) % 9;                                          \
3336         if (gref == 4) gref = 10;                                       \
3337         if (nref == 4) nref = 10;                                       \
3338         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3339       }                                                                 \
3340     else                        /* new format (after ver.21) */         \
3341       {                                                                 \
3342         int b;                                                          \
3343                                                                         \
3344         ONE_MORE_BYTE (b);                                              \
3345         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3346           goto invalid_code;                                            \
3347         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3348         rule += 0x100;   /* Distinguish it from the old format.  */     \
3349       }                                                                 \
3350   } while (0)
3351
     /* Turn the decoded composition rule RULE back into the byte(s) it
        was read from, store them at charbuf[idx] and charbuf[idx + 1],
        and add the number of bytes restored to new_chars.

        A RULE below 0x100 came from the one-byte old format: the byte is
        stored at charbuf[idx], charbuf[idx + 1] is set to -1, and
        new_chars grows by 1.  Otherwise (new format) two bytes are
        stored and new_chars grows by 2.

        charbuf, idx and new_chars are taken from the expanding scope
        (finish_composition).  RULE is evaluated several times, always
        before the stores, so it may name charbuf[idx + 1] itself.
        NOTE(review): the division by 12 assumes COMPOSITION_ENCODE_RULE
        packs its two references as GREF * 12 + NREF -- confirm against
        its definition.  */
3352 #define ENCODE_COMPOSITION_RULE(rule)                           \
3353   do {                                                          \
3354     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3355                                                                 \
3356     if (rule < 0x100)           /* old format */                \
3357       {                                                         \
3358         if (gref == 10) gref = 4;                               \
3359         if (nref == 10) nref = 4;                               \
3360         charbuf[idx] = 32 + gref * 9 + nref;                    \
3361         charbuf[idx + 1] = -1;                                  \
3362         new_chars++;                                            \
3363       }                                                         \
3364     else                                /* new format */        \
3365       {                                                         \
3366         charbuf[idx] = 32 + 81 + gref;                          \
3367         charbuf[idx + 1] = 32 + nref;                           \
3368         new_chars += 2;                                         \
3369       }                                                         \
3370   } while (0)
3371
3372 /* Finish the current composition as invalid.

        CHARBUF must point just past the last element stored for the
        composition; the cmp_status->length elements before it are
        rewritten in place so that they hold the original byte sequence
        again:
          o the first five slots (where the annotation header was
            produced) become the starting ESC sequence followed by
            -2, 0, -1;
          o for methods from COMPOSITION_WITH_RULE upward, each
            [ -2 RULE ] pair becomes the original rule byte(s) (see
            ENCODE_COMPOSITION_RULE) and each -1 element together with
            its successor becomes ESC '0'.

        Set cmp_status->state to COMPOSING_NO.  Return cmp_status->nchars
        plus the number of rule bytes and ESC '0' elements restored;
        MAYBE_FINISH_COMPOSITION adds this value to char_offset.  */
3373
3374 static int
3375 finish_composition (int *charbuf, struct composition_status *cmp_status)
3376 {
       /* Negative index, relative to CHARBUF, of the first element that
          belongs to the composition.  */
3377   int idx = - cmp_status->length;
3378   int new_chars;
3379
3380   /* Recover the original ESC sequence */
3381   charbuf[idx++] = ISO_CODE_ESC;
3382   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3383                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3384                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3385                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3386                     : '4');
       /* Fill the three remaining header slots.  NOTE(review): presumably
          [ -2 0 ] and [ -1 ] are skipped as empty annotations by whatever
          consumes charbuf -- confirm.  */
3387   charbuf[idx++] = -2;
3388   charbuf[idx++] = 0;
3389   charbuf[idx++] = -1;
3390   new_chars = cmp_status->nchars;
       /* Scan the stored elements for rule and ESC '0' markers.
          NOTE(review): assumes COMPOSITION_RELATIVE is the only method
          ordered below COMPOSITION_WITH_RULE -- confirm against the enum.  */
3391   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3392     for (; idx < 0; idx++)
3393       {
3394         int elt = charbuf[idx];
3395
3396         if (elt == -2)
3397           {
                 /* Overwrites charbuf[idx] and charbuf[idx + 1] and bumps
                    new_chars; the extra idx++ skips the second slot.  */
3398             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3399             idx++;
3400           }
3401         else if (elt == -1)
3402           {
                 /* This slot and the next one become ESC '0'.  */
3403             charbuf[idx++] = ISO_CODE_ESC;
3404             charbuf[idx] = '0';
3405             new_chars += 2;
3406           }
3407       }
3408   cmp_status->state = COMPOSING_NO;
3409   return new_chars;
3410 }
3411
3412 /* If characters are under composition (cmp_status->state is not
        COMPOSING_NO), finish the composition as invalid with
        finish_composition and add the count it returns to char_offset.
        cmp_status, charbuf and char_offset are taken from the expanding
        scope.  */
3413 #define MAYBE_FINISH_COMPOSITION()                              \
3414   do {                                                          \
3415     if (cmp_status->state != COMPOSING_NO)                      \
3416       char_offset += finish_composition (charbuf, cmp_status);  \
3417   } while (0)
3418
3419 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3420
3421    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3422    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3423    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3424    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3425
3426    Produce this annotation sequence now:
3427
3428    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        (Five slots, matching the `0, 0, method' arguments passed to
        ADD_COMPOSITION_DATA below and the five header slots that
        finish_composition rewrites.  NOTE(review): confirm against the
        definitions of ADD_COMPOSITION_DATA and MAX_ANNOTATION_LENGTH.)

        C1 is the byte following ESC; the call site only passes '0', '2',
        '3' or '4'.  cmp_status, charbuf, char_offset and coding are taken
        from the expanding scope.

        Special case: ESC 0 seen while the components of an altchar
        composition are being read (state COMPOSING_COMPONENT_CHAR with
        COMPOSITION_WITH_ALTCHARS, or state COMPOSING_COMPONENT_RULE with
        COMPOSITION_WITH_RULE_ALTCHARS) does not start a new composition.
        It ends the component list: [ -1 -1 ] is stored, length grows by
        2, and the state becomes COMPOSING_CHAR.

        In every other case a composition still in progress is first
        finished as invalid, then method, state, length, nchars and ncomps
        are reset for the new composition: '0' and '2' start in state
        COMPOSING_CHAR, '3' and '4' in state COMPOSING_COMPONENT_CHAR.
3429 */
3430
3431 #define DECODE_COMPOSITION_START(c1)                                       \
3432   do {                                                                     \
3433     if (c1 == '0'                                                          \
3434         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3435              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3436             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3437                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3438       {                                                                    \
3439         *charbuf++ = -1;                                                   \
3440         *charbuf++= -1;                                                    \
3441         cmp_status->state = COMPOSING_CHAR;                                \
3442         cmp_status->length += 2;                                           \
3443       }                                                                    \
3444     else                                                                   \
3445       {                                                                    \
3446         MAYBE_FINISH_COMPOSITION ();                                       \
3447         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3448                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3449                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3450                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3451         cmp_status->state                                                  \
3452           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3453         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3454         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3455         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3456         coding->annotated = 1;                                             \
3457       }                                                                    \
3458   } while (0)
3459
3460
3461 /* Handle composition end sequence ESC 1.

        cmp_status, charbuf and char_offset are taken from the expanding
        scope, which must also provide the label invalid_code.

        The sequence is rejected -- the composition is finished as invalid
        and control jumps to invalid_code -- when no CHAR has been stored
        (nchars is 0) or when ESC 1 arrives in the wrong state: for
        COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR, for
        every other method the state must be COMPOSING_CHAR.

        Otherwise the annotation header at charbuf[-length] is finalized:
        its first slot (-LENGTH) is decreased by ncomps + 2 for
        COMPOSITION_WITH_ALTCHARS or by ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS, its third slot receives nchars,
        char_offset grows by nchars, and the state becomes COMPOSING_NO.  */
3462
3463 #define DECODE_COMPOSITION_END()                                        \
3464   do {                                                                  \
3465     if (cmp_status->nchars == 0                                         \
3466         || ((cmp_status->state == COMPOSING_CHAR)                       \
3467             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3468       {                                                                 \
3469         MAYBE_FINISH_COMPOSITION ();                                    \
3470         goto invalid_code;                                              \
3471       }                                                                 \
3472     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3473       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3474     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3475       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3476     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3477     char_offset += cmp_status->nchars;                                  \
3478     cmp_status->state = COMPOSING_NO;                                   \
3479   } while (0)
3480
3481 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The pair [ -2 RULE ] is appended through the charbuf pointer of
        the expanding scope, cmp_status->length grows by 2, and
        cmp_status->state is decremented.  NOTE(review): the decrement
        presumably undoes the increment done by STORE_COMPOSITION_CHAR,
        i.e. it moves from a ..._RULE state back to the matching ..._CHAR
        state; this depends on the order of the state enum -- confirm.  */
3482
3483 #define STORE_COMPOSITION_RULE(rule)    \
3484   do {                                  \
3485     *charbuf++ = -2;                    \
3486     *charbuf++ = rule;                  \
3487     cmp_status->length += 2;            \
3488     cmp_status->state--;                \
3489   } while (0)
3490
3491 /* Store a composed char or a component char C in charbuf, and update
3492    cmp_status.

        C is appended through the charbuf pointer of the expanding scope
        and cmp_status->length grows by 1.  The character is counted in
        nchars when the state is COMPOSING_CHAR and in ncomps otherwise.

        The state is then incremented for COMPOSITION_WITH_RULE, and for
        COMPOSITION_WITH_RULE_ALTCHARS while the state is
        COMPOSING_COMPONENT_CHAR.  NOTE(review): the increment presumably
        selects the ..._RULE state that makes the decoder read a rule
        next; this depends on the order of the state enum -- confirm.  */
3493
3494 #define STORE_COMPOSITION_CHAR(c)                                       \
3495   do {                                                                  \
3496     *charbuf++ = (c);                                                   \
3497     cmp_status->length++;                                               \
3498     if (cmp_status->state == COMPOSING_CHAR)                            \
3499       cmp_status->nchars++;                                             \
3500     else                                                                \
3501       cmp_status->ncomps++;                                             \
3502     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3503         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3504             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3505       cmp_status->state++;                                              \
3506   } while (0)
3507
3508
3509 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3510
3511 static void
3512 decode_coding_iso_2022 (struct coding_system *coding)
3513 {
3514   const unsigned char *src = coding->source + coding->consumed;
3515   const unsigned char *src_end = coding->source + coding->src_bytes;
3516   const unsigned char *src_base;
3517   int *charbuf = coding->charbuf + coding->charbuf_used;
3518   /* We may produce two annotations (charset and composition) in one
3519      loop and one more charset annotation at the end.  */
3520   int *charbuf_end
3521     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3522   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3523   bool multibytep = coding->src_multibyte;
3524   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3525   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3526   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3527   int charset_id_2, charset_id_3;
3528   struct charset *charset;
3529   int c;
3530   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3531   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3532   ptrdiff_t char_offset = coding->produced_char;
3533   ptrdiff_t last_offset = char_offset;
3534   int last_id = charset_ascii;
3535   bool eol_dos
3536     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3537   int byte_after_cr = -1;
3538   int i;
3539
3540   setup_iso_safe_charsets (attrs);
3541   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3542
3543   if (cmp_status->state != COMPOSING_NO)
3544     {
3545       if (charbuf_end - charbuf < cmp_status->length)
3546         emacs_abort ();
3547       for (i = 0; i < cmp_status->length; i++)
3548         *charbuf++ = cmp_status->carryover[i];
3549       coding->annotated = 1;
3550     }
3551
3552   while (1)
3553     {
3554       int c1, c2, c3;
3555
3556       src_base = src;
3557       consumed_chars_base = consumed_chars;
3558
3559       if (charbuf >= charbuf_end)
3560         {
3561           if (byte_after_cr >= 0)
3562             src_base--;
3563           break;
3564         }
3565
3566       if (byte_after_cr >= 0)
3567         c1 = byte_after_cr, byte_after_cr = -1;
3568       else
3569         ONE_MORE_BYTE (c1);
3570       if (c1 < 0)
3571         goto invalid_code;
3572
3573       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3574         {
3575           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3576           char_offset++;
3577           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3578           continue;
3579         }
3580
3581       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3582         {
3583           if (c1 == ISO_CODE_ESC)
3584             {
3585               if (src + 1 >= src_end)
3586                 goto no_more_source;
3587               *charbuf++ = ISO_CODE_ESC;
3588               char_offset++;
3589               if (src[0] == '%' && src[1] == '@')
3590                 {
3591                   src += 2;
3592                   consumed_chars += 2;
3593                   char_offset += 2;
3594                   /* We are sure charbuf can contain two more chars. */
3595                   *charbuf++ = '%';
3596                   *charbuf++ = '@';
3597                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3598                 }
3599             }
3600           else
3601             {
3602               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3603               char_offset++;
3604             }
3605           continue;
3606         }
3607
3608       if ((cmp_status->state == COMPOSING_RULE
3609            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3610           && c1 != ISO_CODE_ESC)
3611         {
3612           int rule;
3613
3614           DECODE_COMPOSITION_RULE (rule);
3615           STORE_COMPOSITION_RULE (rule);
3616           continue;
3617         }
3618
3619       /* We produce at most one character.  */
3620       switch (iso_code_class [c1])
3621         {
3622         case ISO_0x20_or_0x7F:
3623           if (charset_id_0 < 0
3624               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3625             /* This is SPACE or DEL.  */
3626             charset = CHARSET_FROM_ID (charset_ascii);
3627           else
3628             charset = CHARSET_FROM_ID (charset_id_0);
3629           break;
3630
3631         case ISO_graphic_plane_0:
3632           if (charset_id_0 < 0)
3633             charset = CHARSET_FROM_ID (charset_ascii);
3634           else
3635             charset = CHARSET_FROM_ID (charset_id_0);
3636           break;
3637
3638         case ISO_0xA0_or_0xFF:
3639           if (charset_id_1 < 0
3640               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3641               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3642             goto invalid_code;
3643           /* This is a graphic character, we fall down ... */
3644
3645         case ISO_graphic_plane_1:
3646           if (charset_id_1 < 0)
3647             goto invalid_code;
3648           charset = CHARSET_FROM_ID (charset_id_1);
3649           break;
3650
3651         case ISO_control_0:
3652           if (eol_dos && c1 == '\r')
3653             ONE_MORE_BYTE (byte_after_cr);
3654           MAYBE_FINISH_COMPOSITION ();
3655           charset = CHARSET_FROM_ID (charset_ascii);
3656           break;
3657
3658         case ISO_control_1:
3659           goto invalid_code;
3660
3661         case ISO_shift_out:
3662           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3663               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3664             goto invalid_code;
3665           CODING_ISO_INVOCATION (coding, 0) = 1;
3666           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3667           continue;
3668
3669         case ISO_shift_in:
3670           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3671             goto invalid_code;
3672           CODING_ISO_INVOCATION (coding, 0) = 0;
3673           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3674           continue;
3675
3676         case ISO_single_shift_2_7:
3677           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3678             goto invalid_code;
3679         case ISO_single_shift_2:
3680           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3681             goto invalid_code;
3682           /* SS2 is handled as an escape sequence of ESC 'N' */
3683           c1 = 'N';
3684           goto label_escape_sequence;
3685
3686         case ISO_single_shift_3:
3687           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3688             goto invalid_code;
3689           /* SS2 is handled as an escape sequence of ESC 'O' */
3690           c1 = 'O';
3691           goto label_escape_sequence;
3692
3693         case ISO_control_sequence_introducer:
3694           /* CSI is handled as an escape sequence of ESC '[' ...  */
3695           c1 = '[';
3696           goto label_escape_sequence;
3697
3698         case ISO_escape:
3699           ONE_MORE_BYTE (c1);
3700         label_escape_sequence:
3701           /* Escape sequences handled here are invocation,
3702              designation, direction specification, and character
3703              composition specification.  */
3704           switch (c1)
3705             {
3706             case '&':           /* revision of following character set */
3707               ONE_MORE_BYTE (c1);
3708               if (!(c1 >= '@' && c1 <= '~'))
3709                 goto invalid_code;
3710               ONE_MORE_BYTE (c1);
3711               if (c1 != ISO_CODE_ESC)
3712                 goto invalid_code;
3713               ONE_MORE_BYTE (c1);
3714               goto label_escape_sequence;
3715
3716             case '$':           /* designation of 2-byte character set */
3717               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3718                 goto invalid_code;
3719               {
3720                 int reg, chars96;
3721
3722                 ONE_MORE_BYTE (c1);
3723                 if (c1 >= '@' && c1 <= 'B')
3724                   {     /* designation of JISX0208.1978, GB2312.1980,
3725                            or JISX0208.1980 */
3726                     reg = 0, chars96 = 0;
3727                   }
3728                 else if (c1 >= 0x28 && c1 <= 0x2B)
3729                   { /* designation of DIMENSION2_CHARS94 character set */
3730                     reg = c1 - 0x28, chars96 = 0;
3731                     ONE_MORE_BYTE (c1);
3732                   }
3733                 else if (c1 >= 0x2C && c1 <= 0x2F)
3734                   { /* designation of DIMENSION2_CHARS96 character set */
3735                     reg = c1 - 0x2C, chars96 = 1;
3736                     ONE_MORE_BYTE (c1);
3737                   }
3738                 else
3739                   goto invalid_code;
3740                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3741                 /* We must update these variables now.  */
3742                 if (reg == 0)
3743                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3744                 else if (reg == 1)
3745                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3746                 if (chars96 < 0)
3747                   goto invalid_code;
3748               }
3749               continue;
3750
3751             case 'n':           /* invocation of locking-shift-2 */
3752               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3753                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3754                 goto invalid_code;
3755               CODING_ISO_INVOCATION (coding, 0) = 2;
3756               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3757               continue;
3758
3759             case 'o':           /* invocation of locking-shift-3 */
3760               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3761                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3762                 goto invalid_code;
3763               CODING_ISO_INVOCATION (coding, 0) = 3;
3764               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3765               continue;
3766
3767             case 'N':           /* invocation of single-shift-2 */
3768               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3769                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3770                 goto invalid_code;
3771               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3772               if (charset_id_2 < 0)
3773                 charset = CHARSET_FROM_ID (charset_ascii);
3774               else
3775                 charset = CHARSET_FROM_ID (charset_id_2);
3776               ONE_MORE_BYTE (c1);
3777               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3778                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3779                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3780                           ? c1 >= 0x80 : c1 < 0x80)))
3781                 goto invalid_code;
3782               break;
3783
3784             case 'O':           /* invocation of single-shift-3 */
3785               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3786                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3787                 goto invalid_code;
3788               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3789               if (charset_id_3 < 0)
3790                 charset = CHARSET_FROM_ID (charset_ascii);
3791               else
3792                 charset = CHARSET_FROM_ID (charset_id_3);
3793               ONE_MORE_BYTE (c1);
3794               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3795                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3796                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3797                           ? c1 >= 0x80 : c1 < 0x80)))
3798                 goto invalid_code;
3799               break;
3800
3801             case '0': case '2': case '3': case '4': /* start composition */
3802               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3803                 goto invalid_code;
3804               if (last_id != charset_ascii)
3805                 {
3806                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3807                   last_id = charset_ascii;
3808                   last_offset = char_offset;
3809                 }
3810               DECODE_COMPOSITION_START (c1);
3811               continue;
3812
3813             case '1':           /* end composition */
3814               if (cmp_status->state == COMPOSING_NO)
3815                 goto invalid_code;
3816               DECODE_COMPOSITION_END ();
3817               continue;
3818
3819             case '[':           /* specification of direction */
3820               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3821                 goto invalid_code;
3822               /* For the moment, nested direction is not supported.
3823                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3824                  left-to-right, and nonzero means right-to-left.  */
3825               ONE_MORE_BYTE (c1);
3826               switch (c1)
3827                 {
3828                 case ']':       /* end of the current direction */
3829                   coding->mode &= ~CODING_MODE_DIRECTION;
3830
3831                 case '0':       /* end of the current direction */
3832                 case '1':       /* start of left-to-right direction */
3833                   ONE_MORE_BYTE (c1);
3834                   if (c1 == ']')
3835                     coding->mode &= ~CODING_MODE_DIRECTION;
3836                   else
3837                     goto invalid_code;
3838                   break;
3839
3840                 case '2':       /* start of right-to-left direction */
3841                   ONE_MORE_BYTE (c1);
3842                   if (c1 == ']')
3843                     coding->mode |= CODING_MODE_DIRECTION;
3844                   else
3845                     goto invalid_code;
3846                   break;
3847
3848                 default:
3849                   goto invalid_code;
3850                 }
3851               continue;
3852
3853             case '%':
3854               ONE_MORE_BYTE (c1);
3855               if (c1 == '/')
3856                 {
3857                   /* CTEXT extended segment:
3858                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3859                      We keep these bytes as is for the moment.
3860                      They may be decoded by post-read-conversion.  */
3861                   int dim, M, L;
3862                   int size;
3863
3864                   ONE_MORE_BYTE (dim);
3865                   if (dim < '0' || dim > '4')
3866                     goto invalid_code;
3867                   ONE_MORE_BYTE (M);
3868                   if (M < 128)
3869                     goto invalid_code;
3870                   ONE_MORE_BYTE (L);
3871                   if (L < 128)
3872                     goto invalid_code;
3873                   size = ((M - 128) * 128) + (L - 128);
3874                   if (charbuf + 6 > charbuf_end)
3875                     goto break_loop;
3876                   *charbuf++ = ISO_CODE_ESC;
3877                   *charbuf++ = '%';
3878                   *charbuf++ = '/';
3879                   *charbuf++ = dim;
3880                   *charbuf++ = BYTE8_TO_CHAR (M);
3881                   *charbuf++ = BYTE8_TO_CHAR (L);
3882                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3883                 }
3884               else if (c1 == 'G')
3885                 {
3886                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3887                      ESC % G --UTF-8-BYTES-- ESC % @
3888                      We keep these bytes as is for the moment.
3889                      They may be decoded by post-read-conversion.  */
3890                   if (charbuf + 3 > charbuf_end)
3891                     goto break_loop;
3892                   *charbuf++ = ISO_CODE_ESC;
3893                   *charbuf++ = '%';
3894                   *charbuf++ = 'G';
3895                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3896                 }
3897               else
3898                 goto invalid_code;
3899               continue;
3900               break;
3901
3902             default:
3903               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3904                 goto invalid_code;
3905               {
3906                 int reg, chars96;
3907
3908                 if (c1 >= 0x28 && c1 <= 0x2B)
3909                   { /* designation of DIMENSION1_CHARS94 character set */
3910                     reg = c1 - 0x28, chars96 = 0;
3911                     ONE_MORE_BYTE (c1);
3912                   }
3913                 else if (c1 >= 0x2C && c1 <= 0x2F)
3914                   { /* designation of DIMENSION1_CHARS96 character set */
3915                     reg = c1 - 0x2C, chars96 = 1;
3916                     ONE_MORE_BYTE (c1);
3917                   }
3918                 else
3919                   goto invalid_code;
3920                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3921                 /* We must update these variables now.  */
3922                 if (reg == 0)
3923                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3924                 else if (reg == 1)
3925                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3926                 if (chars96 < 0)
3927                   goto invalid_code;
3928               }
3929               continue;
3930             }
3931           break;
3932
3933         default:
3934           emacs_abort ();
3935         }
3936
3937       if (cmp_status->state == COMPOSING_NO
3938           && charset->id != charset_ascii
3939           && last_id != charset->id)
3940         {
3941           if (last_id != charset_ascii)
3942             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3943           last_id = charset->id;
3944           last_offset = char_offset;
3945         }
3946
3947       /* Now we know CHARSET and 1st position code C1 of a character.
3948          Produce a decoded character while getting 2nd and 3rd
3949          position codes C2, C3 if necessary.  */
3950       if (CHARSET_DIMENSION (charset) > 1)
3951         {
3952           ONE_MORE_BYTE (c2);
3953           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3954               || ((c1 & 0x80) != (c2 & 0x80)))
3955             /* C2 is not in a valid range.  */
3956             goto invalid_code;
3957           if (CHARSET_DIMENSION (charset) == 2)
3958             c1 = (c1 << 8) | c2;
3959           else
3960             {
3961               ONE_MORE_BYTE (c3);
3962               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3963                   || ((c1 & 0x80) != (c3 & 0x80)))
3964                 /* C3 is not in a valid range.  */
3965                 goto invalid_code;
3966               c1 = (c1 << 16) | (c2 << 8) | c2;
3967             }
3968         }
3969       c1 &= 0x7F7F7F;
3970       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3971       if (c < 0)
3972         {
3973           MAYBE_FINISH_COMPOSITION ();
3974           for (; src_base < src; src_base++, char_offset++)
3975             {
3976               if (ASCII_BYTE_P (*src_base))
3977                 *charbuf++ = *src_base;
3978               else
3979                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3980             }
3981         }
3982       else if (cmp_status->state == COMPOSING_NO)
3983         {
3984           *charbuf++ = c;
3985           char_offset++;
3986         }
3987       else if ((cmp_status->state == COMPOSING_CHAR
3988                 ? cmp_status->nchars
3989                 : cmp_status->ncomps)
3990                >= MAX_COMPOSITION_COMPONENTS)
3991         {
3992           /* Too long composition.  */
3993           MAYBE_FINISH_COMPOSITION ();
3994           *charbuf++ = c;
3995           char_offset++;
3996         }
3997       else
3998         STORE_COMPOSITION_CHAR (c);
3999       continue;
4000
4001     invalid_code:
4002       MAYBE_FINISH_COMPOSITION ();
4003       src = src_base;
4004       consumed_chars = consumed_chars_base;
4005       ONE_MORE_BYTE (c);
4006       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4007       char_offset++;
4008       coding->errors++;
4009       /* Reset the invocation and designation status to the safest
4010          one; i.e. designate ASCII to the graphic register 0, and
4011          invoke that register to the graphic plane 0.  This typically
4012          helps the case that an designation sequence for ASCII "ESC (
4013          B" is somehow broken (e.g. broken by a newline).  */
4014       CODING_ISO_INVOCATION (coding, 0) = 0;
4015       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4016       charset_id_0 = charset_ascii;
4017       continue;
4018
4019     break_loop:
4020       break;
4021     }
4022
4023  no_more_source:
4024   if (cmp_status->state != COMPOSING_NO)
4025     {
4026       if (coding->mode & CODING_MODE_LAST_BLOCK)
4027         MAYBE_FINISH_COMPOSITION ();
4028       else
4029         {
4030           charbuf -= cmp_status->length;
4031           for (i = 0; i < cmp_status->length; i++)
4032             cmp_status->carryover[i] = charbuf[i];
4033         }
4034     }
4035   else if (last_id != charset_ascii)
4036     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4037   coding->consumed_char += consumed_chars_base;
4038   coding->consumed = src_base - coding->source;
4039   coding->charbuf_used = charbuf - coding->charbuf;
4040 }
4041
4042
4043 /* ISO2022 encoding stuff.  */
4044
/*
   Just saying "ISO2022" is not enough to specify an encoding; more
   details are needed.  In Emacs, each coding system of the ISO2022
   family has the following specifications:
        1. Initial designation to G0 thru G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use Single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JISX0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
   defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
   details.
*/
4063
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG, advance DST, and record the designation in CODING.

   The sequence is, in order:
     - ESC & <'@' + revision>, only when CODING has
       CODING_ISO_FLAG_REVISION and CHARSET's revision is >= 0;
     - ESC;
     - '$' when CHARSET's dimension is not 1;
     - an intermediate byte chosen by REG from "()*+" (94-char sets)
       or ",-./" (96-char sets);
     - the final byte of CHARSET.
   The intermediate byte is omitted (short form) only for a
   multi-dimension 94-char set designated to register 0 whose final
   byte is '@', 'A' or 'B', and only when CODING does not have
   CODING_ISO_FLAG_LONG_FORM.

   The EMIT_* macros used here work on variables of the expansion
   site; see their definitions for what must be in scope.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    int is_96 = CHARSET_ISO_CHARS_96 (charset) ? 1 : 0;                 \
    int multi_dim = CHARSET_DIMENSION (charset) != 1;                   \
    const char *intermediates = is_96 ? ",-./" : "()*+";                \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    /* Optional revision announcement.  */                              \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (multi_dim)                                                      \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
                                                                        \
    /* Everything except the short form carries an intermediate.  */    \
    if (! multi_dim                                                     \
        || is_96                                                        \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || final_char < '@'                                             \
        || final_char > 'B')                                            \
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
                                                                        \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4111
4112
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  Both read the variable `coding' of the expansion
   site and leave its single-shifting state set to 1.  */

/* Single-shift-2: the two bytes ESC N when `coding' has
   CODING_ISO_FLAG_SEVEN_BITS, otherwise the single byte
   ISO_CODE_SS2.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4125
4126
/* Single-shift-3: the two bytes ESC O when `coding' (a variable of
   the expansion site) has CODING_ISO_FLAG_SEVEN_BITS, otherwise the
   single byte ISO_CODE_SS3.  The single-shifting state of `coding' is
   set to 1 afterwards.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4135
4136
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in `coding' (a variable of the expansion site) which
   graphic register is now invoked to graphic plane 0.  */

/* Shift-in: graphic register 0 becomes the one invoked to plane 0,
   announced by the single byte ISO_CODE_SI.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4146
4147
/* Shift-out: graphic register 1 becomes the one invoked to plane 0 of
   `coding' (a variable of the expansion site), announced by the
   single byte ISO_CODE_SO.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4153
4154
/* Locking-shift-2: graphic register 2 becomes the one invoked to
   plane 0 of `coding' (a variable of the expansion site), announced
   by the two bytes ESC n.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4160
4161
/* Locking-shift-3: emit the two bytes ESC o and record in `coding' (a
   variable of the expansion site) that graphic register 3 is now
   invoked to graphic plane 0.

   The final byte must be 'o'.  ESC n is locking-shift-2, so emitting
   it here would make a reader invoke G2 while the encoder goes on
   assuming G3, and every following character of the G3 charset would
   be decoded against the wrong charset.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4167
4168
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when `coding' has
   CODING_ISO_FLAG_USE_ROMAN and CHARSET is ASCII, it is replaced by
   the JISX0201-Roman charset before encoding.

   C1 may be any expression; it is parenthesized at each use (as in
   ENCODE_ISO_CHARACTER_DIMENSION2) so that operators of lower
   precedence than `&' and `|' inside it are not re-associated with
   the masking done here.

   The expansion uses the variables `coding', `dst' and
   `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted: this one character is      \
           taken from the shifted register, then the shift ends.  */    \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: 7-bit code.  */                  \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: code with the 8th bit set.  */   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4211
4212
/* Emit the two position codes C1 and C2 of a DIMENSION2 character of
   CHARSET, preceded by whatever designation and invocation codes are
   needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when `coding' has
   CODING_ISO_FLAG_USE_OLDJIS and CHARSET is JISX0208, it is replaced
   by the JISX0208-1978 charset.  The expansion uses the variables
   `coding', `dst' and `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
        && id == charset_jisx0208)                                      \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* Until a single-shift is pending or CHARSET is invoked to one    \
       of the graphic planes, keep designating/invoking it.  */         \
    while (! CODING_ISO_SINGLE_SHIFTING (coding)                        \
           && id != CODING_ISO_INVOKED_CHARSET (coding, 0)              \
           && id != CODING_ISO_INVOKED_CHARSET (coding, 1))             \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The pending single-shift covers exactly this character.  */  \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
          }                                                             \
        else                                                            \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
          }                                                             \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Graphic plane 0: 7-bit codes.  */                            \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Otherwise the loop above ended on graphic plane 1.  */       \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
      }                                                                 \
  } while (0)
4255
4256
/* Encode character C of CHARSET: obtain its code point with
   CODING_ENCODE_CHAR, then hand it to the DIMENSION1 emitter when
   CHARSET has dimension 1, and otherwise to the DIMENSION2 emitter as
   the pair (code >> 8, code & 0xFF).  Uses the variables `coding',
   `dst' and `dst_end' of the expansion site.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,             \
                                         code & 0xFF);                     \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4267
4268
4269 /* Produce designation and invocation codes at a place pointed by DST
4270    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4271    Return new DST.  */
4272
4273 static unsigned char *
4274 encode_invocation_designation (struct charset *charset,
4275                                struct coding_system *coding,
4276                                unsigned char *dst, ptrdiff_t *p_nchars)
4277 {
4278   bool multibytep = coding->dst_multibyte;
4279   ptrdiff_t produced_chars = *p_nchars;
4280   int reg;                      /* graphic register number */
4281   int id = CHARSET_ID (charset);
4282
4283   /* At first, check designations.  */
4284   for (reg = 0; reg < 4; reg++)
4285     if (id == CODING_ISO_DESIGNATION (coding, reg))
4286       break;
4287
4288   if (reg >= 4)
4289     {
4290       /* CHARSET is not yet designated to any graphic registers.  */
4291       /* At first check the requested designation.  */
4292       reg = CODING_ISO_REQUEST (coding, id);
4293       if (reg < 0)
4294         /* Since CHARSET requests no special designation, designate it
4295            to graphic register 0.  */
4296         reg = 0;
4297
4298       ENCODE_DESIGNATION (charset, reg, coding);
4299     }
4300
4301   if (CODING_ISO_INVOCATION (coding, 0) != reg
4302       && CODING_ISO_INVOCATION (coding, 1) != reg)
4303     {
4304       /* Since the graphic register REG is not invoked to any graphic
4305          planes, invoke it to graphic plane 0.  */
4306       switch (reg)
4307         {
4308         case 0:                 /* graphic register 0 */
4309           ENCODE_SHIFT_IN;
4310           break;
4311
4312         case 1:                 /* graphic register 1 */
4313           ENCODE_SHIFT_OUT;
4314           break;
4315
4316         case 2:                 /* graphic register 2 */
4317           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4318             ENCODE_SINGLE_SHIFT_2;
4319           else
4320             ENCODE_LOCKING_SHIFT_2;
4321           break;
4322
4323         case 3:                 /* graphic register 3 */
4324           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4325             ENCODE_SINGLE_SHIFT_3;
4326           else
4327             ENCODE_LOCKING_SHIFT_3;
4328           break;
4329         }
4330     }
4331
4332   *p_nchars = produced_chars;
4333   return dst;
4334 }
4335
4336
/* Bring `coding' (a variable of the expansion site) back to its
   initial ISO-2022 state, emitting the codes that do so: a shift-in
   when graphic plane 0 does not currently invoke register 0, then a
   designation for every graphic register that has an initial charset
   (initial id >= 0) different from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        struct charset *charset;                                        \
                                                                        \
        /* Nothing to do for a register with no initial charset, or    \
           one that already holds it.  */                               \
        if (CODING_ISO_INITIAL (coding, reg) < 0                        \
            || (CODING_ISO_DESIGNATION (coding, reg)                    \
                == CODING_ISO_INITIAL (coding, reg)))                   \
          continue;                                                     \
                                                                        \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4355
4356
4357 /* Produce designation sequences of charsets in the line started from
4358    CHARBUF to a place pointed by DST, and return the number of
4359    produced bytes.  DST should not directly point a buffer text area
4360    which may be relocated by char_charset call.
4361
4362    If the current block ends before any end-of-line, we may fail to
4363    find all the necessary designations.  */
4364
4365 static ptrdiff_t
4366 encode_designation_at_bol (struct coding_system *coding,
4367                            int *charbuf, int *charbuf_end,
4368                            unsigned char *dst)
4369 {
4370   unsigned char *orig = dst;
4371   struct charset *charset;
4372   /* Table of charsets to be designated to each graphic register.  */
4373   int r[4];
4374   int c, found = 0, reg;
4375   ptrdiff_t produced_chars = 0;
4376   bool multibytep = coding->dst_multibyte;
4377   Lisp_Object attrs;
4378   Lisp_Object charset_list;
4379
4380   attrs = CODING_ID_ATTRS (coding->id);
4381   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4382   if (EQ (charset_list, Qiso_2022))
4383     charset_list = Viso_2022_charset_list;
4384
4385   for (reg = 0; reg < 4; reg++)
4386     r[reg] = -1;
4387
4388   while (charbuf < charbuf_end && found < 4)
4389     {
4390       int id;
4391
4392       c = *charbuf++;
4393       if (c == '\n')
4394         break;
4395       charset = char_charset (c, charset_list, NULL);
4396       id = CHARSET_ID (charset);
4397       reg = CODING_ISO_REQUEST (coding, id);
4398       if (reg >= 0 && r[reg] < 0)
4399         {
4400           found++;
4401           r[reg] = id;
4402         }
4403     }
4404
4405   if (found)
4406     {
4407       for (reg = 0; reg < 4; reg++)
4408         if (r[reg] >= 0
4409             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4410           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4411     }
4412
4413   return dst - orig;
4414 }
4415
4416 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4417
4418 static bool
4419 encode_coding_iso_2022 (struct coding_system *coding)
4420 {
4421   bool multibytep = coding->dst_multibyte;
4422   int *charbuf = coding->charbuf;
4423   int *charbuf_end = charbuf + coding->charbuf_used;
4424   unsigned char *dst = coding->destination + coding->produced;
4425   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4426   int safe_room = 16;
4427   bool bol_designation
4428     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4429        && CODING_ISO_BOL (coding));
4430   ptrdiff_t produced_chars = 0;
4431   Lisp_Object attrs, eol_type, charset_list;
4432   bool ascii_compatible;
4433   int c;
4434   int preferred_charset_id = -1;
4435
4436   CODING_GET_INFO (coding, attrs, charset_list);
4437   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4438   if (VECTORP (eol_type))
4439     eol_type = Qunix;
4440
4441   setup_iso_safe_charsets (attrs);
4442   /* Charset list may have been changed.  */
4443   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4444   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4445
4446   ascii_compatible
4447     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4448        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4449                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4450
4451   while (charbuf < charbuf_end)
4452     {
4453       ASSURE_DESTINATION (safe_room);
4454
4455       if (bol_designation)
4456         {
4457           /* We have to produce designation sequences if any now.  */
4458           unsigned char desig_buf[16];
4459           int nbytes;
4460           ptrdiff_t offset;
4461
4462           charset_map_loaded = 0;
4463           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4464                                               desig_buf);
4465           if (charset_map_loaded
4466               && (offset = coding_change_destination (coding)))
4467             {
4468               dst += offset;
4469               dst_end += offset;
4470             }
4471           memcpy (dst, desig_buf, nbytes);
4472           dst += nbytes;
4473           /* We are sure that designation sequences are all ASCII bytes.  */
4474           produced_chars += nbytes;
4475           bol_designation = 0;
4476           ASSURE_DESTINATION (safe_room);
4477         }
4478
4479       c = *charbuf++;
4480
4481       if (c < 0)
4482         {
4483           /* Handle an annotation.  */
4484           switch (*charbuf)
4485             {
4486             case CODING_ANNOTATE_COMPOSITION_MASK:
4487               /* Not yet implemented.  */
4488               break;
4489             case CODING_ANNOTATE_CHARSET_MASK:
4490               preferred_charset_id = charbuf[2];
4491               if (preferred_charset_id >= 0
4492                   && NILP (Fmemq (make_number (preferred_charset_id),
4493                                   charset_list)))
4494                 preferred_charset_id = -1;
4495               break;
4496             default:
4497               emacs_abort ();
4498             }
4499           charbuf += -c - 1;
4500           continue;
4501         }
4502
4503       /* Now encode the character C.  */
4504       if (c < 0x20 || c == 0x7F)
4505         {
4506           if (c == '\n'
4507               || (c == '\r' && EQ (eol_type, Qmac)))
4508             {
4509               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4510                 ENCODE_RESET_PLANE_AND_REGISTER ();
4511               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4512                 {
4513                   int i;
4514
4515                   for (i = 0; i < 4; i++)
4516                     CODING_ISO_DESIGNATION (coding, i)
4517                       = CODING_ISO_INITIAL (coding, i);
4518                 }
4519               bol_designation = ((CODING_ISO_FLAGS (coding)
4520                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4521                                  != 0);
4522             }
4523           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4524             ENCODE_RESET_PLANE_AND_REGISTER ();
4525           EMIT_ONE_ASCII_BYTE (c);
4526         }
4527       else if (ASCII_CHAR_P (c))
4528         {
4529           if (ascii_compatible)
4530             EMIT_ONE_ASCII_BYTE (c);
4531           else
4532             {
4533               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4534               ENCODE_ISO_CHARACTER (charset, c);
4535             }
4536         }
4537       else if (CHAR_BYTE8_P (c))
4538         {
4539           c = CHAR_TO_BYTE8 (c);
4540           EMIT_ONE_BYTE (c);
4541         }
4542       else
4543         {
4544           struct charset *charset;
4545
4546           if (preferred_charset_id >= 0)
4547             {
4548               bool result;
4549
4550               charset = CHARSET_FROM_ID (preferred_charset_id);
4551               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4552               if (! result)
4553                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4554                                      NULL, charset);
4555             }
4556           else
4557             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4558                                  NULL, charset);
4559           if (!charset)
4560             {
4561               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4562                 {
4563                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4564                   charset = CHARSET_FROM_ID (charset_ascii);
4565                 }
4566               else
4567                 {
4568                   c = coding->default_char;
4569                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4570                                        charset_list, NULL, charset);
4571                 }
4572             }
4573           ENCODE_ISO_CHARACTER (charset, c);
4574         }
4575     }
4576
4577   if (coding->mode & CODING_MODE_LAST_BLOCK
4578       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4579     {
4580       ASSURE_DESTINATION (safe_room);
4581       ENCODE_RESET_PLANE_AND_REGISTER ();
4582     }
4583   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4584   CODING_ISO_BOL (coding) = bol_designation;
4585   coding->produced_char += produced_chars;
4586   coding->produced = dst - coding->destination;
4587   return 0;
4588 }
4589
4590 \f
4591 /*** 8. SJIS and BIG5 handlers ***/
4592
4593 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4594    quite widely.  So, for the moment, Emacs supports them directly in
4595    C code.  But, in the future, they may be supported only by CCL.  */
4596
4597 /* SJIS is a coding system encoding three character sets: ASCII, right
4598    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4599    as is.  A character of charset katakana-jisx0201 is encoded by
4600    "position-code + 0x80".  A character of charset japanese-jisx0208
4601    is encoded in two bytes, but its two position-codes are divided and
4602    shifted so that they fit in the ranges below.
4603
4604    --- CODE RANGE of SJIS ---
4605    (character set)      (range)
4606    ASCII                0x00 .. 0x7F
4607    KATAKANA-JISX0201    0xA0 .. 0xDF
4608    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4609             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4610    -------------------------------
4611
4612 */
4613
4614 /* BIG5 is a coding system encoding two character sets: ASCII and
4615    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4616    character set and is encoded in two bytes.
4617
4618    --- CODE RANGE of BIG5 ---
4619    (character set)      (range)
4620    ASCII                0x00 .. 0x7F
4621    Big5 (1st byte)      0xA1 .. 0xFE
4622         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4623    --------------------------
4624
4625   */
4626
4627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4628    Return true if a text is encoded in SJIS.

        The scan starts after the first CODING->head_ascii bytes of the
        source.  CATEGORY_MASK_SJIS is always added to
        DETECT_INFO->checked.  If a byte sequence that cannot be SJIS is
        seen, or if the last block ends in the middle of a sequence, the
        mask is added to DETECT_INFO->rejected and 0 is returned.
        Otherwise, when the source runs out, 1 is returned, and the mask
        is added to DETECT_INFO->found only if at least one non-ASCII
        SJIS sequence was seen.  */
4629
4630 static bool
4631 detect_coding_sjis (struct coding_system *coding,
4632                     struct coding_detection_info *detect_info)
4633 {
4634   const unsigned char *src = coding->source, *src_base;
4635   const unsigned char *src_end = coding->source + coding->src_bytes;
4636   bool multibytep = coding->src_multibyte;
4637   ptrdiff_t consumed_chars = 0;
4638   int found = 0;
4639   int c;
4640   Lisp_Object attrs, charset_list;
4641   int max_first_byte_of_2_byte_code;
4642
4643   CODING_GET_INFO (coding, attrs, charset_list);
       /* With more than three charsets in the list, lead bytes up to 0xFC
          are accepted; otherwise only lead bytes up to 0xEF are.  */
4644   max_first_byte_of_2_byte_code
4645     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4646
4647   detect_info->checked |= CATEGORY_MASK_SJIS;
4648   /* A coding system of this category is always ASCII compatible.  */
4649   src += coding->head_ascii;
4650
4651   while (1)
4652     {
           /* Remember where this sequence starts; the check at
              no_more_source uses it to spot a truncated sequence.
              ONE_MORE_BYTE is defined elsewhere; presumably it jumps to
              no_more_source when the source is exhausted, since nothing
              else in this function targets that label.  */
4653       src_base = src;
4654       ONE_MORE_BYTE (c);
4655       if (c < 0x80)
4656         continue;
4657       if ((c >= 0x81 && c <= 0x9F)
4658           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4659         {
               /* Lead byte of a two-byte code: validate the trail byte.  */
4660           ONE_MORE_BYTE (c);
4661           if (c < 0x40 || c == 0x7F || c > 0xFC)
4662             break;
4663           found = CATEGORY_MASK_SJIS;
4664         }
4665       else if (c >= 0xA0 && c < 0xE0)
             /* Single-byte code in the katakana range.  */
4666         found = CATEGORY_MASK_SJIS;
4667       else
4668         break;
4669     }
       /* Reached only via `break': an invalid byte was seen.  */
4670   detect_info->rejected |= CATEGORY_MASK_SJIS;
4671   return 0;
4672
4673  no_more_source:
       /* src_base < src means a sequence was only partly read.  In the
          last block nothing more can follow, so reject.  */
4674   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4675     {
4676       detect_info->rejected |= CATEGORY_MASK_SJIS;
4677       return 0;
4678     }
4679   detect_info->found |= found;
4680   return 1;
4681 }
4682
4683 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4684    Return true if a text is encoded in BIG5.

        The scan starts after the first CODING->head_ascii bytes of the
        source.  CATEGORY_MASK_BIG5 is always added to
        DETECT_INFO->checked.  A non-ASCII byte below 0xA1, or a last
        block that ends in the middle of a sequence, adds the mask to
        DETECT_INFO->rejected and returns 0.  When the source runs out
        otherwise, 1 is returned, and the mask is added to
        DETECT_INFO->found only if at least one two-byte sequence was
        seen.  */
4685
4686 static bool
4687 detect_coding_big5 (struct coding_system *coding,
4688                     struct coding_detection_info *detect_info)
4689 {
4690   const unsigned char *src = coding->source, *src_base;
4691   const unsigned char *src_end = coding->source + coding->src_bytes;
4692   bool multibytep = coding->src_multibyte;
4693   ptrdiff_t consumed_chars = 0;
4694   int found = 0;
4695   int c;
4696
4697   detect_info->checked |= CATEGORY_MASK_BIG5;
4698   /* A coding system of this category is always ASCII compatible.  */
4699   src += coding->head_ascii;
4700
4701   while (1)
4702     {
           /* Remember where this sequence starts; the check at
              no_more_source uses it to spot a truncated sequence.
              ONE_MORE_BYTE is defined elsewhere; presumably it jumps to
              no_more_source when the source is exhausted.  */
4703       src_base = src;
4704       ONE_MORE_BYTE (c);
4705       if (c < 0x80)
4706         continue;
4707       if (c >= 0xA1)
4708         {
               /* Lead byte: validate the trail byte.
                  NOTE(review): unlike the SJIS detector, an invalid trail
                  byte returns 0 here without adding CATEGORY_MASK_BIG5 to
                  DETECT_INFO->rejected -- confirm this is intended.  */
4709           ONE_MORE_BYTE (c);
4710           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4711             return 0;
4712           found = CATEGORY_MASK_BIG5;
4713         }
4714       else
4715         break;
4716     }
       /* Reached only via `break': a byte in 0x80..0xA0 was seen where a
          lead byte was expected.  */
4717   detect_info->rejected |= CATEGORY_MASK_BIG5;
4718   return 0;
4719
4720  no_more_source:
       /* src_base < src means a sequence was only partly read.  In the
          last block nothing more can follow, so reject.  */
4721   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4722     {
4723       detect_info->rejected |= CATEGORY_MASK_BIG5;
4724       return 0;
4725     }
4726   detect_info->found |= found;
4727   return 1;
4728 }
4729
4730 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SJIS bytes from CODING's source, starting at
        CODING->consumed, into CODING->charbuf.  Decoding stops when the
        source runs out or when charbuf has no room left for one more
        iteration.  On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated to cover the sequences that were
        completely processed.  A byte sequence that is not valid SJIS is
        passed through one byte at a time and counted in
        CODING->errors.  */
4731
4732 static void
4733 decode_coding_sjis (struct coding_system *coding)
4734 {
4735   const unsigned char *src = coding->source + coding->consumed;
4736   const unsigned char *src_end = coding->source + coding->src_bytes;
4737   const unsigned char *src_base;
4738   int *charbuf = coding->charbuf + coding->charbuf_used;
4739   /* We may produce one charset annotation in one loop and one more at
4740      the end.  */
4741   int *charbuf_end
4742     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4743   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4744   bool multibytep = coding->src_multibyte;
4745   struct charset *charset_roman, *charset_kanji, *charset_kana;
4746   struct charset *charset_kanji2;
4747   Lisp_Object attrs, charset_list, val;
4748   ptrdiff_t char_offset = coding->produced_char;
4749   ptrdiff_t last_offset = char_offset;
4750   int last_id = charset_ascii;
4751   bool eol_dos
4752     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR (DOS EOL decoding only), or -1 when
          there is none pending.  */
4753   int byte_after_cr = -1;
4754
4755   CODING_GET_INFO (coding, attrs, charset_list);
4756
       /* The charset list is read in the order roman, kana, kanji, plus
          an optional fourth entry (kanji2).  */
4757   val = charset_list;
4758   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4759   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4760   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4761   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4762
4763   while (1)
4764     {
4765       int c, c1;
4766       struct charset *charset;
4767
           /* Record the start of this sequence so that the invalid path
              and the final bookkeeping can rewind to it.  */
4768       src_base = src;
4769       consumed_chars_base = consumed_chars;
4770
4771       if (charbuf >= charbuf_end)
4772         {
               /* Output buffer full.  If a byte was read ahead after a
                  CR, step src_base back so that byte is not reported as
                  consumed.  */
4773           if (byte_after_cr >= 0)
4774             src_base--;
4775           break;
4776         }
4777
           /* Use the byte read ahead after a CR, if any, before reading
              more source.  */
4778       if (byte_after_cr >= 0)
4779         c = byte_after_cr, byte_after_cr = -1;
4780       else
4781         ONE_MORE_BYTE (c);
           /* A negative value is not a plain source byte (see the
              definition of ONE_MORE_BYTE); take the invalid path.  */
4782       if (c < 0)
4783         goto invalid_code;
4784       if (c < 0x80)
4785         {
               /* With DOS EOL decoding, read one byte ahead after a CR;
                  the next iteration processes it.  */
4786           if (eol_dos && c == '\r')
4787             ONE_MORE_BYTE (byte_after_cr);
4788           charset = charset_roman;
4789         }
4790       else if (c == 0x80 || c == 0xA0)
4791         goto invalid_code;
4792       else if (c >= 0xA1 && c <= 0xDF)
4793         {
4794           /* SJIS -> JISX0201-Kana */
4795           c &= 0x7F;
4796           charset = charset_kana;
4797         }
4798       else if (c <= 0xEF)
4799         {
4800           /* SJIS -> JISX0208 */
4801           ONE_MORE_BYTE (c1);
4802           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4803             goto invalid_code;
4804           c = (c << 8) | c1;
4805           SJIS_TO_JIS (c);
4806           charset = charset_kanji;
4807         }
4808       else if (c <= 0xFC && charset_kanji2)
4809         {
4810           /* SJIS -> JISX0213-2 */
4811           ONE_MORE_BYTE (c1);
4812           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4813             goto invalid_code;
4814           c = (c << 8) | c1;
4815           SJIS_TO_JIS2 (c);
4816           charset = charset_kanji2;
4817         }
4818       else
4819         goto invalid_code;
           /* When the (non-ASCII) charset changes, close the run of the
              previous non-ASCII charset with an annotation and start a
              new run at the current character offset.  */
4820       if (charset->id != charset_ascii
4821           && last_id != charset->id)
4822         {
4823           if (last_id != charset_ascii)
4824             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4825           last_id = charset->id;
4826           last_offset = char_offset;
4827         }
4828       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4829       *charbuf++ = c;
4830       char_offset++;
4831       continue;
4832
4833     invalid_code:
           /* Rewind to the start of the sequence and pass its first unit
              through: as a raw-byte character, or as -C when
              ONE_MORE_BYTE yields a negative value.  */
4834       src = src_base;
4835       consumed_chars = consumed_chars_base;
4836       ONE_MORE_BYTE (c);
4837       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4838       char_offset++;
4839       coding->errors++;
4840     }
4841
4842  no_more_source:
       /* Flush the annotation for the last open charset run, then report
          progress up to the start of the last incomplete sequence.  */
4843   if (last_id != charset_ascii)
4844     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4845   coding->consumed_char += consumed_chars_base;
4846   coding->consumed = src_base - coding->source;
4847   coding->charbuf_used = charbuf - coding->charbuf;
4848 }
4849
     /* Decode BIG5 bytes from CODING's source, starting at
        CODING->consumed, into CODING->charbuf.  Decoding stops when the
        source runs out or when charbuf has no room left for one more
        iteration.  On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated to cover the sequences that were
        completely processed.  A byte sequence that is not valid BIG5 is
        passed through one byte at a time and counted in
        CODING->errors.  */
4850 static void
4851 decode_coding_big5 (struct coding_system *coding)
4852 {
4853   const unsigned char *src = coding->source + coding->consumed;
4854   const unsigned char *src_end = coding->source + coding->src_bytes;
4855   const unsigned char *src_base;
4856   int *charbuf = coding->charbuf + coding->charbuf_used;
4857   /* We may produce one charset annotation in one loop and one more at
4858      the end.  */
4859   int *charbuf_end
4860     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4861   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4862   bool multibytep = coding->src_multibyte;
4863   struct charset *charset_roman, *charset_big5;
4864   Lisp_Object attrs, charset_list, val;
4865   ptrdiff_t char_offset = coding->produced_char;
4866   ptrdiff_t last_offset = char_offset;
4867   int last_id = charset_ascii;
4868   bool eol_dos
4869     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR (DOS EOL decoding only), or -1 when
          there is none pending.  */
4870   int byte_after_cr = -1;
4871
4872   CODING_GET_INFO (coding, attrs, charset_list);
       /* The charset list is read in the order roman, big5.  */
4873   val = charset_list;
4874   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4875   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4876
4877   while (1)
4878     {
4879       int c, c1;
4880       struct charset *charset;
4881
           /* Record the start of this sequence so that the invalid path
              and the final bookkeeping can rewind to it.  */
4882       src_base = src;
4883       consumed_chars_base = consumed_chars;
4884
4885       if (charbuf >= charbuf_end)
4886         {
               /* Output buffer full.  If a byte was read ahead after a
                  CR, step src_base back so that byte is not reported as
                  consumed.  */
4887           if (byte_after_cr >= 0)
4888             src_base--;
4889           break;
4890         }
4891
           /* Use the byte read ahead after a CR, if any, before reading
              more source.  */
4892       if (byte_after_cr >= 0)
4893         c = byte_after_cr, byte_after_cr = -1;
4894       else
4895         ONE_MORE_BYTE (c);
4896
           /* A negative value is not a plain source byte (see the
              definition of ONE_MORE_BYTE); take the invalid path.  */
4897       if (c < 0)
4898         goto invalid_code;
4899       if (c < 0x80)
4900         {
               /* With DOS EOL decoding, read one byte ahead after a CR;
                  the next iteration processes it.  */
4901           if (eol_dos && c == '\r')
4902             ONE_MORE_BYTE (byte_after_cr);
4903           charset = charset_roman;
4904         }
4905       else
4906         {
4907           /* BIG5 -> Big5 */
               /* Lead byte must be 0xA1..0xFE; trail byte must be
                  0x40..0x7E or 0xA1..0xFE.  */
4908           if (c < 0xA1 || c > 0xFE)
4909             goto invalid_code;
4910           ONE_MORE_BYTE (c1);
4911           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4912             goto invalid_code;
4913           c = c << 8 | c1;
4914           charset = charset_big5;
4915         }
           /* When the (non-ASCII) charset changes, close the run of the
              previous non-ASCII charset with an annotation and start a
              new run at the current character offset.  */
4916       if (charset->id != charset_ascii
4917           && last_id != charset->id)
4918         {
4919           if (last_id != charset_ascii)
4920             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4921           last_id = charset->id;
4922           last_offset = char_offset;
4923         }
4924       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4925       *charbuf++ = c;
4926       char_offset++;
4927       continue;
4928
4929     invalid_code:
           /* Rewind to the start of the sequence and pass its first unit
              through: as a raw-byte character, or as -C when
              ONE_MORE_BYTE yields a negative value.  */
4930       src = src_base;
4931       consumed_chars = consumed_chars_base;
4932       ONE_MORE_BYTE (c);
4933       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4934       char_offset++;
4935       coding->errors++;
4936     }
4937
4938  no_more_source:
       /* Flush the annotation for the last open charset run, then report
          progress up to the start of the last incomplete sequence.  */
4939   if (last_id != charset_ascii)
4940     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4941   coding->consumed_char += consumed_chars_base;
4942   coding->consumed = src_base - coding->source;
4943   coding->charbuf_used = charbuf - coding->charbuf;
4944 }
4945
4946 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4947    This function can encode charsets `ascii', `katakana-jisx0201',
4948    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4949    are sure that all these charsets are registered as official charset
4950    (i.e. do not have extended leading-codes).  Characters of other
4951    charsets are produced without any encoding.  */
4952
     /* Encode the characters in CODING->charbuf as SJIS, appending the
        bytes to CODING's destination after CODING->produced.  Records
        CODING_RESULT_SUCCESS, updates CODING->produced and
        CODING->produced_char, and returns 0 when the loop finishes.  */
4953 static bool
4954 encode_coding_sjis (struct coding_system *coding)
4955 {
4956   bool multibytep = coding->dst_multibyte;
4957   int *charbuf = coding->charbuf;
4958   int *charbuf_end = charbuf + coding->charbuf_used;
4959   unsigned char *dst = coding->destination + coding->produced;
4960   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4961   int safe_room = 4;
4962   ptrdiff_t produced_chars = 0;
4963   Lisp_Object attrs, charset_list, val;
4964   bool ascii_compatible;
4965   struct charset *charset_kanji, *charset_kana;
4966   struct charset *charset_kanji2;
4967   int c;
4968
4969   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first entry of the charset list; the rest is read in
          the order kana, kanji, plus an optional kanji2.  */
4970   val = XCDR (charset_list);
4971   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4972   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4973   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4974
4975   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4976
4977   while (charbuf < charbuf_end)
4978     {
4979       ASSURE_DESTINATION (safe_room);
4980       c = *charbuf++;
4981       /* Now encode the character C.  */
4982       if (ASCII_CHAR_P (c) && ascii_compatible)
4983         EMIT_ONE_ASCII_BYTE (c);
4984       else if (CHAR_BYTE8_P (c))
4985         {
               /* A raw-byte character is written back as that byte.  */
4986           c = CHAR_TO_BYTE8 (c);
4987           EMIT_ONE_BYTE (c);
4988         }
4989       else
4990         {
4991           unsigned code;
4992           struct charset *charset;
4993           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4994                                &code, charset);
4995
4996           if (!charset)
4997             {
                   /* No charset in the list covers C.  In safe-encoding
                      mode emit the substitution-inhibit code as ASCII;
                      otherwise encode CODING->default_char instead.  */
4998               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4999                 {
5000                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5001                   charset = CHARSET_FROM_ID (charset_ascii);
5002                 }
5003               else
5004                 {
5005                   c = coding->default_char;
5006                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5007                                        charset_list, &code, charset);
5008                 }
5009             }
               /* NOTE(review): if the lookup for default_char can also
                  leave CHARSET null, the check below would use a null
                  charset -- confirm that cannot happen.  */
5010           if (code == CHARSET_INVALID_CODE (charset))
5011             emacs_abort ();
5012           if (charset == charset_kanji)
5013             {
5014               int c1, c2;
5015               JIS_TO_SJIS (code);
5016               c1 = code >> 8, c2 = code & 0xFF;
5017               EMIT_TWO_BYTES (c1, c2);
5018             }
5019           else if (charset == charset_kana)
5020             EMIT_ONE_BYTE (code | 0x80);
5021           else if (charset_kanji2 && charset == charset_kanji2)
5022             {
5023               int c1, c2;
5024
                   /* Only these values of the code's high byte are
                      converted with JIS_TO_SJIS2; any other code is
                      emitted as a single 7-bit byte.  */
5025               c1 = code >> 8;
5026               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5027                   || c1 == 0x28
5028                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5029                 {
5030                   JIS_TO_SJIS2 (code);
5031                   c1 = code >> 8, c2 = code & 0xFF;
5032                   EMIT_TWO_BYTES (c1, c2);
5033                 }
5034               else
5035                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5036             }
5037           else
5038             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5039         }
5040     }
5041   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5042   coding->produced_char += produced_chars;
5043   coding->produced = dst - coding->destination;
5044   return 0;
5045 }
5046
     /* Encode the characters in CODING->charbuf as BIG5, appending the
        bytes to CODING's destination after CODING->produced.  Records
        CODING_RESULT_SUCCESS, updates CODING->produced and
        CODING->produced_char, and returns 0 when the loop finishes.  */
5047 static bool
5048 encode_coding_big5 (struct coding_system *coding)
5049 {
5050   bool multibytep = coding->dst_multibyte;
5051   int *charbuf = coding->charbuf;
5052   int *charbuf_end = charbuf + coding->charbuf_used;
5053   unsigned char *dst = coding->destination + coding->produced;
5054   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
5055   int safe_room = 4;
5056   ptrdiff_t produced_chars = 0;
5057   Lisp_Object attrs, charset_list, val;
5058   bool ascii_compatible;
5059   struct charset *charset_big5;
5060   int c;
5061
5062   CODING_GET_INFO (coding, attrs, charset_list);
       /* The Big5 charset is the second entry of the charset list.  */
5063   val = XCDR (charset_list);
5064   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5065   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5066
5067   while (charbuf < charbuf_end)
5068     {
5069       ASSURE_DESTINATION (safe_room);
5070       c = *charbuf++;
5071       /* Now encode the character C.  */
5072       if (ASCII_CHAR_P (c) && ascii_compatible)
5073         EMIT_ONE_ASCII_BYTE (c);
5074       else if (CHAR_BYTE8_P (c))
5075         {
               /* A raw-byte character is written back as that byte.  */
5076           c = CHAR_TO_BYTE8 (c);
5077           EMIT_ONE_BYTE (c);
5078         }
5079       else
5080         {
5081           unsigned code;
5082           struct charset *charset;
5083           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5084                                &code, charset);
5085
5086           if (! charset)
5087             {
                   /* No charset in the list covers C.  In safe-encoding
                      mode emit the substitution-inhibit code as ASCII;
                      otherwise encode CODING->default_char instead.  */
5088               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5089                 {
5090                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5091                   charset = CHARSET_FROM_ID (charset_ascii);
5092                 }
5093               else
5094                 {
5095                   c = coding->default_char;
5096                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5097                                        charset_list, &code, charset);
5098                 }
5099             }
               /* NOTE(review): if the lookup for default_char can also
                  leave CHARSET null, the check below would use a null
                  charset -- confirm that cannot happen.  */
5100           if (code == CHARSET_INVALID_CODE (charset))
5101             emacs_abort ();
5102           if (charset == charset_big5)
5103             {
5104               int c1, c2;
5105
                   /* The two-byte code is emitted high byte first.  */
5106               c1 = code >> 8, c2 = code & 0xFF;
5107               EMIT_TWO_BYTES (c1, c2);
5108             }
5109           else
5110             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5111         }
5112     }
5113   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5114   coding->produced_char += produced_chars;
5115   coding->produced = dst - coding->destination;
5116   return 0;
5117 }
5118
5119 \f
5120 /*** 9. CCL handlers ***/
5121
5122 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5123    Return true if a text is encoded in a coding system of which
5124    encoder/decoder are written in CCL program.

        Each source byte is looked up in the validity table of the coding
        system registered for the CCL category.  CATEGORY_MASK_CCL is
        always added to DETECT_INFO->checked.  A byte whose table entry is
        zero (or a negative value from ONE_MORE_BYTE) adds the mask to
        DETECT_INFO->rejected and returns 0.  When the source runs out, 1
        is returned, and the mask is added to DETECT_INFO->found only if
        some byte had a table entry greater than 1.  */
5125
5126 static bool
5127 detect_coding_ccl (struct coding_system *coding,
5128                    struct coding_detection_info *detect_info)
5129 {
5130   const unsigned char *src = coding->source, *src_base;
5131   const unsigned char *src_end = coding->source + coding->src_bytes;
5132   bool multibytep = coding->src_multibyte;
5133   ptrdiff_t consumed_chars = 0;
5134   int found = 0;
5135   unsigned char *valids;
5136   ptrdiff_t head_ascii = coding->head_ascii;
5137   Lisp_Object attrs;
5138
5139   detect_info->checked |= CATEGORY_MASK_CCL;
5140
       /* From here on CODING refers to the CCL category's coding system.
          The source pointers and head_ascii above were taken from the
          caller's CODING before this reassignment.  */
5141   coding = &coding_categories[coding_category_ccl];
5142   valids = CODING_CCL_VALIDS (coding);
5143   attrs = CODING_ID_ATTRS (coding->id);
       /* Skip the leading ASCII part only if the category's coding system
          is ASCII compatible.  */
5144   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5145     src += head_ascii;
5146
5147   while (1)
5148     {
5149       int c;
5150
           /* ONE_MORE_BYTE is defined elsewhere; presumably it jumps to
              no_more_source when the source is exhausted.  */
5151       src_base = src;
5152       ONE_MORE_BYTE (c);
5153       if (c < 0 || ! valids[c])
5154         break;
5155       if ((valids[c] > 1))
5156         found = CATEGORY_MASK_CCL;
5157     }
       /* Reached only via `break': an invalid byte was seen.  */
5158   detect_info->rejected |= CATEGORY_MASK_CCL;
5159   return 0;
5160
5161  no_more_source:
5162   detect_info->found |= found;
5163   return 1;
5164 }
5165
     /* Decode CODING's source, starting at CODING->consumed, by running
        the coding system's CCL program over it in chunks of at most 1024
        source elements.  The output goes to CODING->charbuf.  On return
        the CCL status is translated into a conversion result, and
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated.  */
5166 static void
5167 decode_coding_ccl (struct coding_system *coding)
5168 {
5169   const unsigned char *src = coding->source + coding->consumed;
5170   const unsigned char *src_end = coding->source + coding->src_bytes;
5171   int *charbuf = coding->charbuf + coding->charbuf_used;
5172   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5173   ptrdiff_t consumed_chars = 0;
5174   bool multibytep = coding->src_multibyte;
5175   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* One chunk of input for ccl_driver.  For a multibyte source,
          source_byteidx[K] is the byte offset from SRC of the Kth element,
          with one extra slot for the offset just past the last one.  */
5176   int source_charbuf[1024];
5177   int source_byteidx[1025];
5178   Lisp_Object attrs, charset_list;
5179
5180   CODING_GET_INFO (coding, attrs, charset_list);
5181
5182   while (1)
5183     {
5184       const unsigned char *p = src;
5185       ptrdiff_t offset;
5186       int i = 0;
5187
           /* Fill the chunk: one character per element for a multibyte
              source, one byte per element otherwise.  */
5188       if (multibytep)
5189         {
5190           while (i < 1024 && p < src_end)
5191             {
5192               source_byteidx[i] = p - src;
5193               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5194             }
5195           source_byteidx[i] = p - src;
5196         }
5197       else
5198         while (i < 1024 && p < src_end)
5199           source_charbuf[i++] = *p++;
5200
           /* Tell the CCL program when this chunk reaches the end of the
              last block of source.  */
5201       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5202         ccl->last_block = 1;
5203       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5204       charset_map_loaded = 0;
5205       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5206                   charset_list);
           /* If a charset map was loaded and the source moved, shift all
              source pointers by the reported offset.  */
5207       if (charset_map_loaded
5208           && (offset = coding_change_source (coding)))
5209         {
5210           p += offset;
5211           src += offset;
5212           src_end += offset;
5213         }
5214       charbuf += ccl->produced;
           /* Advance SRC by the bytes that correspond to the elements the
              CCL program consumed.  */
5215       if (multibytep)
5216         src += source_byteidx[ccl->consumed];
5217       else
5218         src += ccl->consumed;
5219       consumed_chars += ccl->consumed;
           /* Stop once all source has been handed to the program, or when
              it stopped for any reason other than needing more input.  */
5220       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5221         break;
5222     }
5223
       /* Translate the final CCL status into a conversion result.  */
5224   switch (ccl->status)
5225     {
5226     case CCL_STAT_SUSPEND_BY_SRC:
5227       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5228       break;
5229     case CCL_STAT_SUSPEND_BY_DST:
5230       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5231       break;
5232     case CCL_STAT_QUIT:
5233     case CCL_STAT_INVALID_CMD:
5234       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5235       break;
5236     default:
5237       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5238       break;
5239     }
5240   coding->consumed_char += consumed_chars;
5241   coding->consumed = src - coding->source;
5242   coding->charbuf_used = charbuf - coding->charbuf;
5243 }
5244
     /* Encode the characters in CODING->charbuf by running the coding
        system's CCL program over them, appending the low 8 bits of each
        output element to CODING's destination after CODING->produced.
        The final CCL status is translated into a conversion result, and
        CODING->produced and CODING->produced_char are updated.  Returns 0
        when the loop finishes.  */
5245 static bool
5246 encode_coding_ccl (struct coding_system *coding)
5247 {
5248   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5249   bool multibytep = coding->dst_multibyte;
5250   int *charbuf = coding->charbuf;
5251   int *charbuf_end = charbuf + coding->charbuf_used;
5252   unsigned char *dst = coding->destination + coding->produced;
5253   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Output of one ccl_driver call, at most 1024 elements.  */
5254   int destination_charbuf[1024];
5255   ptrdiff_t produced_chars = 0;
5256   int i;
5257   Lisp_Object attrs, charset_list;
5258
5259   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters of the last
          block have been consumed.  */
5260   if (coding->consumed_char == coding->src_chars
5261       && coding->mode & CODING_MODE_LAST_BLOCK)
5262     ccl->last_block = 1;
5263
5264   do
5265     {
5266       ptrdiff_t offset;
5267
5268       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5269       charset_map_loaded = 0;
5270       ccl_driver (ccl, charbuf, destination_charbuf,
5271                   charbuf_end - charbuf, 1024, charset_list);
           /* If a charset map was loaded and the destination moved, shift
              DST by the reported offset.  */
5272       if (charset_map_loaded
5273           && (offset = coding_change_destination (coding)))
5274         dst += offset;
5275       if (multibytep)
5276         {
               /* Reserve two destination bytes per output element.
                  produced_chars is not updated explicitly in this branch;
                  presumably EMIT_ONE_BYTE does it -- TODO confirm.  */
5277           ASSURE_DESTINATION (ccl->produced * 2);
5278           for (i = 0; i < ccl->produced; i++)
5279             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5280         }
5281       else
5282         {
5283           ASSURE_DESTINATION (ccl->produced);
5284           for (i = 0; i < ccl->produced; i++)
5285             *dst++ = destination_charbuf[i] & 0xFF;
5286           produced_chars += ccl->produced;
5287         }
5288       charbuf += ccl->consumed;
           /* Give up on quit or an invalid CCL command; otherwise keep
              going until all of charbuf has been consumed.  */
5289       if (ccl->status == CCL_STAT_QUIT
5290           || ccl->status == CCL_STAT_INVALID_CMD)
5291         break;
5292     }
5293   while (charbuf < charbuf_end);
5294
       /* Translate the final CCL status into a conversion result.  */
5295   switch (ccl->status)
5296     {
5297     case CCL_STAT_SUSPEND_BY_SRC:
5298       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5299       break;
5300     case CCL_STAT_SUSPEND_BY_DST:
5301       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5302       break;
5303     case CCL_STAT_QUIT:
5304     case CCL_STAT_INVALID_CMD:
5305       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5306       break;
5307     default:
5308       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5309       break;
5310     }
5311
5312   coding->produced_char += produced_chars;
5313   coding->produced = dst - coding->destination;
5314   return 0;
5315 }
5316
5317 \f
5318 /*** 10, 11. no-conversion handlers ***/
5319
5320 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Raw text is not converted here: the function only sets
        CODING->chars_at_source and reports the whole source as consumed.
        The one exception is DOS end-of-line decoding: when a non-empty
        source ends with a CR, that byte is left unconsumed and
        CODING_RESULT_INSUFFICIENT_SRC is recorded, presumably so that the
        CR can be paired with a following LF once more source arrives.
        In every other case CODING_RESULT_SUCCESS is recorded.  */
5321
5322 static void
5323 decode_coding_raw_text (struct coding_system *coding)
5324 {
5325   bool eol_dos
5326     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5327
5328   coding->chars_at_source = 1;
5329   coding->consumed_char = coding->src_chars;
5330   coding->consumed = coding->src_bytes;
       /* An empty source has no last byte to inspect; without the
          src_bytes check the test below would read source[-1].  */
5331   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5332     {
5333       coding->consumed_char--;
5334       coding->consumed--;
5335       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5336     }
5337   else
5338     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5339 }
5340
     /* Copy the elements of CODING->charbuf to CODING's destination after
        CODING->produced without any charset conversion.  How each element
        is written depends on whether the source and the destination are
        multibyte (see the four branches below).  Records
        CODING_RESULT_SUCCESS, updates CODING->produced and
        CODING->produced_char, and returns 0 when the copy finishes.  */
5341 static bool
5342 encode_coding_raw_text (struct coding_system *coding)
5343 {
5344   bool multibytep = coding->dst_multibyte;
5345   int *charbuf = coding->charbuf;
5346   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5347   unsigned char *dst = coding->destination + coding->produced;
5348   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5349   ptrdiff_t produced_chars = 0;
5350   int c;
5351
5352   if (multibytep)
5353     {
           /* Multibyte destination.  Twice MAX_MULTIBYTE_LENGTH is
              requested per element, presumably because EMIT_ONE_BYTE may
              write more than one byte per emitted byte here -- TODO
              confirm against the macro.  */
5354       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5355
5356       if (coding->src_multibyte)
5357         while (charbuf < charbuf_end)
5358           {
5359             ASSURE_DESTINATION (safe_room);
5360             c = *charbuf++;
5361             if (ASCII_CHAR_P (c))
5362               EMIT_ONE_ASCII_BYTE (c);
5363             else if (CHAR_BYTE8_P (c))
5364               {
                     /* A raw-byte character is written back as that byte.  */
5365                 c = CHAR_TO_BYTE8 (c);
5366                 EMIT_ONE_BYTE (c);
5367               }
5368             else
5369               {
                     /* Any other character: build its multibyte form in
                        STR and emit that form byte by byte.  */
5370                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5371
5372                 CHAR_STRING_ADVANCE (c, p1);
5373                 do
5374                   {
5375                     EMIT_ONE_BYTE (*p0);
5376                     p0++;
5377                   }
5378                 while (p0 < p1);
5379               }
5380           }
5381       else
             /* Unibyte source: every element is emitted as one byte.  */
5382         while (charbuf < charbuf_end)
5383           {
5384             ASSURE_DESTINATION (safe_room);
5385             c = *charbuf++;
5386             EMIT_ONE_BYTE (c);
5387           }
5388     }
5389   else
5390     {
           /* Unibyte destination: bytes are stored directly through DST.  */
5391       if (coding->src_multibyte)
5392         {
5393           int safe_room = MAX_MULTIBYTE_LENGTH;
5394
5395           while (charbuf < charbuf_end)
5396             {
5397               ASSURE_DESTINATION (safe_room);
5398               c = *charbuf++;
5399               if (ASCII_CHAR_P (c))
5400                 *dst++ = c;
5401               else if (CHAR_BYTE8_P (c))
5402                 *dst++ = CHAR_TO_BYTE8 (c);
5403               else
                     /* Any other character is stored in its multibyte
                        form.  */
5404                 CHAR_STRING_ADVANCE (c, dst);
5405             }
5406         }
5407       else
5408         {
               /* Unibyte to unibyte: ensure room for the whole buffer,
                  then copy element by element.  */
5409           ASSURE_DESTINATION (charbuf_end - charbuf);
5410           while (charbuf < charbuf_end && dst < dst_end)
5411             *dst++ = *charbuf++;
5412         }
           /* In this branch the character count is the number of bytes
              written by this call.  */
5413       produced_chars = dst - (coding->destination + coding->produced);
5414     }
5415   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5416   coding->produced_char += produced_chars;
5417   coding->produced = dst - coding->destination;
5418   return 0;
5419 }
5420
5421 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5422    Return true if a text is encoded in a charset-based coding system.  */
5423
5424 static bool
5425 detect_coding_charset (struct coding_system *coding,
5426                        struct coding_detection_info *detect_info)
5427 {
5428   const unsigned char *src = coding->source, *src_base;
5429   const unsigned char *src_end = coding->source + coding->src_bytes;
5430   bool multibytep = coding->src_multibyte;
5431   ptrdiff_t consumed_chars = 0;
5432   Lisp_Object attrs, valids, name;
5433   int found = 0;
5434   ptrdiff_t head_ascii = coding->head_ascii;
5435   bool check_latin_extra = 0;
5436
5437   detect_info->checked |= CATEGORY_MASK_CHARSET;
5438
5439   coding = &coding_categories[coding_category_charset];
5440   attrs = CODING_ID_ATTRS (coding->id);
5441   valids = AREF (attrs, coding_attr_charset_valids);
5442   name = CODING_ID_NAME (coding->id);
5443   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5444                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5445       || strncmp (SSDATA (SYMBOL_NAME (name)),
5446                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5447     check_latin_extra = 1;
5448
5449   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5450     src += head_ascii;
5451
5452   while (1)
5453     {
5454       int c;
5455       Lisp_Object val;
5456       struct charset *charset;
5457       int dim, idx;
5458
5459       src_base = src;
5460       ONE_MORE_BYTE (c);
5461       if (c < 0)
5462         continue;
5463       val = AREF (valids, c);
5464       if (NILP (val))
5465         break;
5466       if (c >= 0x80)
5467         {
5468           if (c < 0xA0
5469               && check_latin_extra
5470               && (!VECTORP (Vlatin_extra_code_table)
5471                   || NILP (AREF (Vlatin_extra_code_table, c))))
5472             break;
5473           found = CATEGORY_MASK_CHARSET;
5474         }
5475       if (INTEGERP (val))
5476         {
5477           charset = CHARSET_FROM_ID (XFASTINT (val));
5478           dim = CHARSET_DIMENSION (charset);
5479           for (idx = 1; idx < dim; idx++)
5480             {
5481               if (src == src_end)
5482                 goto too_short;
5483               ONE_MORE_BYTE (c);
5484               if (c < charset->code_space[(dim - 1 - idx) * 4]
5485                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5486                 break;
5487             }
5488           if (idx < dim)
5489             break;
5490         }
5491       else
5492         {
5493           idx = 1;
5494           for (; CONSP (val); val = XCDR (val))
5495             {
5496               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5497               dim = CHARSET_DIMENSION (charset);
5498               while (idx < dim)
5499                 {
5500                   if (src == src_end)
5501                     goto too_short;
5502                   ONE_MORE_BYTE (c);
5503                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5504                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5505                     break;
5506                   idx++;
5507                 }
5508               if (idx == dim)
5509                 {
5510                   val = Qnil;
5511                   break;
5512                 }
5513             }
5514           if (CONSP (val))
5515             break;
5516         }
5517     }
5518  too_short:
5519   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5520   return 0;
5521
5522  no_more_source:
5523   detect_info->found |= found;
5524   return 1;
5525 }
5526
5527 static void
5528 decode_coding_charset (struct coding_system *coding)
5529 {
5530   const unsigned char *src = coding->source + coding->consumed;
5531   const unsigned char *src_end = coding->source + coding->src_bytes;
5532   const unsigned char *src_base;
5533   int *charbuf = coding->charbuf + coding->charbuf_used;
5534   /* We may produce one charset annotation in one loop and one more at
5535      the end.  */
5536   int *charbuf_end
5537     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5538   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5539   bool multibytep = coding->src_multibyte;
5540   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5541   Lisp_Object valids;
5542   ptrdiff_t char_offset = coding->produced_char;
5543   ptrdiff_t last_offset = char_offset;
5544   int last_id = charset_ascii;
5545   bool eol_dos
5546     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5547   int byte_after_cr = -1;
5548
5549   valids = AREF (attrs, coding_attr_charset_valids);
5550
5551   while (1)
5552     {
5553       int c;
5554       Lisp_Object val;
5555       struct charset *charset;
5556       int dim;
5557       int len = 1;
5558       unsigned code;
5559
5560       src_base = src;
5561       consumed_chars_base = consumed_chars;
5562
5563       if (charbuf >= charbuf_end)
5564         {
5565           if (byte_after_cr >= 0)
5566             src_base--;
5567           break;
5568         }
5569
5570       if (byte_after_cr >= 0)
5571         {
5572           c = byte_after_cr;
5573           byte_after_cr = -1;
5574         }
5575       else
5576         {
5577           ONE_MORE_BYTE (c);
5578           if (eol_dos && c == '\r')
5579             ONE_MORE_BYTE (byte_after_cr);
5580         }
5581       if (c < 0)
5582         goto invalid_code;
5583       code = c;
5584
5585       val = AREF (valids, c);
5586       if (! INTEGERP (val) && ! CONSP (val))
5587         goto invalid_code;
5588       if (INTEGERP (val))
5589         {
5590           charset = CHARSET_FROM_ID (XFASTINT (val));
5591           dim = CHARSET_DIMENSION (charset);
5592           while (len < dim)
5593             {
5594               ONE_MORE_BYTE (c);
5595               code = (code << 8) | c;
5596               len++;
5597             }
5598           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5599                               charset, code, c);
5600         }
5601       else
5602         {
5603           /* VAL is a list of charset IDs.  It is assured that the
5604              list is sorted by charset dimensions (smaller one
5605              comes first).  */
5606           while (CONSP (val))
5607             {
5608               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5609               dim = CHARSET_DIMENSION (charset);
5610               while (len < dim)
5611                 {
5612                   ONE_MORE_BYTE (c);
5613                   code = (code << 8) | c;
5614                   len++;
5615                 }
5616               CODING_DECODE_CHAR (coding, src, src_base,
5617                                   src_end, charset, code, c);
5618               if (c >= 0)
5619                 break;
5620               val = XCDR (val);
5621             }
5622         }
5623       if (c < 0)
5624         goto invalid_code;
5625       if (charset->id != charset_ascii
5626           && last_id != charset->id)
5627         {
5628           if (last_id != charset_ascii)
5629             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5630           last_id = charset->id;
5631           last_offset = char_offset;
5632         }
5633
5634       *charbuf++ = c;
5635       char_offset++;
5636       continue;
5637
5638     invalid_code:
5639       src = src_base;
5640       consumed_chars = consumed_chars_base;
5641       ONE_MORE_BYTE (c);
5642       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5643       char_offset++;
5644       coding->errors++;
5645     }
5646
5647  no_more_source:
5648   if (last_id != charset_ascii)
5649     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5650   coding->consumed_char += consumed_chars_base;
5651   coding->consumed = src_base - coding->source;
5652   coding->charbuf_used = charbuf - coding->charbuf;
5653 }
5654
5655 static bool
5656 encode_coding_charset (struct coding_system *coding)
5657 {
5658   bool multibytep = coding->dst_multibyte;
5659   int *charbuf = coding->charbuf;
5660   int *charbuf_end = charbuf + coding->charbuf_used;
5661   unsigned char *dst = coding->destination + coding->produced;
5662   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5663   int safe_room = MAX_MULTIBYTE_LENGTH;
5664   ptrdiff_t produced_chars = 0;
5665   Lisp_Object attrs, charset_list;
5666   bool ascii_compatible;
5667   int c;
5668
5669   CODING_GET_INFO (coding, attrs, charset_list);
5670   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5671
5672   while (charbuf < charbuf_end)
5673     {
5674       struct charset *charset;
5675       unsigned code;
5676
5677       ASSURE_DESTINATION (safe_room);
5678       c = *charbuf++;
5679       if (ascii_compatible && ASCII_CHAR_P (c))
5680         EMIT_ONE_ASCII_BYTE (c);
5681       else if (CHAR_BYTE8_P (c))
5682         {
5683           c = CHAR_TO_BYTE8 (c);
5684           EMIT_ONE_BYTE (c);
5685         }
5686       else
5687         {
5688           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5689                                &code, charset);
5690
5691           if (charset)
5692             {
5693               if (CHARSET_DIMENSION (charset) == 1)
5694                 EMIT_ONE_BYTE (code);
5695               else if (CHARSET_DIMENSION (charset) == 2)
5696                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5697               else if (CHARSET_DIMENSION (charset) == 3)
5698                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5699               else
5700                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5701                                  (code >> 8) & 0xFF, code & 0xFF);
5702             }
5703           else
5704             {
5705               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5706                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5707               else
5708                 c = coding->default_char;
5709               EMIT_ONE_BYTE (c);
5710             }
5711         }
5712     }
5713
5714   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5715   coding->produced_char += produced_chars;
5716   coding->produced = dst - coding->destination;
5717   return 0;
5718 }
5719
5720 \f
5721 /*** 10. C library functions ***/
5722
5723 /* Setup coding context CODING from information about CODING_SYSTEM.
5724    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5725    CODING_SYSTEM is invalid, signal an error.  */
5726
5727 void
5728 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5729 {
5730   Lisp_Object attrs;
5731   Lisp_Object eol_type;
5732   Lisp_Object coding_type;
5733   Lisp_Object val;
5734
5735   if (NILP (coding_system))
5736     coding_system = Qundecided;
5737
5738   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5739
5740   attrs = CODING_ID_ATTRS (coding->id);
5741   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5742
5743   coding->mode = 0;
5744   if (VECTORP (eol_type))
5745     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5746                             | CODING_REQUIRE_DETECTION_MASK);
5747   else if (! EQ (eol_type, Qunix))
5748     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5749                             | CODING_REQUIRE_ENCODING_MASK);
5750   else
5751     coding->common_flags = 0;
5752   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5753     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5754   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5755     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5756   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5757     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5758
5759   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5760   coding->max_charset_id = SCHARS (val) - 1;
5761   coding->safe_charsets = SDATA (val);
5762   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5763   coding->carryover_bytes = 0;
5764
5765   coding_type = CODING_ATTR_TYPE (attrs);
5766   if (EQ (coding_type, Qundecided))
5767     {
5768       coding->detector = NULL;
5769       coding->decoder = decode_coding_raw_text;
5770       coding->encoder = encode_coding_raw_text;
5771       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5772       coding->spec.undecided.inhibit_nbd
5773         = (encode_inhibit_flag
5774            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5775       coding->spec.undecided.inhibit_ied
5776         = (encode_inhibit_flag
5777            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5778       coding->spec.undecided.prefer_utf_8
5779         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5780     }
5781   else if (EQ (coding_type, Qiso_2022))
5782     {
5783       int i;
5784       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5785
5786       /* Invoke graphic register 0 to plane 0.  */
5787       CODING_ISO_INVOCATION (coding, 0) = 0;
5788       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5789       CODING_ISO_INVOCATION (coding, 1)
5790         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5791       /* Setup the initial status of designation.  */
5792       for (i = 0; i < 4; i++)
5793         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5794       /* Not single shifting initially.  */
5795       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5796       /* Beginning of buffer should also be regarded as bol. */
5797       CODING_ISO_BOL (coding) = 1;
5798       coding->detector = detect_coding_iso_2022;
5799       coding->decoder = decode_coding_iso_2022;
5800       coding->encoder = encode_coding_iso_2022;
5801       if (flags & CODING_ISO_FLAG_SAFE)
5802         coding->mode |= CODING_MODE_SAFE_ENCODING;
5803       coding->common_flags
5804         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5805             | CODING_REQUIRE_FLUSHING_MASK);
5806       if (flags & CODING_ISO_FLAG_COMPOSITION)
5807         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5808       if (flags & CODING_ISO_FLAG_DESIGNATION)
5809         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5810       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5811         {
5812           setup_iso_safe_charsets (attrs);
5813           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5814           coding->max_charset_id = SCHARS (val) - 1;
5815           coding->safe_charsets = SDATA (val);
5816         }
5817       CODING_ISO_FLAGS (coding) = flags;
5818       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5819       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5820       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5821       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5822     }
5823   else if (EQ (coding_type, Qcharset))
5824     {
5825       coding->detector = detect_coding_charset;
5826       coding->decoder = decode_coding_charset;
5827       coding->encoder = encode_coding_charset;
5828       coding->common_flags
5829         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5830     }
5831   else if (EQ (coding_type, Qutf_8))
5832     {
5833       val = AREF (attrs, coding_attr_utf_bom);
5834       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5835                                    : EQ (val, Qt) ? utf_with_bom
5836                                    : utf_without_bom);
5837       coding->detector = detect_coding_utf_8;
5838       coding->decoder = decode_coding_utf_8;
5839       coding->encoder = encode_coding_utf_8;
5840       coding->common_flags
5841         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5842       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5843         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5844     }
5845   else if (EQ (coding_type, Qutf_16))
5846     {
5847       val = AREF (attrs, coding_attr_utf_bom);
5848       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5849                                     : EQ (val, Qt) ? utf_with_bom
5850                                     : utf_without_bom);
5851       val = AREF (attrs, coding_attr_utf_16_endian);
5852       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5853                                        : utf_16_little_endian);
5854       CODING_UTF_16_SURROGATE (coding) = 0;
5855       coding->detector = detect_coding_utf_16;
5856       coding->decoder = decode_coding_utf_16;
5857       coding->encoder = encode_coding_utf_16;
5858       coding->common_flags
5859         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5860       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5861         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5862     }
5863   else if (EQ (coding_type, Qccl))
5864     {
5865       coding->detector = detect_coding_ccl;
5866       coding->decoder = decode_coding_ccl;
5867       coding->encoder = encode_coding_ccl;
5868       coding->common_flags
5869         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5870             | CODING_REQUIRE_FLUSHING_MASK);
5871     }
5872   else if (EQ (coding_type, Qemacs_mule))
5873     {
5874       coding->detector = detect_coding_emacs_mule;
5875       coding->decoder = decode_coding_emacs_mule;
5876       coding->encoder = encode_coding_emacs_mule;
5877       coding->common_flags
5878         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5879       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5880           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5881         {
5882           Lisp_Object tail, safe_charsets;
5883           int max_charset_id = 0;
5884
5885           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5886                tail = XCDR (tail))
5887             if (max_charset_id < XFASTINT (XCAR (tail)))
5888               max_charset_id = XFASTINT (XCAR (tail));
5889           safe_charsets = make_uninit_string (max_charset_id + 1);
5890           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5891           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5892                tail = XCDR (tail))
5893             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5894           coding->max_charset_id = max_charset_id;
5895           coding->safe_charsets = SDATA (safe_charsets);
5896         }
5897       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5898       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5899     }
5900   else if (EQ (coding_type, Qshift_jis))
5901     {
5902       coding->detector = detect_coding_sjis;
5903       coding->decoder = decode_coding_sjis;
5904       coding->encoder = encode_coding_sjis;
5905       coding->common_flags
5906         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5907     }
5908   else if (EQ (coding_type, Qbig5))
5909     {
5910       coding->detector = detect_coding_big5;
5911       coding->decoder = decode_coding_big5;
5912       coding->encoder = encode_coding_big5;
5913       coding->common_flags
5914         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5915     }
5916   else                          /* EQ (coding_type, Qraw_text) */
5917     {
5918       coding->detector = NULL;
5919       coding->decoder = decode_coding_raw_text;
5920       coding->encoder = encode_coding_raw_text;
5921       if (! EQ (eol_type, Qunix))
5922         {
5923           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5924           if (! VECTORP (eol_type))
5925             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5926         }
5927
5928     }
5929
5930   return;
5931 }
5932
5933 /* Return a list of charsets supported by CODING.  */
5934
5935 Lisp_Object
5936 coding_charset_list (struct coding_system *coding)
5937 {
5938   Lisp_Object attrs, charset_list;
5939
5940   CODING_GET_INFO (coding, attrs, charset_list);
5941   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5942     {
5943       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5944
5945       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5946         charset_list = Viso_2022_charset_list;
5947     }
5948   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5949     {
5950       charset_list = Vemacs_mule_charset_list;
5951     }
5952   return charset_list;
5953 }
5954
5955
5956 /* Return a list of charsets supported by CODING-SYSTEM.  */
5957
5958 Lisp_Object
5959 coding_system_charset_list (Lisp_Object coding_system)
5960 {
5961   ptrdiff_t id;
5962   Lisp_Object attrs, charset_list;
5963
5964   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5965   attrs = CODING_ID_ATTRS (id);
5966
5967   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5968     {
5969       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5970
5971       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5972         charset_list = Viso_2022_charset_list;
5973       else
5974         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5975     }
5976   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5977     {
5978       charset_list = Vemacs_mule_charset_list;
5979     }
5980   else
5981     {
5982       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5983     }
5984   return charset_list;
5985 }
5986
5987
5988 /* Return raw-text or one of its subsidiaries that has the same
5989    eol_type as CODING-SYSTEM.  */
5990
5991 Lisp_Object
5992 raw_text_coding_system (Lisp_Object coding_system)
5993 {
5994   Lisp_Object spec, attrs;
5995   Lisp_Object eol_type, raw_text_eol_type;
5996
5997   if (NILP (coding_system))
5998     return Qraw_text;
5999   spec = CODING_SYSTEM_SPEC (coding_system);
6000   attrs = AREF (spec, 0);
6001
6002   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6003     return coding_system;
6004
6005   eol_type = AREF (spec, 2);
6006   if (VECTORP (eol_type))
6007     return Qraw_text;
6008   spec = CODING_SYSTEM_SPEC (Qraw_text);
6009   raw_text_eol_type = AREF (spec, 2);
6010   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6011           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6012           : AREF (raw_text_eol_type, 2));
6013 }
6014
6015
6016 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6017    the subsidiary that has the same eol-spec as PARENT (if it is not
6018    nil and specifies end-of-line format) or the system's setting
6019    (system_eol_type).  */
6020
6021 Lisp_Object
6022 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6023 {
6024   Lisp_Object spec, eol_type;
6025
6026   if (NILP (coding_system))
6027     coding_system = Qraw_text;
6028   spec = CODING_SYSTEM_SPEC (coding_system);
6029   eol_type = AREF (spec, 2);
6030   if (VECTORP (eol_type))
6031     {
6032       Lisp_Object parent_eol_type;
6033
6034       if (! NILP (parent))
6035         {
6036           Lisp_Object parent_spec;
6037
6038           parent_spec = CODING_SYSTEM_SPEC (parent);
6039           parent_eol_type = AREF (parent_spec, 2);
6040           if (VECTORP (parent_eol_type))
6041             parent_eol_type = system_eol_type;
6042         }
6043       else
6044         parent_eol_type = system_eol_type;
6045       if (EQ (parent_eol_type, Qunix))
6046         coding_system = AREF (eol_type, 0);
6047       else if (EQ (parent_eol_type, Qdos))
6048         coding_system = AREF (eol_type, 1);
6049       else if (EQ (parent_eol_type, Qmac))
6050         coding_system = AREF (eol_type, 2);
6051     }
6052   return coding_system;
6053 }
6054
6055
6056 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6057    decided for writing to a process.  If not, complement them, and
6058    return a new coding system.  */
6059
6060 Lisp_Object
6061 complement_process_encoding_system (Lisp_Object coding_system)
6062 {
6063   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6064   Lisp_Object spec, attrs;
6065   int i;
6066
6067   for (i = 0; i < 3; i++)
6068     {
6069       if (i == 1)
6070         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6071       else if (i == 2)
6072         coding_system = preferred_coding_system ();
6073       spec = CODING_SYSTEM_SPEC (coding_system);
6074       if (NILP (spec))
6075         continue;
6076       attrs = AREF (spec, 0);
6077       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6078         coding_base = CODING_ATTR_BASE_NAME (attrs);
6079       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6080         eol_base = coding_system;
6081       if (! NILP (coding_base) && ! NILP (eol_base))
6082         break;
6083     }
6084
6085   if (i > 0)
6086     /* The original CODING_SYSTEM didn't specify text-conversion or
6087        eol-conversion.  Be sure that we return a fully complemented
6088        coding system.  */
6089     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6090   return coding_system;
6091 }
6092
6093
6094 /* Emacs has a mechanism to automatically detect a coding system if it
6095    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
6096    it's impossible to distinguish some coding systems accurately
6097    because they use the same range of codes.  So, at first, coding
6098    systems are grouped into the following categories:
6099
6100    o coding-category-emacs-mule
6101
6102         The category for a coding system which has the same code range
6103         as Emacs' internal format.  Assigned the coding-system (Lisp
6104         symbol) `emacs-mule' by default.
6105
6106    o coding-category-sjis
6107
6108         The category for a coding system which has the same code range
6109         as SJIS.  Assigned the coding-system (Lisp
6110         symbol) `japanese-shift-jis' by default.
6111
6112    o coding-category-iso-7
6113
6114         The category for a coding system which has the same code range
6115         as ISO2022 of 7-bit environment.  This doesn't use any locking
6116         shift and single shift functions.  This can encode/decode all
6117         charsets.  Assigned the coding-system (Lisp symbol)
6118         `iso-2022-7bit' by default.
6119
6120    o coding-category-iso-7-tight
6121
6122         Same as coding-category-iso-7 except that this can
6123         encode/decode only the specified charsets.
6124
6125    o coding-category-iso-8-1
6126
6127         The category for a coding system which has the same code range
6128         as ISO2022 of 8-bit environment and graphic plane 1 used only
6129         for DIMENSION1 charset.  This doesn't use any locking shift
6130         and single shift functions.  Assigned the coding-system (Lisp
6131         symbol) `iso-latin-1' by default.
6132
6133    o coding-category-iso-8-2
6134
6135         The category for a coding system which has the same code range
6136         as ISO2022 of 8-bit environment and graphic plane 1 used only
6137         for DIMENSION2 charset.  This doesn't use any locking shift
6138         and single shift functions.  Assigned the coding-system (Lisp
6139         symbol) `japanese-iso-8bit' by default.
6140
6141    o coding-category-iso-7-else
6142
6143         The category for a coding system which has the same code range
6144         as ISO2022 of 7-bit environment but uses locking shift or
6145         single shift functions.  Assigned the coding-system (Lisp
6146         symbol) `iso-2022-7bit-lock' by default.
6147
6148    o coding-category-iso-8-else
6149
6150         The category for a coding system which has the same code range
6151         as ISO2022 of 8-bit environment but uses locking shift or
6152         single shift functions.  Assigned the coding-system (Lisp
6153         symbol) `iso-2022-8bit-ss2' by default.
6154
6155    o coding-category-big5
6156
6157         The category for a coding system which has the same code range
6158         as BIG5.  Assigned the coding-system (Lisp symbol)
6159         `cn-big5' by default.
6160
6161    o coding-category-utf-8
6162
6163         The category for a coding system which has the same code range
6164         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6165         symbol) `utf-8' by default.
6166
6167    o coding-category-utf-16-be
6168
6169         The category for a coding system in which a text has an
6170         Unicode signature (cf. Unicode Standard) in the order of BIG
6171         endian at the head.  Assigned the coding-system (Lisp symbol)
6172         `utf-16-be' by default.
6173
6174    o coding-category-utf-16-le
6175
6176         The category for a coding system in which a text has an
6177         Unicode signature (cf. Unicode Standard) in the order of
6178         LITTLE endian at the head.  Assigned the coding-system (Lisp
6179         symbol) `utf-16-le' by default.
6180
6181    o coding-category-ccl
6182
6183         The category for a coding system of which encoder/decoder is
6184         written in CCL programs.  The default value is nil, i.e., no
6185         coding system is assigned.
6186
6187    o coding-category-binary
6188
6189         The category for a coding system not categorized in any of the
6190         above.  Assigned the coding-system (Lisp symbol)
6191         `no-conversion' by default.
6192
6193    Each of them is a Lisp symbol whose value is the actual
6194    `coding-system' (also a Lisp symbol) assigned to it by a user.
6195    What Emacs actually detects is the category of a coding system;
6196    it then uses the `coding-system' assigned to that category.  If
6197    Emacs can't narrow the result down to a single category, it selects
6198    the category of the highest priority.  Priorities of categories are
6199    also specified by a user in the Lisp variable `coding-category-list'.
6200
6201 */
6202
6203 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6204                                            int eol_seen);
6205
6206
6207 /* Return the number of ASCII characters at the head of the source.
6208    By side effects, set coding->head_ascii and update
6209    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6210    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6211    reliable only when all the source bytes are ASCII.  */
6212
6213 static int
6214 check_ascii (struct coding_system *coding)
6215 {
6216   const unsigned char *src, *end;
6217   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6218   int eol_seen = coding->eol_seen;
6219
6220   coding_set_source (coding);
6221   src = coding->source;
6222   end = src + coding->src_bytes;
6223
6224   if (inhibit_eol_conversion
6225       || SYMBOLP (eol_type))
6226     {
6227       /* We don't have to check EOL format.  */
6228       while (src < end && !( *src & 0x80))
6229         {
6230           if (*src++ == '\n')
6231             eol_seen |= EOL_SEEN_LF;
6232         }
6233     }
6234   else
6235     {
6236       end--;                /* We look ahead one byte for "CR LF".  */
6237       while (src < end)
6238         {
6239           int c = *src;
6240
6241           if (c & 0x80)
6242             break;
6243           src++;
6244           if (c == '\r')
6245             {
6246               if (*src == '\n')
6247                 {
6248                   eol_seen |= EOL_SEEN_CRLF;
6249                   src++;
6250                 }
6251               else
6252                 eol_seen |= EOL_SEEN_CR;
6253             }
6254           else if (c == '\n')
6255             eol_seen |= EOL_SEEN_LF;
6256         }
6257       if (src == end)
6258         {
6259           int c = *src;
6260
6261           /* All bytes but the last one C are ASCII.  */
6262           if (! (c & 0x80))
6263             {
6264               if (c == '\r')
6265                 eol_seen |= EOL_SEEN_CR;
6266               else if (c  == '\n')
6267                 eol_seen |= EOL_SEEN_LF;
6268               src++;
6269             }
6270         }
6271     }
6272   coding->head_ascii = src - coding->source;
6273   coding->eol_seen = eol_seen;
6274   return (coding->head_ascii);
6275 }
6276
6277
6278 /* Return the number of characters at the source if all the bytes are
6279    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6280    effects, update coding->eol_seen.  The value of coding->eol_seen is
6281    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6282    the value is reliable only when all the source bytes are valid
6283    UTF-8.  */
6284
6285 static int
6286 check_utf_8 (struct coding_system *coding)
6287 {
6288   const unsigned char *src, *end;
6289   int eol_seen;
6290   int nchars = coding->head_ascii;
6291
6292   if (coding->head_ascii < 0)
6293     check_ascii (coding);
6294   else
6295     coding_set_source (coding);
6296   src = coding->source + coding->head_ascii;
6297   /* We look ahead one byte for CR LF.  */
6298   end = coding->source + coding->src_bytes - 1;
6299   eol_seen = coding->eol_seen;
6300   while (src < end)
6301     {
6302       int c = *src;
6303
6304       if (UTF_8_1_OCTET_P (*src))
6305         {
6306           src++;
6307           if (c < 0x20)
6308             {
6309               if (c == '\r')
6310                 {
6311                   if (*src == '\n')
6312                     {
6313                       eol_seen |= EOL_SEEN_CRLF;
6314                       src++;
6315                       nchars++;
6316                     }
6317                   else
6318                     eol_seen |= EOL_SEEN_CR;
6319                 }
6320               else if (c == '\n')
6321                 eol_seen |= EOL_SEEN_LF;
6322             }
6323         }
6324       else if (UTF_8_2_OCTET_LEADING_P (c))
6325         {
6326           if (c < 0xC2          /* overlong sequence */
6327               || src + 1 >= end
6328               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6329             return -1;
6330           src += 2;
6331         }
6332       else if (UTF_8_3_OCTET_LEADING_P (c))
6333         {
6334           if (src + 2 >= end
6335               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6336                     && UTF_8_EXTRA_OCTET_P (src[2])))
6337             return -1;
6338           c = (((c & 0xF) << 12)
6339                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6340           if (c < 0x800                       /* overlong sequence */
6341               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6342             return -1;
6343           src += 3;
6344         }
6345       else if (UTF_8_4_OCTET_LEADING_P (c))
6346         {
6347           if (src + 3 >= end
6348               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6349                     && UTF_8_EXTRA_OCTET_P (src[2])
6350                     && UTF_8_EXTRA_OCTET_P (src[3])))
6351             return -1;
6352           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6353                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6354           if (c < 0x10000       /* overlong sequence */
6355               || c >= 0x110000) /* non-Unicode character  */
6356             return -1;
6357           src += 4;
6358         }
6359       else
6360         return -1;
6361       nchars++;
6362     }
6363
6364   if (src == end)
6365     {
6366       if (! UTF_8_1_OCTET_P (*src))
6367         return -1;
6368       nchars++;
6369       if (*src == '\r')
6370         eol_seen |= EOL_SEEN_CR;
6371       else if (*src  == '\n')
6372         eol_seen |= EOL_SEEN_LF;
6373     }
6374   coding->eol_seen = eol_seen;
6375   return nchars;
6376 }
6377
6378
6379 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6380    SOURCE is encoded.  If CATEGORY is one of
6381    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6382    two-byte, else they are encoded by one-byte.
6383
6384    Return one of EOL_SEEN_XXX.  */
6385
6386 #define MAX_EOL_CHECK_COUNT 3
6387
6388 static int
6389 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6390             enum coding_category category)
6391 {
6392   const unsigned char *src = source, *src_end = src + src_bytes;
6393   unsigned char c;
6394   int total  = 0;
6395   int eol_seen = EOL_SEEN_NONE;
6396
6397   if ((1 << category) & CATEGORY_MASK_UTF_16)
6398     {
6399       bool msb = category == (coding_category_utf_16_le
6400                               | coding_category_utf_16_le_nosig);
6401       bool lsb = !msb;
6402
6403       while (src + 1 < src_end)
6404         {
6405           c = src[lsb];
6406           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6407             {
6408               int this_eol;
6409
6410               if (c == '\n')
6411                 this_eol = EOL_SEEN_LF;
6412               else if (src + 3 >= src_end
6413                        || src[msb + 2] != 0
6414                        || src[lsb + 2] != '\n')
6415                 this_eol = EOL_SEEN_CR;
6416               else
6417                 {
6418                   this_eol = EOL_SEEN_CRLF;
6419                   src += 2;
6420                 }
6421
6422               if (eol_seen == EOL_SEEN_NONE)
6423                 /* This is the first end-of-line.  */
6424                 eol_seen = this_eol;
6425               else if (eol_seen != this_eol)
6426                 {
6427                   /* The found type is different from what found before.
6428                      Allow for stray ^M characters in DOS EOL files.  */
6429                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6430                       || (eol_seen == EOL_SEEN_CRLF
6431                           && this_eol == EOL_SEEN_CR))
6432                     eol_seen = EOL_SEEN_CRLF;
6433                   else
6434                     {
6435                       eol_seen = EOL_SEEN_LF;
6436                       break;
6437                     }
6438                 }
6439               if (++total == MAX_EOL_CHECK_COUNT)
6440                 break;
6441             }
6442           src += 2;
6443         }
6444     }
6445   else
6446     while (src < src_end)
6447       {
6448         c = *src++;
6449         if (c == '\n' || c == '\r')
6450           {
6451             int this_eol;
6452
6453             if (c == '\n')
6454               this_eol = EOL_SEEN_LF;
6455             else if (src >= src_end || *src != '\n')
6456               this_eol = EOL_SEEN_CR;
6457             else
6458               this_eol = EOL_SEEN_CRLF, src++;
6459
6460             if (eol_seen == EOL_SEEN_NONE)
6461               /* This is the first end-of-line.  */
6462               eol_seen = this_eol;
6463             else if (eol_seen != this_eol)
6464               {
6465                 /* The found type is different from what found before.
6466                    Allow for stray ^M characters in DOS EOL files.  */
6467                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6468                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6469                   eol_seen = EOL_SEEN_CRLF;
6470                 else
6471                   {
6472                     eol_seen = EOL_SEEN_LF;
6473                     break;
6474                   }
6475               }
6476             if (++total == MAX_EOL_CHECK_COUNT)
6477               break;
6478           }
6479       }
6480   return eol_seen;
6481 }
6482
6483
6484 static Lisp_Object
6485 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6486 {
6487   Lisp_Object eol_type;
6488
6489   eol_type = CODING_ID_EOL_TYPE (coding->id);
6490   if (! VECTORP (eol_type))
6491     /* Already adjusted.  */
6492     return eol_type;
6493   if (eol_seen & EOL_SEEN_LF)
6494     {
6495       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6496       eol_type = Qunix;
6497     }
6498   else if (eol_seen & EOL_SEEN_CRLF)
6499     {
6500       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6501       eol_type = Qdos;
6502     }
6503   else if (eol_seen & EOL_SEEN_CR)
6504     {
6505       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6506       eol_type = Qmac;
6507     }
6508   return eol_type;
6509 }
6510
6511 /* Detect how a text specified in CODING is encoded.  If a coding
6512    system is detected, update fields of CODING by the detected coding
6513    system.  */
6514
6515 static void
6516 detect_coding (struct coding_system *coding)
6517 {
6518   const unsigned char *src, *src_end;
6519   unsigned int saved_mode = coding->mode;
6520   Lisp_Object found = Qnil;
6521   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6522
6523   coding->consumed = coding->consumed_char = 0;
6524   coding->produced = coding->produced_char = 0;
6525   coding_set_source (coding);
6526
6527   src_end = coding->source + coding->src_bytes;
6528
6529   coding->eol_seen = EOL_SEEN_NONE;
6530   /* If we have not yet decided the text encoding type, detect it
6531      now.  */
6532   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6533     {
6534       int c, i;
6535       struct coding_detection_info detect_info;
6536       bool null_byte_found = 0, eight_bit_found = 0;
6537       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6538                                        inhibit_null_byte_detection);
6539       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6540                                        inhibit_iso_escape_detection);
6541       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6542
6543       coding->head_ascii = 0;
6544       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6545       for (src = coding->source; src < src_end; src++)
6546         {
6547           c = *src;
6548           if (c & 0x80)
6549             {
6550               eight_bit_found = 1;
6551               if (null_byte_found)
6552                 break;
6553             }
6554           else if (c < 0x20)
6555             {
6556               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6557                   && ! inhibit_ied
6558                   && ! detect_info.checked)
6559                 {
6560                   if (detect_coding_iso_2022 (coding, &detect_info))
6561                     {
6562                       /* We have scanned the whole data.  */
6563                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6564                         {
6565                           /* We didn't find an 8-bit code.  We may
6566                              have found a null-byte, but it's very
6567                              rare that a binary file conforms to
6568                              ISO-2022.  */
6569                           src = src_end;
6570                           coding->head_ascii = src - coding->source;
6571                         }
6572                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6573                       break;
6574                     }
6575                 }
6576               else if (! c && !inhibit_nbd)
6577                 {
6578                   null_byte_found = 1;
6579                   if (eight_bit_found)
6580                     break;
6581                 }
6582               else if (! disable_ascii_optimization
6583                        && ! inhibit_eol_conversion)
6584                 {
6585                   if (c == '\r')
6586                     {
6587                       if (src < src_end && src[1] == '\n')
6588                         {
6589                           coding->eol_seen |= EOL_SEEN_CRLF;
6590                           src++;
6591                           if (! eight_bit_found)
6592                             coding->head_ascii++;
6593                         }
6594                       else
6595                         coding->eol_seen |= EOL_SEEN_CR;
6596                     }
6597                   else if (c == '\n')
6598                     {
6599                       coding->eol_seen |= EOL_SEEN_LF;
6600                     }
6601                 }
6602
6603               if (! eight_bit_found)
6604                 coding->head_ascii++;
6605             }
6606           else if (! eight_bit_found)
6607             coding->head_ascii++;
6608         }
6609
6610       if (null_byte_found || eight_bit_found
6611           || coding->head_ascii < coding->src_bytes
6612           || detect_info.found)
6613         {
6614           enum coding_category category;
6615           struct coding_system *this;
6616
6617           if (coding->head_ascii == coding->src_bytes)
6618             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6619             for (i = 0; i < coding_category_raw_text; i++)
6620               {
6621                 category = coding_priorities[i];
6622                 this = coding_categories + category;
6623                 if (detect_info.found & (1 << category))
6624                   break;
6625               }
6626           else
6627             {
6628               if (null_byte_found)
6629                 {
6630                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6631                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6632                 }
6633               else if (prefer_utf_8
6634                        && detect_coding_utf_8 (coding, &detect_info))
6635                 {
6636                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6637                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6638                 }
6639               for (i = 0; i < coding_category_raw_text; i++)
6640                 {
6641                   category = coding_priorities[i];
6642                   this = coding_categories + category;
6643                   /* Some of this->detector (e.g. detect_coding_sjis)
6644                      require this information.  */
6645                   coding->id = this->id;
6646                   if (this->id < 0)
6647                     {
6648                       /* No coding system of this category is defined.  */
6649                       detect_info.rejected |= (1 << category);
6650                     }
6651                   else if (category >= coding_category_raw_text)
6652                     continue;
6653                   else if (detect_info.checked & (1 << category))
6654                     {
6655                       if (detect_info.found & (1 << category))
6656                         break;
6657                     }
6658                   else if ((*(this->detector)) (coding, &detect_info)
6659                            && detect_info.found & (1 << category))
6660                     break;
6661                 }
6662             }
6663
6664           if (i < coding_category_raw_text)
6665             {
6666               if (category == coding_category_utf_8_auto)
6667                 {
6668                   Lisp_Object coding_systems;
6669
6670                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6671                                          coding_attr_utf_bom);
6672                   if (CONSP (coding_systems))
6673                     {
6674                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6675                         found = XCAR (coding_systems);
6676                       else
6677                         found = XCDR (coding_systems);
6678                     }
6679                   else
6680                     found = CODING_ID_NAME (this->id);
6681                 }
6682               else if (category == coding_category_utf_16_auto)
6683                 {
6684                   Lisp_Object coding_systems;
6685
6686                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6687                                          coding_attr_utf_bom);
6688                   if (CONSP (coding_systems))
6689                     {
6690                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6691                         found = XCAR (coding_systems);
6692                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6693                         found = XCDR (coding_systems);
6694                     }
6695                   else
6696                     found = CODING_ID_NAME (this->id);
6697                 }
6698               else
6699                 found = CODING_ID_NAME (this->id);
6700             }
6701           else if (null_byte_found)
6702             found = Qno_conversion;
6703           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6704                    == CATEGORY_MASK_ANY)
6705             found = Qraw_text;
6706           else if (detect_info.rejected)
6707             for (i = 0; i < coding_category_raw_text; i++)
6708               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6709                 {
6710                   this = coding_categories + coding_priorities[i];
6711                   found = CODING_ID_NAME (this->id);
6712                   break;
6713                 }
6714         }
6715     }
6716   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6717            == coding_category_utf_8_auto)
6718     {
6719       Lisp_Object coding_systems;
6720       struct coding_detection_info detect_info;
6721
6722       coding_systems
6723         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6724       detect_info.found = detect_info.rejected = 0;
6725       if (check_ascii (coding) == coding->src_bytes)
6726         {
6727           if (CONSP (coding_systems))
6728             found = XCDR (coding_systems);
6729         }
6730       else
6731         {
6732           if (CONSP (coding_systems)
6733               && detect_coding_utf_8 (coding, &detect_info))
6734             {
6735               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6736                 found = XCAR (coding_systems);
6737               else
6738                 found = XCDR (coding_systems);
6739             }
6740         }
6741     }
6742   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6743            == coding_category_utf_16_auto)
6744     {
6745       Lisp_Object coding_systems;
6746       struct coding_detection_info detect_info;
6747
6748       coding_systems
6749         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6750       detect_info.found = detect_info.rejected = 0;
6751       coding->head_ascii = 0;
6752       if (CONSP (coding_systems)
6753           && detect_coding_utf_16 (coding, &detect_info))
6754         {
6755           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6756             found = XCAR (coding_systems);
6757           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6758             found = XCDR (coding_systems);
6759         }
6760     }
6761
6762   if (! NILP (found))
6763     {
6764       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6765                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6766                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6767                            : EOL_SEEN_LF);
6768
6769       setup_coding_system (found, coding);
6770       if (specified_eol != EOL_SEEN_NONE)
6771         adjust_coding_eol_type (coding, specified_eol);
6772     }
6773
6774   coding->mode = saved_mode;
6775 }
6776
6777
6778 static void
6779 decode_eol (struct coding_system *coding)
6780 {
6781   Lisp_Object eol_type;
6782   unsigned char *p, *pbeg, *pend;
6783
6784   eol_type = CODING_ID_EOL_TYPE (coding->id);
6785   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6786     return;
6787
6788   if (NILP (coding->dst_object))
6789     pbeg = coding->destination;
6790   else
6791     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6792   pend = pbeg + coding->produced;
6793
6794   if (VECTORP (eol_type))
6795     {
6796       int eol_seen = EOL_SEEN_NONE;
6797
6798       for (p = pbeg; p < pend; p++)
6799         {
6800           if (*p == '\n')
6801             eol_seen |= EOL_SEEN_LF;
6802           else if (*p == '\r')
6803             {
6804               if (p + 1 < pend && *(p + 1) == '\n')
6805                 {
6806                   eol_seen |= EOL_SEEN_CRLF;
6807                   p++;
6808                 }
6809               else
6810                 eol_seen |= EOL_SEEN_CR;
6811             }
6812         }
6813       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6814       if ((eol_seen & EOL_SEEN_CRLF) != 0
6815           && (eol_seen & EOL_SEEN_CR) != 0
6816           && (eol_seen & EOL_SEEN_LF) == 0)
6817         eol_seen = EOL_SEEN_CRLF;
6818       else if (eol_seen != EOL_SEEN_NONE
6819           && eol_seen != EOL_SEEN_LF
6820           && eol_seen != EOL_SEEN_CRLF
6821           && eol_seen != EOL_SEEN_CR)
6822         eol_seen = EOL_SEEN_LF;
6823       if (eol_seen != EOL_SEEN_NONE)
6824         eol_type = adjust_coding_eol_type (coding, eol_seen);
6825     }
6826
6827   if (EQ (eol_type, Qmac))
6828     {
6829       for (p = pbeg; p < pend; p++)
6830         if (*p == '\r')
6831           *p = '\n';
6832     }
6833   else if (EQ (eol_type, Qdos))
6834     {
6835       ptrdiff_t n = 0;
6836
6837       if (NILP (coding->dst_object))
6838         {
6839           /* Start deleting '\r' from the tail to minimize the memory
6840              movement.  */
6841           for (p = pend - 2; p >= pbeg; p--)
6842             if (*p == '\r')
6843               {
6844                 memmove (p, p + 1, pend-- - p - 1);
6845                 n++;
6846               }
6847         }
6848       else
6849         {
6850           ptrdiff_t pos_byte = coding->dst_pos_byte;
6851           ptrdiff_t pos = coding->dst_pos;
6852           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6853
6854           while (pos < pos_end)
6855             {
6856               p = BYTE_POS_ADDR (pos_byte);
6857               if (*p == '\r' && p[1] == '\n')
6858                 {
6859                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6860                   n++;
6861                   pos_end--;
6862                 }
6863               pos++;
6864               if (coding->dst_multibyte)
6865                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6866               else
6867                 pos_byte++;
6868             }
6869         }
6870       coding->produced -= n;
6871       coding->produced_char -= n;
6872     }
6873 }
6874
6875
6876 /* Return a translation table (or list of them) from coding system
6877    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6878    not ENCODEP). */
6879
6880 static Lisp_Object
6881 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6882 {
6883   Lisp_Object standard, translation_table;
6884   Lisp_Object val;
6885
6886   if (NILP (Venable_character_translation))
6887     {
6888       if (max_lookup)
6889         *max_lookup = 0;
6890       return Qnil;
6891     }
6892   if (encodep)
6893     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6894       standard = Vstandard_translation_table_for_encode;
6895   else
6896     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6897       standard = Vstandard_translation_table_for_decode;
6898   if (NILP (translation_table))
6899     translation_table = standard;
6900   else
6901     {
6902       if (SYMBOLP (translation_table))
6903         translation_table = Fget (translation_table, Qtranslation_table);
6904       else if (CONSP (translation_table))
6905         {
6906           translation_table = Fcopy_sequence (translation_table);
6907           for (val = translation_table; CONSP (val); val = XCDR (val))
6908             if (SYMBOLP (XCAR (val)))
6909               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6910         }
6911       if (CHAR_TABLE_P (standard))
6912         {
6913           if (CONSP (translation_table))
6914             translation_table = nconc2 (translation_table, list1 (standard));
6915           else
6916             translation_table = list2 (translation_table, standard);
6917         }
6918     }
6919
6920   if (max_lookup)
6921     {
6922       *max_lookup = 1;
6923       if (CHAR_TABLE_P (translation_table)
6924           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6925         {
6926           val = XCHAR_TABLE (translation_table)->extras[1];
6927           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6928             *max_lookup = XFASTINT (val);
6929         }
6930       else if (CONSP (translation_table))
6931         {
6932           Lisp_Object tail;
6933
6934           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6935             if (CHAR_TABLE_P (XCAR (tail))
6936                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6937               {
6938                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6939                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6940                   *max_lookup = XFASTINT (tailval);
6941               }
6942         }
6943     }
6944   return translation_table;
6945 }
6946
/* Look up character C in TABLE, a char-table or a list of them.
   C and TRANS must be lvalues; all arguments are evaluated more
   than once.

   TRANS is first set to Qnil.  If TABLE is a char-table and its
   entry for C is a character, C is replaced by that character and
   TRANS stays Qnil; any other entry is left in TRANS.  If TABLE is a
   list, its char-table elements are consulted in order: a character
   entry replaces C and the search goes on, any other non-nil entry
   is left in TRANS and ends the search.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    trans = Qnil;                                                       \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        trans = CHAR_TABLE_REF (table, c);                              \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            c = XFASTINT (trans);                                       \
            trans = Qnil;                                               \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = table; CONSP (lookup_tail_);                \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              trans = CHAR_TABLE_REF (XCAR (lookup_tail_), c);          \
              if (CHARACTERP (trans))                                   \
                {                                                       \
                  c = XFASTINT (trans);                                 \
                  trans = Qnil;                                         \
                }                                                       \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6971
6972
6973 /* Return a translation of character(s) at BUF according to TRANS.
6974    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6975    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6976    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6977    translation is found, and Qnil if not found..
6978    If BUF is too short to lookup characters in FROM, return Qt.  */
6979
6980 static Lisp_Object
6981 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6982 {
6983
6984   if (INTEGERP (trans))
6985     return trans;
6986   for (; CONSP (trans); trans = XCDR (trans))
6987     {
6988       Lisp_Object val = XCAR (trans);
6989       Lisp_Object from = XCAR (val);
6990       ptrdiff_t len = ASIZE (from);
6991       ptrdiff_t i;
6992
6993       for (i = 0; i < len; i++)
6994         {
6995           if (buf + i == buf_end)
6996             return Qt;
6997           if (XINT (AREF (from, i)) != buf[i])
6998             break;
6999         }
7000       if (i == len)
7001         return val;
7002     }
7003   return Qnil;
7004 }
7005
7006
7007 static int
7008 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7009                bool last_block)
7010 {
7011   unsigned char *dst = coding->destination + coding->produced;
7012   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7013   ptrdiff_t produced;
7014   ptrdiff_t produced_chars = 0;
7015   int carryover = 0;
7016
7017   if (! coding->chars_at_source)
7018     {
7019       /* Source characters are in coding->charbuf.  */
7020       int *buf = coding->charbuf;
7021       int *buf_end = buf + coding->charbuf_used;
7022
7023       if (EQ (coding->src_object, coding->dst_object))
7024         {
7025           coding_set_source (coding);
7026           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7027         }
7028
7029       while (buf < buf_end)
7030         {
7031           int c = *buf;
7032           ptrdiff_t i;
7033
7034           if (c >= 0)
7035             {
7036               ptrdiff_t from_nchars = 1, to_nchars = 1;
7037               Lisp_Object trans = Qnil;
7038
7039               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7040               if (! NILP (trans))
7041                 {
7042                   trans = get_translation (trans, buf, buf_end);
7043                   if (INTEGERP (trans))
7044                     c = XINT (trans);
7045                   else if (CONSP (trans))
7046                     {
7047                       from_nchars = ASIZE (XCAR (trans));
7048                       trans = XCDR (trans);
7049                       if (INTEGERP (trans))
7050                         c = XINT (trans);
7051                       else
7052                         {
7053                           to_nchars = ASIZE (trans);
7054                           c = XINT (AREF (trans, 0));
7055                         }
7056                     }
7057                   else if (EQ (trans, Qt) && ! last_block)
7058                     break;
7059                 }
7060
7061               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7062                 {
7063                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7064                        / MAX_MULTIBYTE_LENGTH)
7065                       < to_nchars)
7066                     memory_full (SIZE_MAX);
7067                   dst = alloc_destination (coding,
7068                                            buf_end - buf
7069                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7070                                            dst);
7071                   if (EQ (coding->src_object, coding->dst_object))
7072                     {
7073                       coding_set_source (coding);
7074                       dst_end = (((unsigned char *) coding->source)
7075                                  + coding->consumed);
7076                     }
7077                   else
7078                     dst_end = coding->destination + coding->dst_bytes;
7079                 }
7080
7081               for (i = 0; i < to_nchars; i++)
7082                 {
7083                   if (i > 0)
7084                     c = XINT (AREF (trans, i));
7085                   if (coding->dst_multibyte
7086                       || ! CHAR_BYTE8_P (c))
7087                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7088                   else
7089                     *dst++ = CHAR_TO_BYTE8 (c);
7090                 }
7091               produced_chars += to_nchars;
7092               buf += from_nchars;
7093             }
7094           else
7095             /* This is an annotation datum.  (-C) is the length.  */
7096             buf += -c;
7097         }
7098       carryover = buf_end - buf;
7099     }
7100   else
7101     {
7102       /* Source characters are at coding->source.  */
7103       const unsigned char *src = coding->source;
7104       const unsigned char *src_end = src + coding->consumed;
7105
7106       if (EQ (coding->dst_object, coding->src_object))
7107         dst_end = (unsigned char *) src;
7108       if (coding->src_multibyte != coding->dst_multibyte)
7109         {
7110           if (coding->src_multibyte)
7111             {
7112               bool multibytep = 1;
7113               ptrdiff_t consumed_chars = 0;
7114
7115               while (1)
7116                 {
7117                   const unsigned char *src_base = src;
7118                   int c;
7119
7120                   ONE_MORE_BYTE (c);
7121                   if (dst == dst_end)
7122                     {
7123                       if (EQ (coding->src_object, coding->dst_object))
7124                         dst_end = (unsigned char *) src;
7125                       if (dst == dst_end)
7126                         {
7127                           ptrdiff_t offset = src - coding->source;
7128
7129                           dst = alloc_destination (coding, src_end - src + 1,
7130                                                    dst);
7131                           dst_end = coding->destination + coding->dst_bytes;
7132                           coding_set_source (coding);
7133                           src = coding->source + offset;
7134                           src_end = coding->source + coding->consumed;
7135                           if (EQ (coding->src_object, coding->dst_object))
7136                             dst_end = (unsigned char *) src;
7137                         }
7138                     }
7139                   *dst++ = c;
7140                   produced_chars++;
7141                 }
7142             no_more_source:
7143               ;
7144             }
7145           else
7146             while (src < src_end)
7147               {
7148                 bool multibytep = 1;
7149                 int c = *src++;
7150
7151                 if (dst >= dst_end - 1)
7152                   {
7153                     if (EQ (coding->src_object, coding->dst_object))
7154                       dst_end = (unsigned char *) src;
7155                     if (dst >= dst_end - 1)
7156                       {
7157                         ptrdiff_t offset = src - coding->source;
7158                         ptrdiff_t more_bytes;
7159
7160                         if (EQ (coding->src_object, coding->dst_object))
7161                           more_bytes = ((src_end - src) / 2) + 2;
7162                         else
7163                           more_bytes = src_end - src + 2;
7164                         dst = alloc_destination (coding, more_bytes, dst);
7165                         dst_end = coding->destination + coding->dst_bytes;
7166                         coding_set_source (coding);
7167                         src = coding->source + offset;
7168                         src_end = coding->source + coding->consumed;
7169                         if (EQ (coding->src_object, coding->dst_object))
7170                           dst_end = (unsigned char *) src;
7171                       }
7172                   }
7173                 EMIT_ONE_BYTE (c);
7174               }
7175         }
7176       else
7177         {
7178           if (!EQ (coding->src_object, coding->dst_object))
7179             {
7180               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7181
7182               if (require > 0)
7183                 {
7184                   ptrdiff_t offset = src - coding->source;
7185
7186                   dst = alloc_destination (coding, require, dst);
7187                   coding_set_source (coding);
7188                   src = coding->source + offset;
7189                   src_end = coding->source + coding->consumed;
7190                 }
7191             }
7192           produced_chars = coding->consumed_char;
7193           while (src < src_end)
7194             *dst++ = *src++;
7195         }
7196     }
7197
7198   produced = dst - (coding->destination + coding->produced);
7199   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7200     insert_from_gap (produced_chars, produced, 0);
7201   coding->produced += produced;
7202   coding->produced_char += produced_chars;
7203   return carryover;
7204 }
7205
7206 /* Compose text in CODING->object according to the annotation data at
7207    CHARBUF.  CHARBUF is an array:
7208      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7209  */
7210
7211 static void
7212 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7213 {
7214   int len;
7215   ptrdiff_t to;
7216   enum composition_method method;
7217   Lisp_Object components;
7218
7219   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7220   to = pos + charbuf[2];
7221   method = (enum composition_method) (charbuf[4]);
7222
7223   if (method == COMPOSITION_RELATIVE)
7224     components = Qnil;
7225   else
7226     {
7227       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7228       int i, j;
7229
7230       if (method == COMPOSITION_WITH_RULE)
7231         len = charbuf[2] * 3 - 2;
7232       charbuf += MAX_ANNOTATION_LENGTH;
7233       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7234       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7235         {
7236           if (charbuf[i] >= 0)
7237             args[j] = make_number (charbuf[i]);
7238           else
7239             {
7240               i++;
7241               args[j] = make_number (charbuf[i] % 0x100);
7242             }
7243         }
7244       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7245     }
7246   compose_text (pos, to, components, Qnil, coding->dst_object);
7247 }
7248
7249
7250 /* Put `charset' property on text in CODING->object according to
7251    the annotation data at CHARBUF.  CHARBUF is an array:
7252      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7253  */
7254
7255 static void
7256 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7257 {
7258   ptrdiff_t from = pos - charbuf[2];
7259   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7260
7261   Fput_text_property (make_number (from), make_number (pos),
7262                       Qcharset, CHARSET_NAME (charset),
7263                       coding->dst_object);
7264 }
7265
7266
/* Number of `int' elements in the work area that holds intermediate
   characters and annotation data during code conversion.  */
#define CHARBUF_SIZE 0x4000

/* Give CODING a work area of CHARBUF_SIZE ints obtained with
   SAFE_ALLOCA, and record that size in CODING->charbuf_size.  The
   argument is parenthesized in the expansion so that any
   pointer-valued expression (e.g. `&obj') can be passed, not only a
   plain identifier.  SAFE_ALLOCA must be usable at the point of
   expansion.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));      \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
7274
7275
7276 static void
7277 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7278 {
7279   int *charbuf = coding->charbuf;
7280   int *charbuf_end = charbuf + coding->charbuf_used;
7281
7282   if (NILP (coding->dst_object))
7283     return;
7284
7285   while (charbuf < charbuf_end)
7286     {
7287       if (*charbuf >= 0)
7288         pos++, charbuf++;
7289       else
7290         {
7291           int len = -*charbuf;
7292
7293           if (len > 2)
7294             switch (charbuf[1])
7295               {
7296               case CODING_ANNOTATE_COMPOSITION_MASK:
7297                 produce_composition (coding, charbuf, pos);
7298                 break;
7299               case CODING_ANNOTATE_CHARSET_MASK:
7300                 produce_charset (coding, charbuf, pos);
7301                 break;
7302               }
7303           charbuf += len;
7304         }
7305     }
7306 }
7307
7308 /* Decode the data at CODING->src_object into CODING->dst_object.
7309    CODING->src_object is a buffer, a string, or nil.
7310    CODING->dst_object is a buffer.
7311
7312    If CODING->src_object is a buffer, it must be the current buffer.
7313    In this case, if CODING->src_pos is positive, it is a position of
7314    the source text in the buffer, otherwise, the source text is in the
7315    gap area of the buffer, and CODING->src_pos specifies the offset of
7316    the text from GPT (which must be the same as PT).  If this is the
7317    same buffer as CODING->dst_object, CODING->src_pos must be
7318    negative.
7319
7320    If CODING->src_object is a string, CODING->src_pos is an index to
7321    that string.
7322
7323    If CODING->src_object is nil, CODING->source must already point to
7324    the non-relocatable memory area.  In this case, CODING->src_pos is
7325    an offset from CODING->source.
7326
7327    The decoded data is inserted at the current point of the buffer
7328    CODING->dst_object.
7329 */
7330
7331 static void
7332 decode_coding (struct coding_system *coding)
7333 {
7334   Lisp_Object attrs;
7335   Lisp_Object undo_list;
7336   Lisp_Object translation_table;
7337   struct ccl_spec cclspec;
7338   int carryover;
7339   int i;
7340
7341   USE_SAFE_ALLOCA;
7342
7343   if (BUFFERP (coding->src_object)
7344       && coding->src_pos > 0
7345       && coding->src_pos < GPT
7346       && coding->src_pos + coding->src_chars > GPT)
7347     move_gap_both (coding->src_pos, coding->src_pos_byte);
7348
7349   undo_list = Qt;
7350   if (BUFFERP (coding->dst_object))
7351     {
7352       set_buffer_internal (XBUFFER (coding->dst_object));
7353       if (GPT != PT)
7354         move_gap_both (PT, PT_BYTE);
7355
7356       /* We must disable undo_list in order to record the whole insert
7357          transaction via record_insert at the end.  But doing so also
7358          disables the recording of the first change to the undo_list.
7359          Therefore we check for first change here and record it via
7360          record_first_change if needed.  */
7361       if (MODIFF <= SAVE_MODIFF)
7362         record_first_change ();
7363
7364       undo_list = BVAR (current_buffer, undo_list);
7365       bset_undo_list (current_buffer, Qt);
7366     }
7367
7368   coding->consumed = coding->consumed_char = 0;
7369   coding->produced = coding->produced_char = 0;
7370   coding->chars_at_source = 0;
7371   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7372   coding->errors = 0;
7373
7374   ALLOC_CONVERSION_WORK_AREA (coding);
7375
7376   attrs = CODING_ID_ATTRS (coding->id);
7377   translation_table = get_translation_table (attrs, 0, NULL);
7378
7379   carryover = 0;
7380   if (coding->decoder == decode_coding_ccl)
7381     {
7382       coding->spec.ccl = &cclspec;
7383       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7384     }
7385   do
7386     {
7387       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7388
7389       coding_set_source (coding);
7390       coding->annotated = 0;
7391       coding->charbuf_used = carryover;
7392       (*(coding->decoder)) (coding);
7393       coding_set_destination (coding);
7394       carryover = produce_chars (coding, translation_table, 0);
7395       if (coding->annotated)
7396         produce_annotation (coding, pos);
7397       for (i = 0; i < carryover; i++)
7398         coding->charbuf[i]
7399           = coding->charbuf[coding->charbuf_used - carryover + i];
7400     }
7401   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7402          || (coding->consumed < coding->src_bytes
7403              && (coding->result == CODING_RESULT_SUCCESS
7404                  || coding->result == CODING_RESULT_INVALID_SRC)));
7405
7406   if (carryover > 0)
7407     {
7408       coding_set_destination (coding);
7409       coding->charbuf_used = carryover;
7410       produce_chars (coding, translation_table, 1);
7411     }
7412
7413   coding->carryover_bytes = 0;
7414   if (coding->consumed < coding->src_bytes)
7415     {
7416       int nbytes = coding->src_bytes - coding->consumed;
7417       const unsigned char *src;
7418
7419       coding_set_source (coding);
7420       coding_set_destination (coding);
7421       src = coding->source + coding->consumed;
7422
7423       if (coding->mode & CODING_MODE_LAST_BLOCK)
7424         {
7425           /* Flush out unprocessed data as binary chars.  We are sure
7426              that the number of data is less than the size of
7427              coding->charbuf.  */
7428           coding->charbuf_used = 0;
7429           coding->chars_at_source = 0;
7430
7431           while (nbytes-- > 0)
7432             {
7433               int c = *src++;
7434
7435               if (c & 0x80)
7436                 c = BYTE8_TO_CHAR (c);
7437               coding->charbuf[coding->charbuf_used++] = c;
7438             }
7439           produce_chars (coding, Qnil, 1);
7440         }
7441       else
7442         {
7443           /* Record unprocessed bytes in coding->carryover.  We are
7444              sure that the number of data is less than the size of
7445              coding->carryover.  */
7446           unsigned char *p = coding->carryover;
7447
7448           if (nbytes > sizeof coding->carryover)
7449             nbytes = sizeof coding->carryover;
7450           coding->carryover_bytes = nbytes;
7451           while (nbytes-- > 0)
7452             *p++ = *src++;
7453         }
7454       coding->consumed = coding->src_bytes;
7455     }
7456
7457   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7458       && !inhibit_eol_conversion)
7459     decode_eol (coding);
7460   if (BUFFERP (coding->dst_object))
7461     {
7462       bset_undo_list (current_buffer, undo_list);
7463       record_insert (coding->dst_pos, coding->produced_char);
7464     }
7465
7466   SAFE_FREE ();
7467 }
7468
7469
7470 /* Extract an annotation datum from a composition starting at POS and
7471    ending before LIMIT of CODING->src_object (buffer or string), store
7472    the data in BUF, set *STOP to a starting position of the next
7473    composition (if any) or to LIMIT, and return the address of the
7474    next element of BUF.
7475
7476    If such an annotation is not found, set *STOP to a starting
7477    position of a composition after POS (if any) or to LIMIT, and
7478    return BUF.  */
7479
7480 static int *
7481 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7482                                struct coding_system *coding, int *buf,
7483                                ptrdiff_t *stop)
7484 {
7485   ptrdiff_t start, end;
7486   Lisp_Object prop;
7487
7488   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7489       || end > limit)
7490     *stop = limit;
7491   else if (start > pos)
7492     *stop = start;
7493   else
7494     {
7495       if (start == pos)
7496         {
7497           /* We found a composition.  Store the corresponding
7498              annotation data in BUF.  */
7499           int *head = buf;
7500           enum composition_method method = COMPOSITION_METHOD (prop);
7501           int nchars = COMPOSITION_LENGTH (prop);
7502
7503           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7504           if (method != COMPOSITION_RELATIVE)
7505             {
7506               Lisp_Object components;
7507               ptrdiff_t i, len, i_byte;
7508
7509               components = COMPOSITION_COMPONENTS (prop);
7510               if (VECTORP (components))
7511                 {
7512                   len = ASIZE (components);
7513                   for (i = 0; i < len; i++)
7514                     *buf++ = XINT (AREF (components, i));
7515                 }
7516               else if (STRINGP (components))
7517                 {
7518                   len = SCHARS (components);
7519                   i = i_byte = 0;
7520                   while (i < len)
7521                     {
7522                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7523                       buf++;
7524                     }
7525                 }
7526               else if (INTEGERP (components))
7527                 {
7528                   len = 1;
7529                   *buf++ = XINT (components);
7530                 }
7531               else if (CONSP (components))
7532                 {
7533                   for (len = 0; CONSP (components);
7534                        len++, components = XCDR (components))
7535                     *buf++ = XINT (XCAR (components));
7536                 }
7537               else
7538                 emacs_abort ();
7539               *head -= len;
7540             }
7541         }
7542
7543       if (find_composition (end, limit, &start, &end, &prop,
7544                             coding->src_object)
7545           && end <= limit)
7546         *stop = start;
7547       else
7548         *stop = limit;
7549     }
7550   return buf;
7551 }
7552
7553
7554 /* Extract an annotation datum from a text property `charset' at POS of
7555    CODING->src_object (buffer of string), store the data in BUF, set
7556    *STOP to the position where the value of `charset' property changes
7557    (limiting by LIMIT), and return the address of the next element of
7558    BUF.
7559
7560    If the property value is nil, set *STOP to the position where the
7561    property value is non-nil (limiting by LIMIT), and return BUF.  */
7562
7563 static int *
7564 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7565                            struct coding_system *coding, int *buf,
7566                            ptrdiff_t *stop)
7567 {
7568   Lisp_Object val, next;
7569   int id;
7570
7571   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7572   if (! NILP (val) && CHARSETP (val))
7573     id = XINT (CHARSET_SYMBOL_ID (val));
7574   else
7575     id = -1;
7576   ADD_CHARSET_DATA (buf, 0, id);
7577   next = Fnext_single_property_change (make_number (pos), Qcharset,
7578                                        coding->src_object,
7579                                        make_number (limit));
7580   *stop = XINT (next);
7581   return buf;
7582 }
7583
7584
7585 static void
7586 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7587                int max_lookup)
7588 {
7589   int *buf = coding->charbuf;
7590   int *buf_end = coding->charbuf + coding->charbuf_size;
7591   const unsigned char *src = coding->source + coding->consumed;
7592   const unsigned char *src_end = coding->source + coding->src_bytes;
7593   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7594   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7595   bool multibytep = coding->src_multibyte;
7596   Lisp_Object eol_type;
7597   int c;
7598   ptrdiff_t stop, stop_composition, stop_charset;
7599   int *lookup_buf = NULL;
7600
7601   if (! NILP (translation_table))
7602     lookup_buf = alloca (sizeof (int) * max_lookup);
7603
7604   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7605   if (VECTORP (eol_type))
7606     eol_type = Qunix;
7607
7608   /* Note: composition handling is not yet implemented.  */
7609   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7610
7611   if (NILP (coding->src_object))
7612     stop = stop_composition = stop_charset = end_pos;
7613   else
7614     {
7615       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7616         stop = stop_composition = pos;
7617       else
7618         stop = stop_composition = end_pos;
7619       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7620         stop = stop_charset = pos;
7621       else
7622         stop_charset = end_pos;
7623     }
7624
7625   /* Compensate for CRLF and conversion.  */
7626   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7627   while (buf < buf_end)
7628     {
7629       Lisp_Object trans;
7630
7631       if (pos == stop)
7632         {
7633           if (pos == end_pos)
7634             break;
7635           if (pos == stop_composition)
7636             buf = handle_composition_annotation (pos, end_pos, coding,
7637                                                  buf, &stop_composition);
7638           if (pos == stop_charset)
7639             buf = handle_charset_annotation (pos, end_pos, coding,
7640                                              buf, &stop_charset);
7641           stop = (stop_composition < stop_charset
7642                   ? stop_composition : stop_charset);
7643         }
7644
7645       if (! multibytep)
7646         {
7647           int bytes;
7648
7649           if (coding->encoder == encode_coding_raw_text
7650               || coding->encoder == encode_coding_ccl)
7651             c = *src++, pos++;
7652           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7653             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7654           else
7655             c = BYTE8_TO_CHAR (*src), src++, pos++;
7656         }
7657       else
7658         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7659       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7660         c = '\n';
7661       if (! EQ (eol_type, Qunix))
7662         {
7663           if (c == '\n')
7664             {
7665               if (EQ (eol_type, Qdos))
7666                 *buf++ = '\r';
7667               else
7668                 c = '\r';
7669             }
7670         }
7671
7672       trans = Qnil;
7673       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7674       if (NILP (trans))
7675         *buf++ = c;
7676       else
7677         {
7678           ptrdiff_t from_nchars = 1, to_nchars = 1;
7679           int *lookup_buf_end;
7680           const unsigned char *p = src;
7681           int i;
7682
7683           lookup_buf[0] = c;
7684           for (i = 1; i < max_lookup && p < src_end; i++)
7685             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7686           lookup_buf_end = lookup_buf + i;
7687           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7688           if (INTEGERP (trans))
7689             c = XINT (trans);
7690           else if (CONSP (trans))
7691             {
7692               from_nchars = ASIZE (XCAR (trans));
7693               trans = XCDR (trans);
7694               if (INTEGERP (trans))
7695                 c = XINT (trans);
7696               else
7697                 {
7698                   to_nchars = ASIZE (trans);
7699                   if (buf_end - buf < to_nchars)
7700                     break;
7701                   c = XINT (AREF (trans, 0));
7702                 }
7703             }
7704           else
7705             break;
7706           *buf++ = c;
7707           for (i = 1; i < to_nchars; i++)
7708             *buf++ = XINT (AREF (trans, i));
7709           for (i = 1; i < from_nchars; i++, pos++)
7710             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7711         }
7712     }
7713
7714   coding->consumed = src - coding->source;
7715   coding->consumed_char = pos - coding->src_pos;
7716   coding->charbuf_used = buf - coding->charbuf;
7717   coding->chars_at_source = 0;
7718 }
7719
7720
7721 /* Encode the text at CODING->src_object into CODING->dst_object.
7722    CODING->src_object is a buffer or a string.
7723    CODING->dst_object is a buffer or nil.
7724
7725    If CODING->src_object is a buffer, it must be the current buffer.
7726    In this case, if CODING->src_pos is positive, it is a position of
7727    the source text in the buffer, otherwise. the source text is in the
7728    gap area of the buffer, and coding->src_pos specifies the offset of
7729    the text from GPT (which must be the same as PT).  If this is the
7730    same buffer as CODING->dst_object, CODING->src_pos must be
7731    negative and CODING should not have `pre-write-conversion'.
7732
7733    If CODING->src_object is a string, CODING should not have
7734    `pre-write-conversion'.
7735
7736    If CODING->dst_object is a buffer, the encoded data is inserted at
7737    the current point of that buffer.
7738
7739    If CODING->dst_object is nil, the encoded data is placed at the
7740    memory area specified by CODING->destination.  */
7741
7742 static void
7743 encode_coding (struct coding_system *coding)
7744 {
7745   Lisp_Object attrs;
7746   Lisp_Object translation_table;
7747   int max_lookup;
7748   struct ccl_spec cclspec;
7749
7750   USE_SAFE_ALLOCA;
7751
7752   attrs = CODING_ID_ATTRS (coding->id);
7753   if (coding->encoder == encode_coding_raw_text)
7754     translation_table = Qnil, max_lookup = 0;
7755   else
7756     translation_table = get_translation_table (attrs, 1, &max_lookup);
7757
7758   if (BUFFERP (coding->dst_object))
7759     {
7760       set_buffer_internal (XBUFFER (coding->dst_object));
7761       coding->dst_multibyte
7762         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7763     }
7764
7765   coding->consumed = coding->consumed_char = 0;
7766   coding->produced = coding->produced_char = 0;
7767   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7768   coding->errors = 0;
7769
7770   ALLOC_CONVERSION_WORK_AREA (coding);
7771
7772   if (coding->encoder == encode_coding_ccl)
7773     {
7774       coding->spec.ccl = &cclspec;
7775       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7776     }
7777   do {
7778     coding_set_source (coding);
7779     consume_chars (coding, translation_table, max_lookup);
7780     coding_set_destination (coding);
7781     (*(coding->encoder)) (coding);
7782   } while (coding->consumed_char < coding->src_chars);
7783
7784   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7785     insert_from_gap (coding->produced_char, coding->produced, 0);
7786
7787   SAFE_FREE ();
7788 }
7789
7790
7791 /* Name (or base name) of work buffer for code conversion.  */
7792 static Lisp_Object Vcode_conversion_workbuf_name;
7793
7794 /* A working buffer used by the top level conversion.  Once it is
7795    created, it is never destroyed.  It has the name
7796    Vcode_conversion_workbuf_name.  The other working buffers are
7797    destroyed after the use is finished, and their names are modified
7798    versions of Vcode_conversion_workbuf_name.  */
7799 static Lisp_Object Vcode_conversion_reused_workbuf;
7800
7801 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7802 static bool reused_workbuf_in_use;
7803
7804
7805 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7806    multibyteness of returning buffer.  */
7807
7808 static Lisp_Object
7809 make_conversion_work_buffer (bool multibyte)
7810 {
7811   Lisp_Object name, workbuf;
7812   struct buffer *current;
7813
7814   if (reused_workbuf_in_use)
7815     {
7816       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7817       workbuf = Fget_buffer_create (name);
7818     }
7819   else
7820     {
7821       reused_workbuf_in_use = 1;
7822       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7823         Vcode_conversion_reused_workbuf
7824           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7825       workbuf = Vcode_conversion_reused_workbuf;
7826     }
7827   current = current_buffer;
7828   set_buffer_internal (XBUFFER (workbuf));
7829   /* We can't allow modification hooks to run in the work buffer.  For
7830      instance, directory_files_internal assumes that file decoding
7831      doesn't compile new regexps.  */
7832   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7833   Ferase_buffer ();
7834   bset_undo_list (current_buffer, Qt);
7835   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7836   set_buffer_internal (current);
7837   return workbuf;
7838 }
7839
7840
7841 static void
7842 code_conversion_restore (Lisp_Object arg)
7843 {
7844   Lisp_Object current, workbuf;
7845   struct gcpro gcpro1;
7846
7847   GCPRO1 (arg);
7848   current = XCAR (arg);
7849   workbuf = XCDR (arg);
7850   if (! NILP (workbuf))
7851     {
7852       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7853         reused_workbuf_in_use = 0;
7854       else
7855         Fkill_buffer (workbuf);
7856     }
7857   set_buffer_internal (XBUFFER (current));
7858   UNGCPRO;
7859 }
7860
7861 Lisp_Object
7862 code_conversion_save (bool with_work_buf, bool multibyte)
7863 {
7864   Lisp_Object workbuf = Qnil;
7865
7866   if (with_work_buf)
7867     workbuf = make_conversion_work_buffer (multibyte);
7868   record_unwind_protect (code_conversion_restore,
7869                          Fcons (Fcurrent_buffer (), workbuf));
7870   return workbuf;
7871 }
7872
7873 void
7874 decode_coding_gap (struct coding_system *coding,
7875                    ptrdiff_t chars, ptrdiff_t bytes)
7876 {
7877   ptrdiff_t count = SPECPDL_INDEX ();
7878   Lisp_Object attrs;
7879
7880   coding->src_object = Fcurrent_buffer ();
7881   coding->src_chars = chars;
7882   coding->src_bytes = bytes;
7883   coding->src_pos = -chars;
7884   coding->src_pos_byte = -bytes;
7885   coding->src_multibyte = chars < bytes;
7886   coding->dst_object = coding->src_object;
7887   coding->dst_pos = PT;
7888   coding->dst_pos_byte = PT_BYTE;
7889   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7890
7891   coding->head_ascii = -1;
7892   coding->detected_utf8_chars = -1;
7893   coding->eol_seen = EOL_SEEN_NONE;
7894   if (CODING_REQUIRE_DETECTION (coding))
7895     detect_coding (coding);
7896   attrs = CODING_ID_ATTRS (coding->id);
7897   if (! disable_ascii_optimization
7898       && ! coding->src_multibyte
7899       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7900       && NILP (CODING_ATTR_POST_READ (attrs))
7901       && NILP (get_translation_table (attrs, 0, NULL)))
7902     {
7903       chars = coding->head_ascii;
7904       if (chars < 0)
7905         chars = check_ascii (coding);
7906       if (chars != bytes)
7907         {
7908           /* There exists a non-ASCII byte.  */
7909           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7910             {
7911               if (coding->detected_utf8_chars >= 0)
7912                 chars = coding->detected_utf8_chars;
7913               else
7914                 chars = check_utf_8 (coding);
7915               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7916                   && coding->head_ascii == 0
7917                   && coding->source[0] == UTF_8_BOM_1
7918                   && coding->source[1] == UTF_8_BOM_2
7919                   && coding->source[2] == UTF_8_BOM_3)
7920                 {
7921                   chars--;
7922                   bytes -= 3;
7923                   coding->src_bytes -= 3;
7924                 }
7925             }
7926           else
7927             chars = -1;
7928         }
7929       if (chars >= 0)
7930         {
7931           Lisp_Object eol_type;
7932
7933           eol_type = CODING_ID_EOL_TYPE (coding->id);
7934           if (VECTORP (eol_type))
7935             {
7936               if (coding->eol_seen != EOL_SEEN_NONE)
7937                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7938             }
7939           if (EQ (eol_type, Qmac))
7940             {
7941               unsigned char *src_end = GAP_END_ADDR;
7942               unsigned char *src = src_end - coding->src_bytes;
7943
7944               while (src < src_end)
7945                 {
7946                   if (*src++ == '\r')
7947                     src[-1] = '\n';
7948                 }
7949             }
7950           else if (EQ (eol_type, Qdos))
7951             {
7952               unsigned char *src = GAP_END_ADDR;
7953               unsigned char *src_beg = src - coding->src_bytes;
7954               unsigned char *dst = src;
7955               ptrdiff_t diff;
7956
7957               while (src_beg < src)
7958                 {
7959                   *--dst = *--src;
7960                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7961                     src--;
7962                 }
7963               diff = dst - src;
7964               bytes -= diff;
7965               chars -= diff;
7966             }
7967           coding->produced = bytes;
7968           coding->produced_char = chars;
7969           insert_from_gap (chars, bytes, 1);
7970           return;
7971         }
7972     }
7973   code_conversion_save (0, 0);
7974
7975   coding->mode |= CODING_MODE_LAST_BLOCK;
7976   current_buffer->text->inhibit_shrinking = 1;
7977   decode_coding (coding);
7978   current_buffer->text->inhibit_shrinking = 0;
7979
7980   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7981     {
7982       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7983       Lisp_Object val;
7984
7985       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7986       val = call1 (CODING_ATTR_POST_READ (attrs),
7987                    make_number (coding->produced_char));
7988       CHECK_NATNUM (val);
7989       coding->produced_char += Z - prev_Z;
7990       coding->produced += Z_BYTE - prev_Z_BYTE;
7991     }
7992
7993   unbind_to (count, Qnil);
7994 }
7995
7996
7997 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7998    SRC_OBJECT into DST_OBJECT by coding context CODING.
7999
8000    SRC_OBJECT is a buffer, a string, or Qnil.
8001
8002    If it is a buffer, the text is at point of the buffer.  FROM and TO
8003    are positions in the buffer.
8004
8005    If it is a string, the text is at the beginning of the string.
8006    FROM and TO are indices to the string.
8007
8008    If it is nil, the text is at coding->source.  FROM and TO are
8009    indices to coding->source.
8010
8011    DST_OBJECT is a buffer, Qt, or Qnil.
8012
8013    If it is a buffer, the decoded text is inserted at point of the
8014    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8015    is deleted.
8016
8017    If it is Qt, a string is made from the decoded text, and
8018    set in CODING->dst_object.
8019
8020    If it is Qnil, the decoded text is stored at CODING->destination.
8021    The caller must allocate CODING->dst_bytes bytes at
8022    CODING->destination by xmalloc.  If the decoded text is longer than
8023    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8024    If CODING_ATTR_POST_READ of the coding system is non-nil, it is called after decoding.  */
8025 /* In-place decoding (SRC_OBJECT == DST_OBJECT) restores point and fixes markers at FROM/TO.  */
8026 void
8027 decode_coding_object (struct coding_system *coding,
8028                       Lisp_Object src_object,
8029                       ptrdiff_t from, ptrdiff_t from_byte,
8030                       ptrdiff_t to, ptrdiff_t to_byte,
8031                       Lisp_Object dst_object)
8032 {
8033   ptrdiff_t count = SPECPDL_INDEX ();  /* Passed to unbind_to at the end.  */
8034   unsigned char *destination IF_LINT (= NULL);
8035   ptrdiff_t dst_bytes IF_LINT (= 0);
8036   ptrdiff_t chars = to - from;  /* Source length in characters.  */
8037   ptrdiff_t bytes = to_byte - from_byte;  /* Source length in bytes.  */
8038   Lisp_Object attrs;
8039   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);  /* saved_pt >= 0 only for in-place decoding.  */
8040   bool need_marker_adjustment = 0;
8041   Lisp_Object old_deactivate_mark;
8042
8043   old_deactivate_mark = Vdeactivate_mark;  /* Restored just before returning.  */
8044
8045   if (NILP (dst_object))  /* Raw-memory destination: remember the caller's buffer and its size.  */
8046     {
8047       destination = coding->destination;
8048       dst_bytes = coding->dst_bytes;
8049     }
8050
8051   coding->src_object = src_object;
8052   coding->src_chars = chars;
8053   coding->src_bytes = bytes;
8054   coding->src_multibyte = chars < bytes;  /* More bytes than characters: source is treated as multibyte.  */
8055
8056   if (STRINGP (src_object))
8057     {
8058       coding->src_pos = from;
8059       coding->src_pos_byte = from_byte;
8060     }
8061   else if (BUFFERP (src_object))
8062     {
8063       set_buffer_internal (XBUFFER (src_object));
8064       if (from != GPT)
8065         move_gap_both (from, from_byte);
8066       if (EQ (src_object, dst_object))  /* Decoding a buffer region in place.  */
8067         {
8068           struct Lisp_Marker *tail;
8069
8070           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)  /* Flag markers exactly at FROM (insertion-type) or TO (others).  */
8071             {
8072               tail->need_adjustment
8073                 = tail->charpos == (tail->insertion_type ? from : to);
8074               need_marker_adjustment |= tail->need_adjustment;
8075             }
8076           saved_pt = PT, saved_pt_byte = PT_BYTE;
8077           TEMP_SET_PT_BOTH (from, from_byte);
8078           current_buffer->text->inhibit_shrinking = 1;  /* Cleared again in the saved_pt block below.  */
8079           del_range_both (from, from_byte, to, to_byte, 1);
8080           coding->src_pos = -chars;  /* NOTE(review): negative position presumably denotes the just-deleted text -- confirm in decode_coding.  */
8081           coding->src_pos_byte = -bytes;
8082         }
8083       else
8084         {
8085           coding->src_pos = from;
8086           coding->src_pos_byte = from_byte;
8087         }
8088     }
8089
8090   if (CODING_REQUIRE_DETECTION (coding))
8091     detect_coding (coding);
8092   attrs = CODING_ID_ATTRS (coding->id);  /* Looked up only after any detection step.  */
8093
8094   if (EQ (dst_object, Qt)  /* String result, or raw destination with a post-read function: decode into a work buffer.  */
8095       || (! NILP (CODING_ATTR_POST_READ (attrs))
8096           && NILP (dst_object)))
8097     {
8098       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8099       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8100       coding->dst_pos = BEG;
8101       coding->dst_pos_byte = BEG_BYTE;
8102     }
8103   else if (BUFFERP (dst_object))  /* Decode at point of DST_OBJECT; multibyteness follows that buffer.  */
8104     {
8105       code_conversion_save (0, 0);
8106       coding->dst_object = dst_object;
8107       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8108       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8109       coding->dst_multibyte
8110         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8111     }
8112   else
8113     {
8114       code_conversion_save (0, 0);
8115       coding->dst_object = Qnil;
8116       /* Most callers presume this will return a multibyte result, and they
8117          won't use `binary' or `raw-text' anyway, so let's not worry about
8118          CODING_FOR_UNIBYTE.  */
8119       coding->dst_multibyte = 1;
8120     }
8121
8122   decode_coding (coding);
8123
8124   if (BUFFERP (coding->dst_object))
8125     set_buffer_internal (XBUFFER (coding->dst_object));
8126
8127   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8128     {
8129       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8130       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;  /* Buffer end before the call, to measure the size change it causes.  */
8131       Lisp_Object val;
8132
8133       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);  /* Point at the start of the decoded text.  */
8134       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8135               old_deactivate_mark);
8136       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8137                         make_number (coding->produced_char));
8138       UNGCPRO;
8139       CHECK_NATNUM (val);
8140       coding->produced_char += Z - prev_Z;  /* Account for text the function added or removed.  */
8141       coding->produced += Z_BYTE - prev_Z_BYTE;
8142     }
8143
8144   if (EQ (dst_object, Qt))  /* The string result is taken from the current buffer.  */
8145     {
8146       coding->dst_object = Fbuffer_string ();
8147     }
8148   else if (NILP (dst_object) && BUFFERP (coding->dst_object))  /* Raw destination that was decoded via a work buffer.  */
8149     {
8150       set_buffer_internal (XBUFFER (coding->dst_object));
8151       if (dst_bytes < coding->produced)  /* NOTE(review): the copy below happens only when the caller's buffer is too small -- confirm intent.  */
8152         {
8153           eassert (coding->produced > 0);
8154           destination = xrealloc (destination, coding->produced);
8155           if (BEGV < GPT && GPT < BEGV + coding->produced_char)  /* Gap lies inside the text to copy: move it out of the way.  */
8156             move_gap_both (BEGV, BEGV_BYTE);
8157           memcpy (destination, BEGV_ADDR, coding->produced);
8158           coding->destination = destination;
8159         }
8160     }
8161
8162   if (saved_pt >= 0)
8163     {
8164       /* This is the case of:
8165          (BUFFERP (src_object) && EQ (src_object, dst_object))
8166          As we have moved PT while replacing the original buffer
8167          contents, we must recover it now.  */
8168       set_buffer_internal (XBUFFER (src_object));
8169       current_buffer->text->inhibit_shrinking = 0;
8170       if (saved_pt < from)  /* Point was before the region: keep it.  */
8171         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8172       else if (saved_pt < from + chars)  /* Point was inside the region: move it to the region start.  */
8173         TEMP_SET_PT_BOTH (from, from_byte);
8174       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))  /* Point was after: shift by the size change.  */
8175         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8176                           saved_pt_byte + (coding->produced - bytes));
8177       else  /* Unibyte buffer: the character shift equals the byte shift.  */
8178         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8179                           saved_pt_byte + (coding->produced - bytes));
8180
8181       if (need_marker_adjustment)
8182         {
8183           struct Lisp_Marker *tail;
8184
8185           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8186             if (tail->need_adjustment)
8187               {
8188                 tail->need_adjustment = 0;
8189                 if (tail->insertion_type)  /* Was flagged at FROM: put it back at FROM.  */
8190                   {
8191                     tail->bytepos = from_byte;
8192                     tail->charpos = from;
8193                   }
8194                 else  /* Was flagged at TO: place it at the end of the decoded text.  */
8195                   {
8196                     tail->bytepos = from_byte + coding->produced;
8197                     tail->charpos
8198                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8199                          ? tail->bytepos : from + coding->produced_char);
8200                   }
8201               }
8202         }
8203     }
8204
8205   Vdeactivate_mark = old_deactivate_mark;
8206   unbind_to (count, coding->dst_object);
8207 }
8208 /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE of SRC_OBJECT into DST_OBJECT by CODING.  */
8209 /* If CODING_ATTR_PRE_WRITE is non-nil, it is first run on a copy of the source in a work buffer.  */
8210 void
8211 encode_coding_object (struct coding_system *coding,
8212                       Lisp_Object src_object,
8213                       ptrdiff_t from, ptrdiff_t from_byte,
8214                       ptrdiff_t to, ptrdiff_t to_byte,
8215                       Lisp_Object dst_object)
8216 {
8217   ptrdiff_t count = SPECPDL_INDEX ();  /* Passed to unbind_to at the end.  */
8218   ptrdiff_t chars = to - from;  /* Source length in characters.  */
8219   ptrdiff_t bytes = to_byte - from_byte;  /* Source length in bytes.  */
8220   Lisp_Object attrs;
8221   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);  /* saved_pt >= 0 only for in-place encoding.  */
8222   bool need_marker_adjustment = 0;
8223   bool kill_src_buffer = 0;  /* Set when the pre-write function leaves a different buffer current.  */
8224   Lisp_Object old_deactivate_mark;
8225
8226   old_deactivate_mark = Vdeactivate_mark;  /* Restored just before returning.  */
8227
8228   coding->src_object = src_object;
8229   coding->src_chars = chars;
8230   coding->src_bytes = bytes;
8231   coding->src_multibyte = chars < bytes;  /* More bytes than characters: source is treated as multibyte.  */
8232
8233   attrs = CODING_ID_ATTRS (coding->id);
8234
8235   if (EQ (src_object, dst_object))  /* In place: flag markers of current_buffer exactly at FROM or TO.  */
8236     {
8237       struct Lisp_Marker *tail;
8238
8239       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8240         {
8241           tail->need_adjustment
8242             = tail->charpos == (tail->insertion_type ? from : to);
8243           need_marker_adjustment |= tail->need_adjustment;
8244         }
8245     }
8246
8247   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8248     {
8249       coding->src_object = code_conversion_save (1, coding->src_multibyte);  /* Work buffer that receives a copy of the source.  */
8250       set_buffer_internal (XBUFFER (coding->src_object));
8251       if (STRINGP (src_object))
8252         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8253       else if (BUFFERP (src_object))
8254         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8255       else
8256         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8257
8258       if (EQ (src_object, dst_object))  /* In place: the original text is deleted now that it has been copied.  */
8259         {
8260           set_buffer_internal (XBUFFER (src_object));
8261           saved_pt = PT, saved_pt_byte = PT_BYTE;
8262           del_range_both (from, from_byte, to, to_byte, 1);
8263           set_buffer_internal (XBUFFER (coding->src_object));
8264         }
8265
8266       {
8267         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8268
8269         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8270                 old_deactivate_mark);
8271         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),  /* Called with the bounds of the whole work buffer.  */
8272                     make_number (BEG), make_number (Z));
8273         UNGCPRO;
8274       }
8275       if (XBUFFER (coding->src_object) != current_buffer)
8276         kill_src_buffer = 1;
8277       coding->src_object = Fcurrent_buffer ();  /* Encode whatever buffer is current after the call.  */
8278       if (BEG != GPT)
8279         move_gap_both (BEG, BEG_BYTE);
8280       coding->src_chars = Z - BEG;  /* The source is now that entire buffer.  */
8281       coding->src_bytes = Z_BYTE - BEG_BYTE;
8282       coding->src_pos = BEG;
8283       coding->src_pos_byte = BEG_BYTE;
8284       coding->src_multibyte = Z < Z_BYTE;
8285     }
8286   else if (STRINGP (src_object))
8287     {
8288       code_conversion_save (0, 0);
8289       coding->src_pos = from;
8290       coding->src_pos_byte = from_byte;
8291     }
8292   else if (BUFFERP (src_object))
8293     {
8294       code_conversion_save (0, 0);
8295       set_buffer_internal (XBUFFER (src_object));
8296       if (EQ (src_object, dst_object))
8297         {
8298           saved_pt = PT, saved_pt_byte = PT_BYTE;
8299           coding->src_object = del_range_1 (from, to, 1, 1);  /* Source becomes the object returned for the deleted range.  */
8300           coding->src_pos = 0;
8301           coding->src_pos_byte = 0;
8302         }
8303       else
8304         {
8305           if (from < GPT && to >= GPT)  /* Gap falls within the source range: move it to FROM.  */
8306             move_gap_both (from, from_byte);
8307           coding->src_pos = from;
8308           coding->src_pos_byte = from_byte;
8309         }
8310     }
8311   else
8312     code_conversion_save (0, 0);
8313
8314   if (BUFFERP (dst_object))
8315     {
8316       coding->dst_object = dst_object;
8317       if (EQ (src_object, dst_object))  /* In place: output goes where the source started.  */
8318         {
8319           coding->dst_pos = from;
8320           coding->dst_pos_byte = from_byte;
8321         }
8322       else  /* Other buffer: output goes at its point, with the gap moved there.  */
8323         {
8324           struct buffer *current = current_buffer;
8325
8326           set_buffer_temp (XBUFFER (dst_object));
8327           coding->dst_pos = PT;
8328           coding->dst_pos_byte = PT_BYTE;
8329           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8330           set_buffer_temp (current);
8331         }
8332       coding->dst_multibyte
8333         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8334     }
8335   else if (EQ (dst_object, Qt))  /* String result: start with a malloc'ed byte buffer.  */
8336     {
8337       ptrdiff_t dst_bytes = max (1, coding->src_chars);  /* Initial size, at least one byte.  */
8338       coding->dst_object = Qnil;
8339       coding->destination = xmalloc (dst_bytes);
8340       coding->dst_bytes = dst_bytes;
8341       coding->dst_multibyte = 0;
8342     }
8343   else
8344     {
8345       coding->dst_object = Qnil;
8346       coding->dst_multibyte = 0;
8347     }
8348
8349   encode_coding (coding);
8350
8351   if (EQ (dst_object, Qt))
8352     {
8353       if (BUFFERP (coding->dst_object))  /* NOTE(review): holds only if encode_coding replaced the Qnil set above -- confirm.  */
8354         coding->dst_object = Fbuffer_string ();
8355       else
8356         {
8357           coding->dst_object
8358             = make_unibyte_string ((char *) coding->destination,
8359                                    coding->produced);
8360           xfree (coding->destination);  /* The bytes were copied into the string; release the buffer.  */
8361         }
8362     }
8363
8364   if (saved_pt >= 0)
8365     {
8366       /* This is the case of:
8367          (BUFFERP (src_object) && EQ (src_object, dst_object))
8368          As we have moved PT while replacing the original buffer
8369          contents, we must recover it now.  */
8370       set_buffer_internal (XBUFFER (src_object));
8371       if (saved_pt < from)  /* Point was before the region: keep it.  */
8372         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8373       else if (saved_pt < from + chars)  /* Point was inside the region: move it to the region start.  */
8374         TEMP_SET_PT_BOTH (from, from_byte);
8375       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))  /* Point was after: shift by the size change.  */
8376         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8377                           saved_pt_byte + (coding->produced - bytes));
8378       else  /* Unibyte buffer: the character shift equals the byte shift.  */
8379         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8380                           saved_pt_byte + (coding->produced - bytes));
8381
8382       if (need_marker_adjustment)
8383         {
8384           struct Lisp_Marker *tail;
8385
8386           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8387             if (tail->need_adjustment)
8388               {
8389                 tail->need_adjustment = 0;
8390                 if (tail->insertion_type)  /* Was flagged at FROM: put it back at FROM.  */
8391                   {
8392                     tail->bytepos = from_byte;
8393                     tail->charpos = from;
8394                   }
8395                 else  /* Was flagged at TO: place it at the end of the encoded text.  */
8396                   {
8397                     tail->bytepos = from_byte + coding->produced;
8398                     tail->charpos
8399                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8400                          ? tail->bytepos : from + coding->produced_char);
8401                   }
8402               }
8403         }
8404     }
8405
8406   if (kill_src_buffer)  /* coding->src_object is the buffer the pre-write function switched to.  */
8407     Fkill_buffer (coding->src_object);
8408
8409   Vdeactivate_mark = old_deactivate_mark;
8410   unbind_to (count, Qnil);
8411 }
8412
8413 /* Return the name of the coding system registered for the first entry of coding_priorities.  */
8414 Lisp_Object
8415 preferred_coding_system (void)
8416 {
8417   int id = coding_categories[coding_priorities[0]].id;  /* Index 0 is the first category tried by detect_coding_system.  */
8418
8419   return CODING_ID_NAME (id);
8420 }
8421
8422 #if defined (WINDOWSNT) || defined (CYGWIN)
8423 /* Convert the string STR with utf-16le via code_convert_string_norecord and return the result.  */
8424 Lisp_Object
8425 from_unicode (Lisp_Object str)
8426 {
8427   CHECK_STRING (str);
8428   if (!STRING_MULTIBYTE (str) &&  /* Unibyte string with an odd byte count: drop the last byte.  */
8429       SBYTES (str) & 1)
8430     {
8431       str = Fsubstring (str, make_number (0), make_number (-1));
8432     }
8433
8434   return code_convert_string_norecord (str, Qutf_16le, 0);  /* NOTE(review): last argument 0 presumably selects decoding (to_unicode passes 1) -- confirm.  */
8435 }
8436 /* Wrap the wide string WSTR in a unibyte Lisp string and hand it to from_unicode.  */
8437 Lisp_Object
8438 from_unicode_buffer (const wchar_t* wstr)
8439 {
8440     return from_unicode (
8441         make_unibyte_string (
8442             (char*) wstr,
8443             /* we get one of the two final 0 bytes for free. */
8444             1 + sizeof (wchar_t) * wcslen (wstr)));  /* One byte past the characters, i.e. the first byte of the terminator.  */
8445 }
8446 /* Convert STR with utf-16le, store the resulting Lisp string in *BUF, and return WCSDATA of it.  */
8447 wchar_t *
8448 to_unicode (Lisp_Object str, Lisp_Object *buf)
8449 {
8450   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8451   /* We need to make another copy (in addition to the one made by
8452      code_convert_string_norecord) to ensure that the final string is
8453      _doubly_ zero terminated --- that is, that the string is
8454      terminated by two zero bytes and one utf-16le null character.
8455      Because strings are already terminated with a single zero byte,
8456      we just add one additional zero. */
8457   str = make_uninit_string (SBYTES (*buf) + 1);  /* STR is reused as the one-byte-longer copy.  */
8458   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8459   SDATA (str) [SBYTES (*buf)] = '\0';  /* The extra zero byte appended after the converted text.  */
8460   *buf = str;  /* The caller receives the copy, which owns the returned data.  */
8461   return WCSDATA (*buf);
8462 }
8463
8464 #endif /* WINDOWSNT || CYGWIN */
8465
8466 \f
8467 #ifdef emacs
8468 /*** 11. Emacs Lisp library functions ***/
8469 /* Lisp predicate: accepts nil, an object with a coding-system id, or a symbol with a pending define form.  */
8470 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8471        doc: /* Return t if OBJECT is nil or a coding-system.
8472 See the documentation of `define-coding-system' for information
8473 about coding-system objects.  */)
8474   (Lisp_Object object)
8475 {
8476   if (NILP (object)  /* nil, or CODING_SYSTEM_ID finds a non-negative id for OBJECT.  */
8477       || CODING_SYSTEM_ID (object) >= 0)
8478     return Qt;
8479   if (! SYMBOLP (object)  /* Otherwise only a symbol whose Qcoding_system_define_form property is non-nil qualifies.  */
8480       || NILP (Fget (object, Qcoding_system_define_form)))
8481     return Qnil;
8482   return Qt;
8483 }
8484 /* Prompt repeatedly until the user enters a non-empty name, then return it interned as a symbol.  */
8485 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8486        Sread_non_nil_coding_system, 1, 1, 0,
8487        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8488   (Lisp_Object prompt)
8489 {
8490   Lisp_Object val;
8491   do
8492     {
8493       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,  /* Candidates come from Vcoding_system_alist.  */
8494                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8495     }
8496   while (SCHARS (val) == 0);  /* Empty input is not accepted: ask again.  */
8497   return (Fintern (val, Qnil));
8498 }
8499 /* Like Fread_non_nil_coding_system, but asks once, offers a default, and returns nil for empty input.  */
8500 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8501        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8502 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8503 Ignores case when completing coding systems (all Emacs coding systems
8504 are lower-case).  */)
8505   (Lisp_Object prompt, Lisp_Object default_coding_system)
8506 {
8507   Lisp_Object val;
8508   ptrdiff_t count = SPECPDL_INDEX ();  /* So the specbind below can be undone.  */
8509
8510   if (SYMBOLP (default_coding_system))  /* A symbol default is passed on as its name string.  */
8511     default_coding_system = SYMBOL_NAME (default_coding_system);
8512   specbind (Qcompletion_ignore_case, Qt);  /* Bound only for the duration of the read.  */
8513   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8514                           Qt, Qnil, Qcoding_system_history,
8515                           default_coding_system, Qnil);
8516   unbind_to (count, Qnil);
8517   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));  /* Empty result yields nil; otherwise the interned symbol.  */
8518 }
8519 /* Validate CODING-SYSTEM, first evaluating any define form stored on its property list.  */
8520 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8521        1, 1, 0,
8522        doc: /* Check validity of CODING-SYSTEM.
8523 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8524 It is valid if it is nil or a symbol defined as a coding system by the
8525 function `define-coding-system'.  */)
8526   (Lisp_Object coding_system)
8527 {
8528   Lisp_Object define_form;
8529
8530   define_form = Fget (coding_system, Qcoding_system_define_form);
8531   if (! NILP (define_form))  /* A stored define form exists: evaluate it now.  */
8532     {
8533       Fput (coding_system, Qcoding_system_define_form, Qnil);  /* Cleared before evaluation, so the form is evaluated at most once.  */
8534       safe_eval (define_form);
8535     }
8536   if (!NILP (Fcoding_system_p (coding_system)))
8537     return coding_system;
8538   xsignal1 (Qcoding_system_error, coding_system);  /* No return follows; presumably xsignal1 does not return.  */
8539 }
8540
8541 \f
8542 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8543    HIGHEST, return the coding system of the highest
8544    priority among the detected coding systems.  Otherwise return a
8545    list of detected coding systems sorted by their priorities.  If
8546    MULTIBYTEP, it is assumed that the bytes are in correct
8547    multibyte form but contain only ASCII and eight-bit chars.
8548    Otherwise, the bytes are raw bytes.
8549
8550    CODING-SYSTEM controls the detection as below:
8551
8552    If it is nil, detect both text-format and eol-format.  If the
8553    text-format part of CODING-SYSTEM is already specified
8554    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8555    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8556    detect only text-format.  */
8557 /* When HIGHEST, the result is the first element of the collected list, or nil if the list is empty.  */
8558 Lisp_Object
8559 detect_coding_system (const unsigned char *src,
8560                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8561                       bool highest, bool multibytep,
8562                       Lisp_Object coding_system)
8563 {
8564   const unsigned char *src_end = src + src_bytes;
8565   Lisp_Object attrs, eol_type;
8566   Lisp_Object val = Qnil;  /* List of coding system ids, later rewritten into names.  */
8567   struct coding_system coding;
8568   ptrdiff_t id;
8569   struct coding_detection_info detect_info;
8570   enum coding_category base_category;
8571   bool null_byte_found = 0, eight_bit_found = 0;
8572
8573   if (NILP (coding_system))  /* nil is handled as `undecided'.  */
8574     coding_system = Qundecided;
8575   setup_coding_system (coding_system, &coding);
8576   attrs = CODING_ID_ATTRS (coding.id);
8577   eol_type = CODING_ID_EOL_TYPE (coding.id);
8578   coding_system = CODING_ATTR_BASE_NAME (attrs);
8579
8580   coding.source = src;
8581   coding.src_chars = src_chars;
8582   coding.src_bytes = src_bytes;
8583   coding.src_multibyte = multibytep;
8584   coding.consumed = 0;
8585   coding.mode |= CODING_MODE_LAST_BLOCK;
8586   coding.head_ascii = 0;  /* Counts the bytes seen before the first 8-bit byte in the scan below.  */
8587
8588   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8589
8590   /* At first, detect text-format if necessary.  */
8591   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8592   if (base_category == coding_category_undecided)
8593     {
8594       enum coding_category category IF_LINT (= 0);
8595       struct coding_system *this IF_LINT (= NULL);
8596       int c, i;
8597       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,  /* Coding system's own setting combined with the global one.  */
8598                                        inhibit_null_byte_detection);
8599       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8600                                        inhibit_iso_escape_detection);
8601       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8602
8603       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8604       for (; src < src_end; src++)
8605         {
8606           c = *src;
8607           if (c & 0x80)
8608             {
8609               eight_bit_found = 1;
8610               if (null_byte_found)  /* Both a null byte and an 8-bit byte seen: stop scanning.  */
8611                 break;
8612             }
8613           else if (c < 0x20)
8614             {
8615               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8616                   && ! inhibit_ied
8617                   && ! detect_info.checked)  /* Run the ISO-2022 detector only while nothing has been checked yet.  */
8618                 {
8619                   if (detect_coding_iso_2022 (&coding, &detect_info))
8620                     {
8621                       /* We have scanned the whole data.  */
8622                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8623                         {
8624                           /* We didn't find an 8-bit code.  We may
8625                              have found a null-byte, but it's very
8626                              rare that a binary file conforms to
8627                              ISO-2022.  */
8628                           src = src_end;
8629                           coding.head_ascii = src - coding.source;
8630                         }
8631                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;  /* Reject every category outside the ISO-escape mask.  */
8632                       break;
8633                     }
8634                 }
8635               else if (! c && !inhibit_nbd)
8636                 {
8637                   null_byte_found = 1;
8638                   if (eight_bit_found)
8639                     break;
8640                 }
8641               if (! eight_bit_found)
8642                 coding.head_ascii++;
8643             }
8644           else if (! eight_bit_found)
8645             coding.head_ascii++;
8646         }
8647
8648       if (null_byte_found || eight_bit_found  /* Something beyond plain leading ASCII was seen: consult the categories.  */
8649           || coding.head_ascii < coding.src_bytes
8650           || detect_info.found)
8651         {
8652           if (coding.head_ascii == coding.src_bytes)
8653             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8654             for (i = 0; i < coding_category_raw_text; i++)  /* Stop at the first category, in priority order, already marked found.  */
8655               {
8656                 category = coding_priorities[i];
8657                 this = coding_categories + category;
8658                 if (detect_info.found & (1 << category))
8659                   break;
8660               }
8661           else
8662             {
8663               if (null_byte_found)  /* Mark everything outside the UTF-16 mask as checked and rejected.  */
8664                 {
8665                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8666                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8667                 }
8668               else if (prefer_utf_8  /* UTF-8 preferred and detected: mark everything outside the UTF-8 mask as rejected.  */
8669                        && detect_coding_utf_8 (&coding, &detect_info))
8670                 {
8671                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8672                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8673                 }
8674               for (i = 0; i < coding_category_raw_text; i++)  /* Walk the categories in priority order.  */
8675                 {
8676                   category = coding_priorities[i];
8677                   this = coding_categories + category;
8678
8679                   if (this->id < 0)
8680                     {
8681                       /* No coding system of this category is defined.  */
8682                       detect_info.rejected |= (1 << category);
8683                     }
8684                   else if (category >= coding_category_raw_text)
8685                     continue;
8686                   else if (detect_info.checked & (1 << category))  /* Already checked: no need to run its detector again.  */
8687                     {
8688                       if (highest
8689                           && (detect_info.found & (1 << category)))
8690                         break;
8691                     }
8692                   else if ((*(this->detector)) (&coding, &detect_info)  /* Run this category's detector.  */
8693                            && highest
8694                            && (detect_info.found & (1 << category)))
8695                     {
8696                       if (category == coding_category_utf_16_auto)  /* Narrow utf-16-auto down to LE or BE.  */
8697                         {
8698                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8699                             category = coding_category_utf_16_le;
8700                           else
8701                             category = coding_category_utf_16_be;
8702                         }
8703                       break;
8704                     }
8705                 }
8706             }
8707         }
8708
8709       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY  /* Every category rejected, or a null byte seen: report no-conversion.  */
8710           || null_byte_found)
8711         {
8712           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8713           id = CODING_SYSTEM_ID (Qno_conversion);
8714           val = list1 (make_number (id));
8715         }
8716       else if (! detect_info.rejected && ! detect_info.found)  /* Nothing rejected and nothing found: report the undecided category.  */
8717         {
8718           detect_info.found = CATEGORY_MASK_ANY;
8719           id = coding_categories[coding_category_undecided].id;
8720           val = list1 (make_number (id));
8721         }
8722       else if (highest)
8723         {
8724           if (detect_info.found)  /* Use the category at which the loops above stopped.  */
8725             {
8726               detect_info.found = 1 << category;
8727               val = list1 (make_number (this->id));
8728             }
8729           else
8730             for (i = 0; i < coding_category_raw_text; i++)  /* Otherwise take the first category, by priority, not rejected.  */
8731               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8732                 {
8733                   detect_info.found = 1 << coding_priorities[i];
8734                   id = coding_categories[coding_priorities[i]].id;
8735                   val = list1 (make_number (id));
8736                   break;
8737                 }
8738         }
8739       else
8740         {
8741           int mask = detect_info.rejected | detect_info.found;
8742           int found = 0;
8743
8744           for (i = coding_category_raw_text - 1; i >= 0; i--)  /* Categories neither rejected nor found, last priority first.  */
8745             {
8746               category = coding_priorities[i];
8747               if (! (mask & (1 << category)))
8748                 {
8749                   found |= 1 << category;
8750                   id = coding_categories[category].id;
8751                   if (id >= 0)
8752                     val = list1 (make_number (id));  /* Each assignment replaces VAL, so only the last one made survives.  */
8753                 }
8754             }
8755           for (i = coding_category_raw_text - 1; i >= 0; i--)  /* Prepend found categories; iterating backwards leaves the first priority at the head.  */
8756             {
8757               category = coding_priorities[i];
8758               if (detect_info.found & (1 << category))
8759                 {
8760                   id = coding_categories[category].id;
8761                   val = Fcons (make_number (id), val);
8762                 }
8763             }
8764           detect_info.found |= found;
8765         }
8766     }
8767   else if (base_category == coding_category_utf_8_auto)  /* Only the with-signature / without-signature choice is detected.  */
8768     {
8769       if (detect_coding_utf_8 (&coding, &detect_info))
8770         {
8771           struct coding_system *this;
8772
8773           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8774             this = coding_categories + coding_category_utf_8_sig;
8775           else
8776             this = coding_categories + coding_category_utf_8_nosig;
8777           val = list1 (make_number (this->id));
8778         }
8779     }
8780   else if (base_category == coding_category_utf_16_auto)  /* Choose among the LE/BE and signature/no-signature variants.  */
8781     {
8782       if (detect_coding_utf_16 (&coding, &detect_info))
8783         {
8784           struct coding_system *this;
8785
8786           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8787             this = coding_categories + coding_category_utf_16_le;
8788           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8789             this = coding_categories + coding_category_utf_16_be;
8790           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8791             this = coding_categories + coding_category_utf_16_be_nosig;
8792           else
8793             this = coding_categories + coding_category_utf_16_le_nosig;
8794           val = list1 (make_number (this->id));
8795         }
8796     }
8797   else  /* Any other base category: report the given coding system itself.  */
8798     {
8799       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8800       val = list1 (make_number (coding.id));
8801     }
8802
8803   /* Then, detect eol-format if necessary.  */
8804   {
8805     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;  /* -1 means not determined.  */
8806     Lisp_Object tail;
8807
8808     if (VECTORP (eol_type))  /* eol_type is a vector (indexed 0, 1, 2 below): detect the format from the data.  */
8809       {
8810         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8811           {
8812             if (null_byte_found)  /* With a null byte, LF is assumed without calling detect_eol.  */
8813               normal_eol = EOL_SEEN_LF;
8814             else
8815               normal_eol = detect_eol (coding.source, src_bytes,
8816                                        coding_category_raw_text);
8817           }
8818         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8819                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8820           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8821                                       coding_category_utf_16_be);
8822         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8823                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8824           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8825                                       coding_category_utf_16_le);
8826       }
8827     else  /* eol_type is not a vector: use the format it names for all three.  */
8828       {
8829         if (EQ (eol_type, Qunix))
8830           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8831         else if (EQ (eol_type, Qdos))
8832           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8833         else
8834           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8835       }
8836
8837     for (tail = val; CONSP (tail); tail = XCDR (tail))  /* Replace each id in VAL by a coding system name.  */
8838       {
8839         enum coding_category category;
8840         int this_eol;
8841
8842         id = XINT (XCAR (tail));
8843         attrs = CODING_ID_ATTRS (id);
8844         category = XINT (CODING_ATTR_CATEGORY (attrs));
8845         eol_type = CODING_ID_EOL_TYPE (id);
8846         if (VECTORP (eol_type))
8847           {
8848             if (category == coding_category_utf_16_be
8849                 || category == coding_category_utf_16_be_nosig)
8850               this_eol = utf_16_be_eol;
8851             else if (category == coding_category_utf_16_le
8852                      || category == coding_category_utf_16_le_nosig)
8853               this_eol = utf_16_le_eol;
8854             else
8855               this_eol = normal_eol;
8856
8857             if (this_eol == EOL_SEEN_LF)  /* Vector slots 0, 1, 2 are used for LF, CRLF and CR respectively.  */
8858               XSETCAR (tail, AREF (eol_type, 0));
8859             else if (this_eol == EOL_SEEN_CRLF)
8860               XSETCAR (tail, AREF (eol_type, 1));
8861             else if (this_eol == EOL_SEEN_CR)
8862               XSETCAR (tail, AREF (eol_type, 2));
8863             else  /* Any other value: keep the name of the coding system itself.  */
8864               XSETCAR (tail, CODING_ID_NAME (id));
8865           }
8866         else
8867           XSETCAR (tail, CODING_ID_NAME (id));
8868       }
8869   }
8870
8871   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8872 }
8873
8874
8875 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8876        2, 3, 0,
8877        doc: /* Detect coding system of the text in the region between START and END.
8878 Return a list of possible coding systems ordered by priority.
8879 The coding systems to try and their priorities follows what
8880 the function `coding-system-priority-list' (which see) returns.
8881
8882 If only ASCII characters are found (except for such ISO-2022 control
8883 characters as ESC), it returns a list of single element `undecided'
8884 or its subsidiary coding system according to a detected end-of-line
8885 format.
8886
8887 If optional argument HIGHEST is non-nil, return the coding system of
8888 highest priority.  */)
8889   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8890 {
8891   ptrdiff_t from, to;
8892   ptrdiff_t from_byte, to_byte;
8893
8894   validate_region (&start, &end);
8895   from = XINT (start), to = XINT (end);
8896   from_byte = CHAR_TO_BYTE (from);
8897   to_byte = CHAR_TO_BYTE (to);
8898
8899   if (from < GPT && to >= GPT)
8900     move_gap_both (to, to_byte);
8901
8902   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8903                                to - from, to_byte - from_byte,
8904                                !NILP (highest),
8905                                !NILP (BVAR (current_buffer
8906                                       , enable_multibyte_characters)),
8907                                Qnil);
8908 }
8909
8910 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8911        1, 2, 0,
8912        doc: /* Detect coding system of the text in STRING.
8913 Return a list of possible coding systems ordered by priority.
8914 The coding systems to try and their priorities follows what
8915 the function `coding-system-priority-list' (which see) returns.
8916
8917 If only ASCII characters are found (except for such ISO-2022 control
8918 characters as ESC), it returns a list of single element `undecided'
8919 or its subsidiary coding system according to a detected end-of-line
8920 format.
8921
8922 If optional argument HIGHEST is non-nil, return the coding system of
8923 highest priority.  */)
8924   (Lisp_Object string, Lisp_Object highest)
8925 {
8926   CHECK_STRING (string);
8927
8928   return detect_coding_system (SDATA (string),
8929                                SCHARS (string), SBYTES (string),
8930                                !NILP (highest), STRING_MULTIBYTE (string),
8931                                Qnil);
8932 }
8933
8934
8935 static bool
8936 char_encodable_p (int c, Lisp_Object attrs)
8937 {
8938   Lisp_Object tail;
8939   struct charset *charset;
8940   Lisp_Object translation_table;
8941
8942   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8943   if (! NILP (translation_table))
8944     c = translate_char (translation_table, c);
8945   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8946        CONSP (tail); tail = XCDR (tail))
8947     {
8948       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8949       if (CHAR_CHARSET_P (c, charset))
8950         break;
8951     }
8952   return (! NILP (tail));
8953 }
8954
8955
8956 /* Return a list of coding systems that safely encode the text between
8957    START and END.  If EXCLUDE is non-nil, it is a list of coding
8958    systems not to check.  The returned list doesn't contain any such
8959    coding systems.  In any case, if the text contains only ASCII or is
8960    unibyte, return t.  */
8961
8962 DEFUN ("find-coding-systems-region-internal",
8963        Ffind_coding_systems_region_internal,
8964        Sfind_coding_systems_region_internal, 2, 3, 0,
8965        doc: /* Internal use only.  */)
8966   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8967 {
8968   Lisp_Object coding_attrs_list, safe_codings;
8969   ptrdiff_t start_byte, end_byte;
8970   const unsigned char *p, *pbeg, *pend;
8971   int c;
8972   Lisp_Object tail, elt, work_table;
8973
8974   if (STRINGP (start))
8975     {
8976       if (!STRING_MULTIBYTE (start)
8977           || SCHARS (start) == SBYTES (start))
8978         return Qt;
8979       start_byte = 0;
8980       end_byte = SBYTES (start);
8981     }
8982   else
8983     {
8984       CHECK_NUMBER_COERCE_MARKER (start);
8985       CHECK_NUMBER_COERCE_MARKER (end);
8986       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8987         args_out_of_range (start, end);
8988       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8989         return Qt;
8990       start_byte = CHAR_TO_BYTE (XINT (start));
8991       end_byte = CHAR_TO_BYTE (XINT (end));
8992       if (XINT (end) - XINT (start) == end_byte - start_byte)
8993         return Qt;
8994
8995       if (XINT (start) < GPT && XINT (end) > GPT)
8996         {
8997           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8998             move_gap_both (XINT (start), start_byte);
8999           else
9000             move_gap_both (XINT (end), end_byte);
9001         }
9002     }
9003
9004   coding_attrs_list = Qnil;
9005   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9006     if (NILP (exclude)
9007         || NILP (Fmemq (XCAR (tail), exclude)))
9008       {
9009         Lisp_Object attrs;
9010
9011         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9012         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9013           {
9014             ASET (attrs, coding_attr_trans_tbl,
9015                   get_translation_table (attrs, 1, NULL));
9016             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9017           }
9018       }
9019
9020   if (STRINGP (start))
9021     p = pbeg = SDATA (start);
9022   else
9023     p = pbeg = BYTE_POS_ADDR (start_byte);
9024   pend = p + (end_byte - start_byte);
9025
9026   while (p < pend && ASCII_BYTE_P (*p)) p++;
9027   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9028
9029   work_table = Fmake_char_table (Qnil, Qnil);
9030   while (p < pend)
9031     {
9032       if (ASCII_BYTE_P (*p))
9033         p++;
9034       else
9035         {
9036           c = STRING_CHAR_ADVANCE (p);
9037           if (!NILP (char_table_ref (work_table, c)))
9038             /* This character was already checked.  Ignore it.  */
9039             continue;
9040
9041           charset_map_loaded = 0;
9042           for (tail = coding_attrs_list; CONSP (tail);)
9043             {
9044               elt = XCAR (tail);
9045               if (NILP (elt))
9046                 tail = XCDR (tail);
9047               else if (char_encodable_p (c, elt))
9048                 tail = XCDR (tail);
9049               else if (CONSP (XCDR (tail)))
9050                 {
9051                   XSETCAR (tail, XCAR (XCDR (tail)));
9052                   XSETCDR (tail, XCDR (XCDR (tail)));
9053                 }
9054               else
9055                 {
9056                   XSETCAR (tail, Qnil);
9057                   tail = XCDR (tail);
9058                 }
9059             }
9060           if (charset_map_loaded)
9061             {
9062               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9063
9064               if (STRINGP (start))
9065                 pbeg = SDATA (start);
9066               else
9067                 pbeg = BYTE_POS_ADDR (start_byte);
9068               p = pbeg + p_offset;
9069               pend = pbeg + pend_offset;
9070             }
9071           char_table_set (work_table, c, Qt);
9072         }
9073     }
9074
9075   safe_codings = list2 (Qraw_text, Qno_conversion);
9076   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9077     if (! NILP (XCAR (tail)))
9078       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9079
9080   return safe_codings;
9081 }
9082
9083
9084 DEFUN ("unencodable-char-position", Funencodable_char_position,
9085        Sunencodable_char_position, 3, 5, 0,
9086        doc: /*
9087 Return position of first un-encodable character in a region.
9088 START and END specify the region and CODING-SYSTEM specifies the
9089 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9090
9091 If optional 4th argument COUNT is non-nil, it specifies at most how
9092 many un-encodable characters to search.  In this case, the value is a
9093 list of positions.
9094
9095 If optional 5th argument STRING is non-nil, it is a string to search
9096 for un-encodable characters.  In that case, START and END are indexes
9097 to the string.  */)
9098   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9099 {
9100   EMACS_INT n;
9101   struct coding_system coding;
9102   Lisp_Object attrs, charset_list, translation_table;
9103   Lisp_Object positions;
9104   ptrdiff_t from, to;
9105   const unsigned char *p, *stop, *pend;
9106   bool ascii_compatible;
9107
9108   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9109   attrs = CODING_ID_ATTRS (coding.id);
9110   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9111     return Qnil;
9112   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9113   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9114   translation_table = get_translation_table (attrs, 1, NULL);
9115
9116   if (NILP (string))
9117     {
9118       validate_region (&start, &end);
9119       from = XINT (start);
9120       to = XINT (end);
9121       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9122           || (ascii_compatible
9123               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9124         return Qnil;
9125       p = CHAR_POS_ADDR (from);
9126       pend = CHAR_POS_ADDR (to);
9127       if (from < GPT && to >= GPT)
9128         stop = GPT_ADDR;
9129       else
9130         stop = pend;
9131     }
9132   else
9133     {
9134       CHECK_STRING (string);
9135       CHECK_NATNUM (start);
9136       CHECK_NATNUM (end);
9137       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9138         args_out_of_range_3 (string, start, end);
9139       from = XINT (start);
9140       to = XINT (end);
9141       if (! STRING_MULTIBYTE (string))
9142         return Qnil;
9143       p = SDATA (string) + string_char_to_byte (string, from);
9144       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9145       if (ascii_compatible && (to - from) == (pend - p))
9146         return Qnil;
9147     }
9148
9149   if (NILP (count))
9150     n = 1;
9151   else
9152     {
9153       CHECK_NATNUM (count);
9154       n = XINT (count);
9155     }
9156
9157   positions = Qnil;
9158   charset_map_loaded = 0;
9159   while (1)
9160     {
9161       int c;
9162
9163       if (ascii_compatible)
9164         while (p < stop && ASCII_BYTE_P (*p))
9165           p++, from++;
9166       if (p >= stop)
9167         {
9168           if (p >= pend)
9169             break;
9170           stop = pend;
9171           p = GAP_END_ADDR;
9172         }
9173
9174       c = STRING_CHAR_ADVANCE (p);
9175       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9176           && ! char_charset (translate_char (translation_table, c),
9177                              charset_list, NULL))
9178         {
9179           positions = Fcons (make_number (from), positions);
9180           n--;
9181           if (n == 0)
9182             break;
9183         }
9184
9185       from++;
9186       if (charset_map_loaded && NILP (string))
9187         {
9188           p = CHAR_POS_ADDR (from);
9189           pend = CHAR_POS_ADDR (to);
9190           if (from < GPT && to >= GPT)
9191             stop = GPT_ADDR;
9192           else
9193             stop = pend;
9194           charset_map_loaded = 0;
9195         }
9196     }
9197
9198   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9199 }
9200
9201
9202 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9203        Scheck_coding_systems_region, 3, 3, 0,
9204        doc: /* Check if the region is encodable by coding systems.
9205
9206 START and END are buffer positions specifying the region.
9207 CODING-SYSTEM-LIST is a list of coding systems to check.
9208
9209 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9210 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9211 whole region, POS0, POS1, ... are buffer positions where non-encodable
9212 characters are found.
9213
9214 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9215 value is nil.
9216
9217 START may be a string.  In that case, check if the string is
9218 encodable, and the value contains indices to the string instead of
9219 buffer positions.  END is ignored.
9220
9221 If the current buffer (or START if it is a string) is unibyte, the value
9222 is nil.  */)
9223   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9224 {
9225   Lisp_Object list;
9226   ptrdiff_t start_byte, end_byte;
9227   ptrdiff_t pos;
9228   const unsigned char *p, *pbeg, *pend;
9229   int c;
9230   Lisp_Object tail, elt, attrs;
9231
9232   if (STRINGP (start))
9233     {
9234       if (!STRING_MULTIBYTE (start)
9235           || SCHARS (start) == SBYTES (start))
9236         return Qnil;
9237       start_byte = 0;
9238       end_byte = SBYTES (start);
9239       pos = 0;
9240     }
9241   else
9242     {
9243       CHECK_NUMBER_COERCE_MARKER (start);
9244       CHECK_NUMBER_COERCE_MARKER (end);
9245       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9246         args_out_of_range (start, end);
9247       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9248         return Qnil;
9249       start_byte = CHAR_TO_BYTE (XINT (start));
9250       end_byte = CHAR_TO_BYTE (XINT (end));
9251       if (XINT (end) - XINT (start) == end_byte - start_byte)
9252         return Qnil;
9253
9254       if (XINT (start) < GPT && XINT (end) > GPT)
9255         {
9256           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9257             move_gap_both (XINT (start), start_byte);
9258           else
9259             move_gap_both (XINT (end), end_byte);
9260         }
9261       pos = XINT (start);
9262     }
9263
9264   list = Qnil;
9265   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9266     {
9267       elt = XCAR (tail);
9268       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9269       ASET (attrs, coding_attr_trans_tbl,
9270             get_translation_table (attrs, 1, NULL));
9271       list = Fcons (list2 (elt, attrs), list);
9272     }
9273
9274   if (STRINGP (start))
9275     p = pbeg = SDATA (start);
9276   else
9277     p = pbeg = BYTE_POS_ADDR (start_byte);
9278   pend = p + (end_byte - start_byte);
9279
9280   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9281   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9282
9283   while (p < pend)
9284     {
9285       if (ASCII_BYTE_P (*p))
9286         p++;
9287       else
9288         {
9289           c = STRING_CHAR_ADVANCE (p);
9290
9291           charset_map_loaded = 0;
9292           for (tail = list; CONSP (tail); tail = XCDR (tail))
9293             {
9294               elt = XCDR (XCAR (tail));
9295               if (! char_encodable_p (c, XCAR (elt)))
9296                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9297             }
9298           if (charset_map_loaded)
9299             {
9300               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9301
9302               if (STRINGP (start))
9303                 pbeg = SDATA (start);
9304               else
9305                 pbeg = BYTE_POS_ADDR (start_byte);
9306               p = pbeg + p_offset;
9307               pend = pbeg + pend_offset;
9308             }
9309         }
9310       pos++;
9311     }
9312
9313   tail = list;
9314   list = Qnil;
9315   for (; CONSP (tail); tail = XCDR (tail))
9316     {
9317       elt = XCAR (tail);
9318       if (CONSP (XCDR (XCDR (elt))))
9319         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9320                       list);
9321     }
9322
9323   return list;
9324 }
9325
9326
9327 static Lisp_Object
9328 code_convert_region (Lisp_Object start, Lisp_Object end,
9329                      Lisp_Object coding_system, Lisp_Object dst_object,
9330                      bool encodep, bool norecord)
9331 {
9332   struct coding_system coding;
9333   ptrdiff_t from, from_byte, to, to_byte;
9334   Lisp_Object src_object;
9335
9336   if (NILP (coding_system))
9337     coding_system = Qno_conversion;
9338   else
9339     CHECK_CODING_SYSTEM (coding_system);
9340   src_object = Fcurrent_buffer ();
9341   if (NILP (dst_object))
9342     dst_object = src_object;
9343   else if (! EQ (dst_object, Qt))
9344     CHECK_BUFFER (dst_object);
9345
9346   validate_region (&start, &end);
9347   from = XFASTINT (start);
9348   from_byte = CHAR_TO_BYTE (from);
9349   to = XFASTINT (end);
9350   to_byte = CHAR_TO_BYTE (to);
9351
9352   setup_coding_system (coding_system, &coding);
9353   coding.mode |= CODING_MODE_LAST_BLOCK;
9354
9355   if (encodep)
9356     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9357                           dst_object);
9358   else
9359     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9360                           dst_object);
9361   if (! norecord)
9362     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9363
9364   return (BUFFERP (dst_object)
9365           ? make_number (coding.produced_char)
9366           : coding.dst_object);
9367 }
9368
9369
9370 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9371        3, 4, "r\nzCoding system: ",
9372        doc: /* Decode the current region from the specified coding system.
9373 When called from a program, takes four arguments:
9374         START, END, CODING-SYSTEM, and DESTINATION.
9375 START and END are buffer positions.
9376
9377 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9378 If nil, the region between START and END is replaced by the decoded text.
9379 If buffer, the decoded text is inserted in that buffer after point (point
9380 does not move).
9381 In those cases, the length of the decoded text is returned.
9382 If DESTINATION is t, the decoded text is returned.
9383
9384 This function sets `last-coding-system-used' to the precise coding system
9385 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9386 not fully specified.)  */)
9387   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9388 {
9389   return code_convert_region (start, end, coding_system, destination, 0, 0);
9390 }
9391
9392 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9393        3, 4, "r\nzCoding system: ",
9394        doc: /* Encode the current region by specified coding system.
9395 When called from a program, takes four arguments:
9396         START, END, CODING-SYSTEM and DESTINATION.
9397 START and END are buffer positions.
9398
9399 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9400 If nil, the region between START and END is replace by the encoded text.
9401 If buffer, the encoded text is inserted in that buffer after point (point
9402 does not move).
9403 In those cases, the length of the encoded text is returned.
9404 If DESTINATION is t, the encoded text is returned.
9405
9406 This function sets `last-coding-system-used' to the precise coding system
9407 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9408 not fully specified.)  */)
9409   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9410 {
9411   return code_convert_region (start, end, coding_system, destination, 1, 0);
9412 }
9413
9414 Lisp_Object
9415 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9416                      Lisp_Object dst_object, bool encodep, bool nocopy,
9417                      bool norecord)
9418 {
9419   struct coding_system coding;
9420   ptrdiff_t chars, bytes;
9421
9422   CHECK_STRING (string);
9423   if (NILP (coding_system))
9424     {
9425       if (! norecord)
9426         Vlast_coding_system_used = Qno_conversion;
9427       if (NILP (dst_object))
9428         return (nocopy ? Fcopy_sequence (string) : string);
9429     }
9430
9431   if (NILP (coding_system))
9432     coding_system = Qno_conversion;
9433   else
9434     CHECK_CODING_SYSTEM (coding_system);
9435   if (NILP (dst_object))
9436     dst_object = Qt;
9437   else if (! EQ (dst_object, Qt))
9438     CHECK_BUFFER (dst_object);
9439
9440   setup_coding_system (coding_system, &coding);
9441   coding.mode |= CODING_MODE_LAST_BLOCK;
9442   chars = SCHARS (string);
9443   bytes = SBYTES (string);
9444   if (encodep)
9445     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9446   else
9447     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9448   if (! norecord)
9449     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9450
9451   return (BUFFERP (dst_object)
9452           ? make_number (coding.produced_char)
9453           : coding.dst_object);
9454 }
9455
9456
9457 /* Encode or decode STRING according to CODING_SYSTEM.
9458    Do not set Vlast_coding_system_used.
9459
9460    This function is called only from macros DECODE_FILE and
9461    ENCODE_FILE, thus we ignore character composition.  */
9462
9463 Lisp_Object
9464 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9465                               bool encodep)
9466 {
9467   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9468 }
9469
9470
9471 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9472        2, 4, 0,
9473        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9474
9475 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9476 if the decoding operation is trivial.
9477
9478 Optional fourth arg BUFFER non-nil means that the decoded text is
9479 inserted in that buffer after point (point does not move).  In this
9480 case, the return value is the length of the decoded text.
9481
9482 This function sets `last-coding-system-used' to the precise coding system
9483 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9484 not fully specified.)  */)
9485   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9486 {
9487   return code_convert_string (string, coding_system, buffer,
9488                               0, ! NILP (nocopy), 0);
9489 }
9490
9491 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9492        2, 4, 0,
9493        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9494
9495 Optional third arg NOCOPY non-nil means it is OK to return STRING
9496 itself if the encoding operation is trivial.
9497
9498 Optional fourth arg BUFFER non-nil means that the encoded text is
9499 inserted in that buffer after point (point does not move).  In this
9500 case, the return value is the length of the encoded text.
9501
9502 This function sets `last-coding-system-used' to the precise coding system
9503 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9504 not fully specified.)  */)
9505   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9506 {
9507   return code_convert_string (string, coding_system, buffer,
9508                               1, ! NILP (nocopy), 0);
9509 }
9510
9511 \f
9512 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9513        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9514 Return the corresponding character.  */)
9515   (Lisp_Object code)
9516 {
9517   Lisp_Object spec, attrs, val;
9518   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9519   EMACS_INT ch;
9520   int c;
9521
9522   CHECK_NATNUM (code);
9523   ch = XFASTINT (code);
9524   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9525   attrs = AREF (spec, 0);
9526
9527   if (ASCII_BYTE_P (ch)
9528       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9529     return code;
9530
9531   val = CODING_ATTR_CHARSET_LIST (attrs);
9532   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9533   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9534   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9535
9536   if (ch <= 0x7F)
9537     {
9538       c = ch;
9539       charset = charset_roman;
9540     }
9541   else if (ch >= 0xA0 && ch < 0xDF)
9542     {
9543       c = ch - 0x80;
9544       charset = charset_kana;
9545     }
9546   else
9547     {
9548       EMACS_INT c1 = ch >> 8;
9549       int c2 = ch & 0xFF;
9550
9551       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9552           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9553         error ("Invalid code: %"pI"d", ch);
9554       c = ch;
9555       SJIS_TO_JIS (c);
9556       charset = charset_kanji;
9557     }
9558   c = DECODE_CHAR (charset, c);
9559   if (c < 0)
9560     error ("Invalid code: %"pI"d", ch);
9561   return make_number (c);
9562 }
9563
9564
9565 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9566        doc: /* Encode a Japanese character CH to shift_jis encoding.
9567 Return the corresponding code in SJIS.  */)
9568   (Lisp_Object ch)
9569 {
9570   Lisp_Object spec, attrs, charset_list;
9571   int c;
9572   struct charset *charset;
9573   unsigned code;
9574
9575   CHECK_CHARACTER (ch);
9576   c = XFASTINT (ch);
9577   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9578   attrs = AREF (spec, 0);
9579
9580   if (ASCII_CHAR_P (c)
9581       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9582     return ch;
9583
9584   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9585   charset = char_charset (c, charset_list, &code);
9586   if (code == CHARSET_INVALID_CODE (charset))
9587     error ("Can't encode by shift_jis encoding: %c", c);
9588   JIS_TO_SJIS (code);
9589
9590   return make_number (code);
9591 }
9592
9593 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9594        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9595 Return the corresponding character.  */)
9596   (Lisp_Object code)
9597 {
9598   Lisp_Object spec, attrs, val;
9599   struct charset *charset_roman, *charset_big5, *charset;
9600   EMACS_INT ch;
9601   int c;
9602
9603   CHECK_NATNUM (code);
9604   ch = XFASTINT (code);
9605   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9606   attrs = AREF (spec, 0);
9607
9608   if (ASCII_BYTE_P (ch)
9609       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9610     return code;
9611
9612   val = CODING_ATTR_CHARSET_LIST (attrs);
9613   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9614   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9615
9616   if (ch <= 0x7F)
9617     {
9618       c = ch;
9619       charset = charset_roman;
9620     }
9621   else
9622     {
9623       EMACS_INT b1 = ch >> 8;
9624       int b2 = ch & 0x7F;
9625       if (b1 < 0xA1 || b1 > 0xFE
9626           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9627         error ("Invalid code: %"pI"d", ch);
9628       c = ch;
9629       charset = charset_big5;
9630     }
9631   c = DECODE_CHAR (charset, c);
9632   if (c < 0)
9633     error ("Invalid code: %"pI"d", ch);
9634   return make_number (c);
9635 }
9636
9637 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9638        doc: /* Encode the Big5 character CH to BIG5 coding system.
9639 Return the corresponding character code in Big5.  */)
9640   (Lisp_Object ch)
9641 {
9642   Lisp_Object spec, attrs, charset_list;
9643   struct charset *charset;
9644   int c;
9645   unsigned code;
9646
9647   CHECK_CHARACTER (ch);
9648   c = XFASTINT (ch);
9649   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9650   attrs = AREF (spec, 0);
9651   if (ASCII_CHAR_P (c)
9652       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9653     return ch;
9654
9655   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9656   charset = char_charset (c, charset_list, &code);
9657   if (code == CHARSET_INVALID_CODE (charset))
9658     error ("Can't encode by Big5 encoding: %c", c);
9659
9660   return make_number (code);
9661 }
9662
9663 \f
9664 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9665        Sset_terminal_coding_system_internal, 1, 2, 0,
9666        doc: /* Internal use only.  */)
9667   (Lisp_Object coding_system, Lisp_Object terminal)
9668 {
9669   struct terminal *term = get_terminal (terminal, 1);
9670   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9671   CHECK_SYMBOL (coding_system);
9672   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9673   /* We had better not send unsafe characters to terminal.  */
9674   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9675   /* Character composition should be disabled.  */
9676   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9677   terminal_coding->src_multibyte = 1;
9678   terminal_coding->dst_multibyte = 0;
9679   tset_charset_list
9680     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9681             ? coding_charset_list (terminal_coding)
9682             : list1 (make_number (charset_ascii))));
9683   return Qnil;
9684 }
9685
9686 DEFUN ("set-safe-terminal-coding-system-internal",
9687        Fset_safe_terminal_coding_system_internal,
9688        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9689        doc: /* Internal use only.  */)
9690   (Lisp_Object coding_system)
9691 {
9692   CHECK_SYMBOL (coding_system);
9693   setup_coding_system (Fcheck_coding_system (coding_system),
9694                        &safe_terminal_coding);
9695   /* Character composition should be disabled.  */
9696   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9697   safe_terminal_coding.src_multibyte = 1;
9698   safe_terminal_coding.dst_multibyte = 0;
9699   return Qnil;
9700 }
9701
9702 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9703        Sterminal_coding_system, 0, 1, 0,
9704        doc: /* Return coding system specified for terminal output on the given terminal.
9705 TERMINAL may be a terminal object, a frame, or nil for the selected
9706 frame's terminal device.  */)
9707   (Lisp_Object terminal)
9708 {
9709   struct coding_system *terminal_coding
9710     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9711   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9712
9713   /* For backward compatibility, return nil if it is `undecided'.  */
9714   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9715 }
9716
9717 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9718        Sset_keyboard_coding_system_internal, 1, 2, 0,
9719        doc: /* Internal use only.  */)
9720   (Lisp_Object coding_system, Lisp_Object terminal)
9721 {
9722   struct terminal *t = get_terminal (terminal, 1);
9723   CHECK_SYMBOL (coding_system);
9724   if (NILP (coding_system))
9725     coding_system = Qno_conversion;
9726   else
9727     Fcheck_coding_system (coding_system);
9728   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9729   /* Character composition should be disabled.  */
9730   TERMINAL_KEYBOARD_CODING (t)->common_flags
9731     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9732   return Qnil;
9733 }
9734
9735 DEFUN ("keyboard-coding-system",
9736        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9737        doc: /* Return coding system specified for decoding keyboard input.  */)
9738   (Lisp_Object terminal)
9739 {
9740   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9741                          (get_terminal (terminal, 1))->id);
9742 }
9743
9744 \f
9745 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9746        Sfind_operation_coding_system,  1, MANY, 0,
9747        doc: /* Choose a coding system for an operation based on the target name.
9748 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9749 DECODING-SYSTEM is the coding system to use for decoding
9750 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9751 for encoding (in case OPERATION does encoding).
9752
9753 The first argument OPERATION specifies an I/O primitive:
9754   For file I/O, `insert-file-contents' or `write-region'.
9755   For process I/O, `call-process', `call-process-region', or `start-process'.
9756   For network I/O, `open-network-stream'.
9757
9758 The remaining arguments should be the same arguments that were passed
9759 to the primitive.  Depending on which primitive, one of those arguments
9760 is selected as the TARGET.  For example, if OPERATION does file I/O,
9761 whichever argument specifies the file name is TARGET.
9762
9763 TARGET has a meaning which depends on OPERATION:
9764   For file I/O, TARGET is a file name (except for the special case below).
9765   For process I/O, TARGET is a process name.
9766   For network I/O, TARGET is a service name or a port number.
9767
9768 This function looks up what is specified for TARGET in
9769 `file-coding-system-alist', `process-coding-system-alist',
9770 or `network-coding-system-alist' depending on OPERATION.
9771 They may specify a coding system, a cons of coding systems,
9772 or a function symbol to call.
9773 In the last case, we call the function with one argument,
9774 which is a list of all the arguments given to this function.
9775 If the function can't decide a coding system, it can return
9776 `undecided' so that the normal code-detection is performed.
9777
9778 If OPERATION is `insert-file-contents', the argument corresponding to
9779 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9780 file name to look up, and BUFFER is a buffer that contains the file's
9781 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9782 function to call for FILENAME, that function should examine the
9783 contents of BUFFER instead of reading the file.
9784
9785 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9786   (ptrdiff_t nargs, Lisp_Object *args)
9787 {
9788   Lisp_Object operation, target_idx, target, val;
9789   register Lisp_Object chain;
9790
9791   if (nargs < 2)
9792     error ("Too few arguments");
9793   operation = args[0];
9794   if (!SYMBOLP (operation)
9795       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9796     error ("Invalid first argument");
9797   if (nargs <= 1 + XFASTINT (target_idx))
9798     error ("Too few arguments for operation `%s'",
9799            SDATA (SYMBOL_NAME (operation)));
9800   target = args[XFASTINT (target_idx) + 1];
9801   if (!(STRINGP (target)
9802         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9803             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9804         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9805     error ("Invalid argument %"pI"d of operation `%s'",
9806            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9807   if (CONSP (target))
9808     target = XCAR (target);
9809
9810   chain = ((EQ (operation, Qinsert_file_contents)
9811             || EQ (operation, Qwrite_region))
9812            ? Vfile_coding_system_alist
9813            : (EQ (operation, Qopen_network_stream)
9814               ? Vnetwork_coding_system_alist
9815               : Vprocess_coding_system_alist));
9816   if (NILP (chain))
9817     return Qnil;
9818
9819   for (; CONSP (chain); chain = XCDR (chain))
9820     {
9821       Lisp_Object elt;
9822
9823       elt = XCAR (chain);
9824       if (CONSP (elt)
9825           && ((STRINGP (target)
9826                && STRINGP (XCAR (elt))
9827                && fast_string_match (XCAR (elt), target) >= 0)
9828               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9829         {
9830           val = XCDR (elt);
9831           /* Here, if VAL is both a valid coding system and a valid
9832              function symbol, we return VAL as a coding system.  */
9833           if (CONSP (val))
9834             return val;
9835           if (! SYMBOLP (val))
9836             return Qnil;
9837           if (! NILP (Fcoding_system_p (val)))
9838             return Fcons (val, val);
9839           if (! NILP (Ffboundp (val)))
9840             {
9841               /* We use call1 rather than safe_call1
9842                  so as to get bug reports about functions called here
9843                  which don't handle the current interface.  */
9844               val = call1 (val, Flist (nargs, args));
9845               if (CONSP (val))
9846                 return val;
9847               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9848                 return Fcons (val, val);
9849             }
9850           return Qnil;
9851         }
9852     }
9853   return Qnil;
9854 }
9855
9856 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9857        Sset_coding_system_priority, 0, MANY, 0,
9858        doc: /* Assign higher priority to the coding systems given as arguments.
9859 If multiple coding systems belong to the same category,
9860 all but the first one are ignored.
9861
9862 usage: (set-coding-system-priority &rest coding-systems)  */)
9863   (ptrdiff_t nargs, Lisp_Object *args)
9864 {
9865   ptrdiff_t i, j;
9866   bool changed[coding_category_max];
9867   enum coding_category priorities[coding_category_max];
9868
9869   memset (changed, 0, sizeof changed);
9870
9871   for (i = j = 0; i < nargs; i++)
9872     {
9873       enum coding_category category;
9874       Lisp_Object spec, attrs;
9875
9876       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9877       attrs = AREF (spec, 0);
9878       category = XINT (CODING_ATTR_CATEGORY (attrs));
9879       if (changed[category])
9880         /* Ignore this coding system because a coding system of the
9881            same category already had a higher priority.  */
9882         continue;
9883       changed[category] = 1;
9884       priorities[j++] = category;
9885       if (coding_categories[category].id >= 0
9886           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9887         setup_coding_system (args[i], &coding_categories[category]);
9888       Fset (AREF (Vcoding_category_table, category), args[i]);
9889     }
9890
9891   /* Now we have decided top J priorities.  Reflect the order of the
9892      original priorities to the remaining priorities.  */
9893
9894   for (i = j, j = 0; i < coding_category_max; i++, j++)
9895     {
9896       while (j < coding_category_max
9897              && changed[coding_priorities[j]])
9898         j++;
9899       if (j == coding_category_max)
9900         emacs_abort ();
9901       priorities[i] = coding_priorities[j];
9902     }
9903
9904   memcpy (coding_priorities, priorities, sizeof priorities);
9905
9906   /* Update `coding-category-list'.  */
9907   Vcoding_category_list = Qnil;
9908   for (i = coding_category_max; i-- > 0; )
9909     Vcoding_category_list
9910       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9911                Vcoding_category_list);
9912
9913   return Qnil;
9914 }
9915
9916 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9917        Scoding_system_priority_list, 0, 1, 0,
9918        doc: /* Return a list of coding systems ordered by their priorities.
9919 The list contains a subset of coding systems; i.e. coding systems
9920 assigned to each coding category (see `coding-category-list').
9921
9922 HIGHESTP non-nil means just return the highest priority one.  */)
9923   (Lisp_Object highestp)
9924 {
9925   int i;
9926   Lisp_Object val;
9927
9928   for (i = 0, val = Qnil; i < coding_category_max; i++)
9929     {
9930       enum coding_category category = coding_priorities[i];
9931       int id = coding_categories[category].id;
9932       Lisp_Object attrs;
9933
9934       if (id < 0)
9935         continue;
9936       attrs = CODING_ID_ATTRS (id);
9937       if (! NILP (highestp))
9938         return CODING_ATTR_BASE_NAME (attrs);
9939       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9940     }
9941   return Fnreverse (val);
9942 }
9943
9944 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9945
9946 static Lisp_Object
9947 make_subsidiaries (Lisp_Object base)
9948 {
9949   Lisp_Object subsidiaries;
9950   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9951   char *buf = alloca (base_name_len + 6);
9952   int i;
9953
9954   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9955   subsidiaries = make_uninit_vector (3);
9956   for (i = 0; i < 3; i++)
9957     {
9958       strcpy (buf + base_name_len, suffixes[i]);
9959       ASET (subsidiaries, i, intern (buf));
9960     }
9961   return subsidiaries;
9962 }
9963
9964
9965 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9966        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9967        doc: /* For internal use only.
9968 usage: (define-coding-system-internal ...)  */)
9969   (ptrdiff_t nargs, Lisp_Object *args)
9970 {
9971   Lisp_Object name;
9972   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9973   Lisp_Object attrs;            /* Vector of attributes.  */
9974   Lisp_Object eol_type;
9975   Lisp_Object aliases;
9976   Lisp_Object coding_type, charset_list, safe_charsets;
9977   enum coding_category category;
9978   Lisp_Object tail, val;
9979   int max_charset_id = 0;
9980   int i;
9981
9982   if (nargs < coding_arg_max)
9983     goto short_args;
9984
9985   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9986
9987   name = args[coding_arg_name];
9988   CHECK_SYMBOL (name);
9989   ASET (attrs, coding_attr_base_name, name);
9990
9991   val = args[coding_arg_mnemonic];
9992   if (! STRINGP (val))
9993     CHECK_CHARACTER (val);
9994   ASET (attrs, coding_attr_mnemonic, val);
9995
9996   coding_type = args[coding_arg_coding_type];
9997   CHECK_SYMBOL (coding_type);
9998   ASET (attrs, coding_attr_type, coding_type);
9999
10000   charset_list = args[coding_arg_charset_list];
10001   if (SYMBOLP (charset_list))
10002     {
10003       if (EQ (charset_list, Qiso_2022))
10004         {
10005           if (! EQ (coding_type, Qiso_2022))
10006             error ("Invalid charset-list");
10007           charset_list = Viso_2022_charset_list;
10008         }
10009       else if (EQ (charset_list, Qemacs_mule))
10010         {
10011           if (! EQ (coding_type, Qemacs_mule))
10012             error ("Invalid charset-list");
10013           charset_list = Vemacs_mule_charset_list;
10014         }
10015       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10016         {
10017           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10018             error ("Invalid charset-list");
10019           if (max_charset_id < XFASTINT (XCAR (tail)))
10020             max_charset_id = XFASTINT (XCAR (tail));
10021         }
10022     }
10023   else
10024     {
10025       charset_list = Fcopy_sequence (charset_list);
10026       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10027         {
10028           struct charset *charset;
10029
10030           val = XCAR (tail);
10031           CHECK_CHARSET_GET_CHARSET (val, charset);
10032           if (EQ (coding_type, Qiso_2022)
10033               ? CHARSET_ISO_FINAL (charset) < 0
10034               : EQ (coding_type, Qemacs_mule)
10035               ? CHARSET_EMACS_MULE_ID (charset) < 0
10036               : 0)
10037             error ("Can't handle charset `%s'",
10038                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10039
10040           XSETCAR (tail, make_number (charset->id));
10041           if (max_charset_id < charset->id)
10042             max_charset_id = charset->id;
10043         }
10044     }
10045   ASET (attrs, coding_attr_charset_list, charset_list);
10046
10047   safe_charsets = make_uninit_string (max_charset_id + 1);
10048   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10049   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10050     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10051   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10052
10053   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10054
10055   val = args[coding_arg_decode_translation_table];
10056   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10057     CHECK_SYMBOL (val);
10058   ASET (attrs, coding_attr_decode_tbl, val);
10059
10060   val = args[coding_arg_encode_translation_table];
10061   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10062     CHECK_SYMBOL (val);
10063   ASET (attrs, coding_attr_encode_tbl, val);
10064
10065   val = args[coding_arg_post_read_conversion];
10066   CHECK_SYMBOL (val);
10067   ASET (attrs, coding_attr_post_read, val);
10068
10069   val = args[coding_arg_pre_write_conversion];
10070   CHECK_SYMBOL (val);
10071   ASET (attrs, coding_attr_pre_write, val);
10072
10073   val = args[coding_arg_default_char];
10074   if (NILP (val))
10075     ASET (attrs, coding_attr_default_char, make_number (' '));
10076   else
10077     {
10078       CHECK_CHARACTER (val);
10079       ASET (attrs, coding_attr_default_char, val);
10080     }
10081
10082   val = args[coding_arg_for_unibyte];
10083   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10084
10085   val = args[coding_arg_plist];
10086   CHECK_LIST (val);
10087   ASET (attrs, coding_attr_plist, val);
10088
10089   if (EQ (coding_type, Qcharset))
10090     {
10091       /* Generate a lisp vector of 256 elements.  Each element is nil,
10092          integer, or a list of charset IDs.
10093
10094          If Nth element is nil, the byte code N is invalid in this
10095          coding system.
10096
10097          If Nth element is a number NUM, N is the first byte of a
10098          charset whose ID is NUM.
10099
10100          If Nth element is a list of charset IDs, N is the first byte
10101          of one of them.  The list is sorted by dimensions of the
10102          charsets.  A charset of smaller dimension comes first. */
10103       val = Fmake_vector (make_number (256), Qnil);
10104
10105       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10106         {
10107           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10108           int dim = CHARSET_DIMENSION (charset);
10109           int idx = (dim - 1) * 4;
10110
10111           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10112             ASET (attrs, coding_attr_ascii_compat, Qt);
10113
10114           for (i = charset->code_space[idx];
10115                i <= charset->code_space[idx + 1]; i++)
10116             {
10117               Lisp_Object tmp, tmp2;
10118               int dim2;
10119
10120               tmp = AREF (val, i);
10121               if (NILP (tmp))
10122                 tmp = XCAR (tail);
10123               else if (NUMBERP (tmp))
10124                 {
10125                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10126                   if (dim < dim2)
10127                     tmp = list2 (XCAR (tail), tmp);
10128                   else
10129                     tmp = list2 (tmp, XCAR (tail));
10130                 }
10131               else
10132                 {
10133                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10134                     {
10135                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10136                       if (dim < dim2)
10137                         break;
10138                     }
10139                   if (NILP (tmp2))
10140                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10141                   else
10142                     {
10143                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10144                       XSETCAR (tmp2, XCAR (tail));
10145                     }
10146                 }
10147               ASET (val, i, tmp);
10148             }
10149         }
10150       ASET (attrs, coding_attr_charset_valids, val);
10151       category = coding_category_charset;
10152     }
10153   else if (EQ (coding_type, Qccl))
10154     {
10155       Lisp_Object valids;
10156
10157       if (nargs < coding_arg_ccl_max)
10158         goto short_args;
10159
10160       val = args[coding_arg_ccl_decoder];
10161       CHECK_CCL_PROGRAM (val);
10162       if (VECTORP (val))
10163         val = Fcopy_sequence (val);
10164       ASET (attrs, coding_attr_ccl_decoder, val);
10165
10166       val = args[coding_arg_ccl_encoder];
10167       CHECK_CCL_PROGRAM (val);
10168       if (VECTORP (val))
10169         val = Fcopy_sequence (val);
10170       ASET (attrs, coding_attr_ccl_encoder, val);
10171
10172       val = args[coding_arg_ccl_valids];
10173       valids = Fmake_string (make_number (256), make_number (0));
10174       for (tail = val; CONSP (tail); tail = XCDR (tail))
10175         {
10176           int from, to;
10177
10178           val = XCAR (tail);
10179           if (INTEGERP (val))
10180             {
10181               if (! (0 <= XINT (val) && XINT (val) <= 255))
10182                 args_out_of_range_3 (val, make_number (0), make_number (255));
10183               from = to = XINT (val);
10184             }
10185           else
10186             {
10187               CHECK_CONS (val);
10188               CHECK_NATNUM_CAR (val);
10189               CHECK_NUMBER_CDR (val);
10190               if (XINT (XCAR (val)) > 255)
10191                 args_out_of_range_3 (XCAR (val),
10192                                      make_number (0), make_number (255));
10193               from = XINT (XCAR (val));
10194               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10195                 args_out_of_range_3 (XCDR (val),
10196                                      XCAR (val), make_number (255));
10197               to = XINT (XCDR (val));
10198             }
10199           for (i = from; i <= to; i++)
10200             SSET (valids, i, 1);
10201         }
10202       ASET (attrs, coding_attr_ccl_valids, valids);
10203
10204       category = coding_category_ccl;
10205     }
10206   else if (EQ (coding_type, Qutf_16))
10207     {
10208       Lisp_Object bom, endian;
10209
10210       ASET (attrs, coding_attr_ascii_compat, Qnil);
10211
10212       if (nargs < coding_arg_utf16_max)
10213         goto short_args;
10214
10215       bom = args[coding_arg_utf16_bom];
10216       if (! NILP (bom) && ! EQ (bom, Qt))
10217         {
10218           CHECK_CONS (bom);
10219           val = XCAR (bom);
10220           CHECK_CODING_SYSTEM (val);
10221           val = XCDR (bom);
10222           CHECK_CODING_SYSTEM (val);
10223         }
10224       ASET (attrs, coding_attr_utf_bom, bom);
10225
10226       endian = args[coding_arg_utf16_endian];
10227       CHECK_SYMBOL (endian);
10228       if (NILP (endian))
10229         endian = Qbig;
10230       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10231         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10232       ASET (attrs, coding_attr_utf_16_endian, endian);
10233
10234       category = (CONSP (bom)
10235                   ? coding_category_utf_16_auto
10236                   : NILP (bom)
10237                   ? (EQ (endian, Qbig)
10238                      ? coding_category_utf_16_be_nosig
10239                      : coding_category_utf_16_le_nosig)
10240                   : (EQ (endian, Qbig)
10241                      ? coding_category_utf_16_be
10242                      : coding_category_utf_16_le));
10243     }
10244   else if (EQ (coding_type, Qiso_2022))
10245     {
10246       Lisp_Object initial, reg_usage, request, flags;
10247
10248       if (nargs < coding_arg_iso2022_max)
10249         goto short_args;
10250
10251       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10252       CHECK_VECTOR (initial);
10253       for (i = 0; i < 4; i++)
10254         {
10255           val = AREF (initial, i);
10256           if (! NILP (val))
10257             {
10258               struct charset *charset;
10259
10260               CHECK_CHARSET_GET_CHARSET (val, charset);
10261               ASET (initial, i, make_number (CHARSET_ID (charset)));
10262               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10263                 ASET (attrs, coding_attr_ascii_compat, Qt);
10264             }
10265           else
10266             ASET (initial, i, make_number (-1));
10267         }
10268
10269       reg_usage = args[coding_arg_iso2022_reg_usage];
10270       CHECK_CONS (reg_usage);
10271       CHECK_NUMBER_CAR (reg_usage);
10272       CHECK_NUMBER_CDR (reg_usage);
10273
10274       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10275       for (tail = request; CONSP (tail); tail = XCDR (tail))
10276         {
10277           int id;
10278           Lisp_Object tmp1;
10279
10280           val = XCAR (tail);
10281           CHECK_CONS (val);
10282           tmp1 = XCAR (val);
10283           CHECK_CHARSET_GET_ID (tmp1, id);
10284           CHECK_NATNUM_CDR (val);
10285           if (XINT (XCDR (val)) >= 4)
10286             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10287           XSETCAR (val, make_number (id));
10288         }
10289
10290       flags = args[coding_arg_iso2022_flags];
10291       CHECK_NATNUM (flags);
10292       i = XINT (flags) & INT_MAX;
10293       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10294         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10295       flags = make_number (i);
10296
10297       ASET (attrs, coding_attr_iso_initial, initial);
10298       ASET (attrs, coding_attr_iso_usage, reg_usage);
10299       ASET (attrs, coding_attr_iso_request, request);
10300       ASET (attrs, coding_attr_iso_flags, flags);
10301       setup_iso_safe_charsets (attrs);
10302
10303       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10304         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10305                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10306                     ? coding_category_iso_7_else
10307                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10308                     ? coding_category_iso_7
10309                     : coding_category_iso_7_tight);
10310       else
10311         {
10312           int id = XINT (AREF (initial, 1));
10313
10314           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10315                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10316                        || id < 0)
10317                       ? coding_category_iso_8_else
10318                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10319                       ? coding_category_iso_8_1
10320                       : coding_category_iso_8_2);
10321         }
10322       if (category != coding_category_iso_8_1
10323           && category != coding_category_iso_8_2)
10324         ASET (attrs, coding_attr_ascii_compat, Qnil);
10325     }
10326   else if (EQ (coding_type, Qemacs_mule))
10327     {
10328       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10329         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10330       ASET (attrs, coding_attr_ascii_compat, Qt);
10331       category = coding_category_emacs_mule;
10332     }
10333   else if (EQ (coding_type, Qshift_jis))
10334     {
10335
10336       struct charset *charset;
10337
10338       if (XINT (Flength (charset_list)) != 3
10339           && XINT (Flength (charset_list)) != 4)
10340         error ("There should be three or four charsets");
10341
10342       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10343       if (CHARSET_DIMENSION (charset) != 1)
10344         error ("Dimension of charset %s is not one",
10345                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10346       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10347         ASET (attrs, coding_attr_ascii_compat, Qt);
10348
10349       charset_list = XCDR (charset_list);
10350       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10351       if (CHARSET_DIMENSION (charset) != 1)
10352         error ("Dimension of charset %s is not one",
10353                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10354
10355       charset_list = XCDR (charset_list);
10356       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10357       if (CHARSET_DIMENSION (charset) != 2)
10358         error ("Dimension of charset %s is not two",
10359                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10360
10361       charset_list = XCDR (charset_list);
10362       if (! NILP (charset_list))
10363         {
10364           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10365           if (CHARSET_DIMENSION (charset) != 2)
10366             error ("Dimension of charset %s is not two",
10367                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10368         }
10369
10370       category = coding_category_sjis;
10371       Vsjis_coding_system = name;
10372     }
10373   else if (EQ (coding_type, Qbig5))
10374     {
10375       struct charset *charset;
10376
10377       if (XINT (Flength (charset_list)) != 2)
10378         error ("There should be just two charsets");
10379
10380       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10381       if (CHARSET_DIMENSION (charset) != 1)
10382         error ("Dimension of charset %s is not one",
10383                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10384       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10385         ASET (attrs, coding_attr_ascii_compat, Qt);
10386
10387       charset_list = XCDR (charset_list);
10388       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10389       if (CHARSET_DIMENSION (charset) != 2)
10390         error ("Dimension of charset %s is not two",
10391                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10392
10393       category = coding_category_big5;
10394       Vbig5_coding_system = name;
10395     }
10396   else if (EQ (coding_type, Qraw_text))
10397     {
10398       category = coding_category_raw_text;
10399       ASET (attrs, coding_attr_ascii_compat, Qt);
10400     }
10401   else if (EQ (coding_type, Qutf_8))
10402     {
10403       Lisp_Object bom;
10404
10405       if (nargs < coding_arg_utf8_max)
10406         goto short_args;
10407
10408       bom = args[coding_arg_utf8_bom];
10409       if (! NILP (bom) && ! EQ (bom, Qt))
10410         {
10411           CHECK_CONS (bom);
10412           val = XCAR (bom);
10413           CHECK_CODING_SYSTEM (val);
10414           val = XCDR (bom);
10415           CHECK_CODING_SYSTEM (val);
10416         }
10417       ASET (attrs, coding_attr_utf_bom, bom);
10418       if (NILP (bom))
10419         ASET (attrs, coding_attr_ascii_compat, Qt);
10420
10421       category = (CONSP (bom) ? coding_category_utf_8_auto
10422                   : NILP (bom) ? coding_category_utf_8_nosig
10423                   : coding_category_utf_8_sig);
10424     }
10425   else if (EQ (coding_type, Qundecided))
10426     {
10427       if (nargs < coding_arg_undecided_max)
10428         goto short_args;
10429       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10430             args[coding_arg_undecided_inhibit_null_byte_detection]);
10431       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10432             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10433       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10434             args[coding_arg_undecided_prefer_utf_8]);
10435       category = coding_category_undecided;
10436     }
10437   else
10438     error ("Invalid coding system type: %s",
10439            SDATA (SYMBOL_NAME (coding_type)));
10440
10441   ASET (attrs, coding_attr_category, make_number (category));
10442   ASET (attrs, coding_attr_plist,
10443         Fcons (QCcategory,
10444                Fcons (AREF (Vcoding_category_table, category),
10445                       CODING_ATTR_PLIST (attrs))));
10446   ASET (attrs, coding_attr_plist,
10447         Fcons (QCascii_compatible_p,
10448                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10449                       CODING_ATTR_PLIST (attrs))));
10450
10451   eol_type = args[coding_arg_eol_type];
10452   if (! NILP (eol_type)
10453       && ! EQ (eol_type, Qunix)
10454       && ! EQ (eol_type, Qdos)
10455       && ! EQ (eol_type, Qmac))
10456     error ("Invalid eol-type");
10457
10458   aliases = list1 (name);
10459
10460   if (NILP (eol_type))
10461     {
10462       eol_type = make_subsidiaries (name);
10463       for (i = 0; i < 3; i++)
10464         {
10465           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10466
10467           this_name = AREF (eol_type, i);
10468           this_aliases = list1 (this_name);
10469           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10470           this_spec = make_uninit_vector (3);
10471           ASET (this_spec, 0, attrs);
10472           ASET (this_spec, 1, this_aliases);
10473           ASET (this_spec, 2, this_eol_type);
10474           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10475           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10476           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10477           if (NILP (val))
10478             Vcoding_system_alist
10479               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10480                        Vcoding_system_alist);
10481         }
10482     }
10483
10484   spec_vec = make_uninit_vector (3);
10485   ASET (spec_vec, 0, attrs);
10486   ASET (spec_vec, 1, aliases);
10487   ASET (spec_vec, 2, eol_type);
10488
10489   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10490   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10491   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10492   if (NILP (val))
10493     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10494                                   Vcoding_system_alist);
10495
10496   {
10497     int id = coding_categories[category].id;
10498
10499     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10500       setup_coding_system (name, &coding_categories[category]);
10501   }
10502
10503   return Qnil;
10504
10505  short_args:
10506   return Fsignal (Qwrong_number_of_arguments,
10507                   Fcons (intern ("define-coding-system-internal"),
10508                          make_number (nargs)));
10509 }
10510
10511
10512 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10513        3, 3, 0,
10514        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10515   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10516 {
10517   Lisp_Object spec, attrs;
10518
10519   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10520   attrs = AREF (spec, 0);
10521   if (EQ (prop, QCmnemonic))
10522     {
10523       if (! STRINGP (val))
10524         CHECK_CHARACTER (val);
10525       ASET (attrs, coding_attr_mnemonic, val);
10526     }
10527   else if (EQ (prop, QCdefault_char))
10528     {
10529       if (NILP (val))
10530         val = make_number (' ');
10531       else
10532         CHECK_CHARACTER (val);
10533       ASET (attrs, coding_attr_default_char, val);
10534     }
10535   else if (EQ (prop, QCdecode_translation_table))
10536     {
10537       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10538         CHECK_SYMBOL (val);
10539       ASET (attrs, coding_attr_decode_tbl, val);
10540     }
10541   else if (EQ (prop, QCencode_translation_table))
10542     {
10543       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10544         CHECK_SYMBOL (val);
10545       ASET (attrs, coding_attr_encode_tbl, val);
10546     }
10547   else if (EQ (prop, QCpost_read_conversion))
10548     {
10549       CHECK_SYMBOL (val);
10550       ASET (attrs, coding_attr_post_read, val);
10551     }
10552   else if (EQ (prop, QCpre_write_conversion))
10553     {
10554       CHECK_SYMBOL (val);
10555       ASET (attrs, coding_attr_pre_write, val);
10556     }
10557   else if (EQ (prop, QCascii_compatible_p))
10558     {
10559       ASET (attrs, coding_attr_ascii_compat, val);
10560     }
10561
10562   ASET (attrs, coding_attr_plist,
10563         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10564   return val;
10565 }
10566
10567
10568 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10569        Sdefine_coding_system_alias, 2, 2, 0,
10570        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10571   (Lisp_Object alias, Lisp_Object coding_system)
10572 {
10573   Lisp_Object spec, aliases, eol_type, val;
10574
10575   CHECK_SYMBOL (alias);
10576   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10577   aliases = AREF (spec, 1);
10578   /* ALIASES should be a list of length more than zero, and the first
10579      element is a base coding system.  Append ALIAS at the tail of the
10580      list.  */
10581   while (!NILP (XCDR (aliases)))
10582     aliases = XCDR (aliases);
10583   XSETCDR (aliases, list1 (alias));
10584
10585   eol_type = AREF (spec, 2);
10586   if (VECTORP (eol_type))
10587     {
10588       Lisp_Object subsidiaries;
10589       int i;
10590
10591       subsidiaries = make_subsidiaries (alias);
10592       for (i = 0; i < 3; i++)
10593         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10594                                      AREF (eol_type, i));
10595     }
10596
10597   Fputhash (alias, spec, Vcoding_system_hash_table);
10598   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10599   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10600   if (NILP (val))
10601     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10602                                   Vcoding_system_alist);
10603
10604   return Qnil;
10605 }
10606
10607 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10608        1, 1, 0,
10609        doc: /* Return the base of CODING-SYSTEM.
10610 Any alias or subsidiary coding system is not a base coding system.  */)
10611   (Lisp_Object coding_system)
10612 {
10613   Lisp_Object spec, attrs;
10614
10615   if (NILP (coding_system))
10616     return (Qno_conversion);
10617   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10618   attrs = AREF (spec, 0);
10619   return CODING_ATTR_BASE_NAME (attrs);
10620 }
10621
10622 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10623        1, 1, 0,
10624        doc: "Return the property list of CODING-SYSTEM.")
10625   (Lisp_Object coding_system)
10626 {
10627   Lisp_Object spec, attrs;
10628
10629   if (NILP (coding_system))
10630     coding_system = Qno_conversion;
10631   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10632   attrs = AREF (spec, 0);
10633   return CODING_ATTR_PLIST (attrs);
10634 }
10635
10636
10637 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10638        1, 1, 0,
10639        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10640   (Lisp_Object coding_system)
10641 {
10642   Lisp_Object spec;
10643
10644   if (NILP (coding_system))
10645     coding_system = Qno_conversion;
10646   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10647   return AREF (spec, 1);
10648 }
10649
10650 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10651        Scoding_system_eol_type, 1, 1, 0,
10652        doc: /* Return eol-type of CODING-SYSTEM.
10653 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10654
10655 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10656 and CR respectively.
10657
10658 A vector value indicates that a format of end-of-line should be
10659 detected automatically.  Nth element of the vector is the subsidiary
10660 coding system whose eol-type is N.  */)
10661   (Lisp_Object coding_system)
10662 {
10663   Lisp_Object spec, eol_type;
10664   int n;
10665
10666   if (NILP (coding_system))
10667     coding_system = Qno_conversion;
10668   if (! CODING_SYSTEM_P (coding_system))
10669     return Qnil;
10670   spec = CODING_SYSTEM_SPEC (coding_system);
10671   eol_type = AREF (spec, 2);
10672   if (VECTORP (eol_type))
10673     return Fcopy_sequence (eol_type);
10674   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10675   return make_number (n);
10676 }
10677
10678 #endif /* emacs */
10679
10680 \f
/*** 12. Postamble ***/
10682
10683 void
10684 init_coding_once (void)
10685 {
10686   int i;
10687
10688   for (i = 0; i < coding_category_max; i++)
10689     {
10690       coding_categories[i].id = -1;
10691       coding_priorities[i] = i;
10692     }
10693
10694   /* ISO2022 specific initialize routine.  */
10695   for (i = 0; i < 0x20; i++)
10696     iso_code_class[i] = ISO_control_0;
10697   for (i = 0x21; i < 0x7F; i++)
10698     iso_code_class[i] = ISO_graphic_plane_0;
10699   for (i = 0x80; i < 0xA0; i++)
10700     iso_code_class[i] = ISO_control_1;
10701   for (i = 0xA1; i < 0xFF; i++)
10702     iso_code_class[i] = ISO_graphic_plane_1;
10703   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10704   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10705   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10706   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10707   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10708   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10709   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10710   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10711   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10712
10713   for (i = 0; i < 256; i++)
10714     {
10715       emacs_mule_bytes[i] = 1;
10716     }
10717   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10718   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10719   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10720   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10721 }
10722
10723 #ifdef emacs
10724
10725 void
10726 syms_of_coding (void)
10727 {
10728   staticpro (&Vcoding_system_hash_table);
10729   {
10730     Lisp_Object args[2];
10731     args[0] = QCtest;
10732     args[1] = Qeq;
10733     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10734   }
10735
10736   staticpro (&Vsjis_coding_system);
10737   Vsjis_coding_system = Qnil;
10738
10739   staticpro (&Vbig5_coding_system);
10740   Vbig5_coding_system = Qnil;
10741
10742   staticpro (&Vcode_conversion_reused_workbuf);
10743   Vcode_conversion_reused_workbuf = Qnil;
10744
10745   staticpro (&Vcode_conversion_workbuf_name);
10746   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10747
10748   reused_workbuf_in_use = 0;
10749
10750   DEFSYM (Qcharset, "charset");
10751   DEFSYM (Qtarget_idx, "target-idx");
10752   DEFSYM (Qcoding_system_history, "coding-system-history");
10753   Fset (Qcoding_system_history, Qnil);
10754
10755   /* Target FILENAME is the first argument.  */
10756   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10757   /* Target FILENAME is the third argument.  */
10758   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10759
10760   DEFSYM (Qcall_process, "call-process");
10761   /* Target PROGRAM is the first argument.  */
10762   Fput (Qcall_process, Qtarget_idx, make_number (0));
10763
10764   DEFSYM (Qcall_process_region, "call-process-region");
10765   /* Target PROGRAM is the third argument.  */
10766   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10767
10768   DEFSYM (Qstart_process, "start-process");
10769   /* Target PROGRAM is the third argument.  */
10770   Fput (Qstart_process, Qtarget_idx, make_number (2));
10771
10772   DEFSYM (Qopen_network_stream, "open-network-stream");
10773   /* Target SERVICE is the fourth argument.  */
10774   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10775
10776   DEFSYM (Qcoding_system, "coding-system");
10777   DEFSYM (Qcoding_aliases, "coding-aliases");
10778
10779   DEFSYM (Qeol_type, "eol-type");
10780   DEFSYM (Qunix, "unix");
10781   DEFSYM (Qdos, "dos");
10782   DEFSYM (Qmac, "mac");
10783
10784   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10785   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10786   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10787   DEFSYM (Qdefault_char, "default-char");
10788   DEFSYM (Qundecided, "undecided");
10789   DEFSYM (Qno_conversion, "no-conversion");
10790   DEFSYM (Qraw_text, "raw-text");
10791
10792   DEFSYM (Qiso_2022, "iso-2022");
10793
10794   DEFSYM (Qutf_8, "utf-8");
10795   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10796
10797 #if defined (WINDOWSNT) || defined (CYGWIN)
10798   /* No, not utf-16-le: that one has a BOM.  */
10799   DEFSYM (Qutf_16le, "utf-16le");
10800 #endif
10801
10802   DEFSYM (Qutf_16, "utf-16");
10803   DEFSYM (Qbig, "big");
10804   DEFSYM (Qlittle, "little");
10805
10806   DEFSYM (Qshift_jis, "shift-jis");
10807   DEFSYM (Qbig5, "big5");
10808
10809   DEFSYM (Qcoding_system_p, "coding-system-p");
10810
10811   DEFSYM (Qcoding_system_error, "coding-system-error");
10812   Fput (Qcoding_system_error, Qerror_conditions,
10813         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10814   Fput (Qcoding_system_error, Qerror_message,
10815         build_pure_c_string ("Invalid coding system"));
10816
10817   /* Intern this now in case it isn't already done.
10818      Setting this variable twice is harmless.
10819      But don't staticpro it here--that is done in alloc.c.  */
10820   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10821
10822   DEFSYM (Qtranslation_table, "translation-table");
10823   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10824   DEFSYM (Qtranslation_table_id, "translation-table-id");
10825   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10826   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10827
10828   DEFSYM (Qvalid_codes, "valid-codes");
10829
10830   DEFSYM (Qemacs_mule, "emacs-mule");
10831
10832   DEFSYM (QCcategory, ":category");
10833   DEFSYM (QCmnemonic, ":mnemonic");
10834   DEFSYM (QCdefault_char, ":default-char");
10835   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10836   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10837   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10838   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10839   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10840
10841   Vcoding_category_table
10842     = Fmake_vector (make_number (coding_category_max), Qnil);
10843   staticpro (&Vcoding_category_table);
10844   /* Followings are target of code detection.  */
10845   ASET (Vcoding_category_table, coding_category_iso_7,
10846         intern_c_string ("coding-category-iso-7"));
10847   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10848         intern_c_string ("coding-category-iso-7-tight"));
10849   ASET (Vcoding_category_table, coding_category_iso_8_1,
10850         intern_c_string ("coding-category-iso-8-1"));
10851   ASET (Vcoding_category_table, coding_category_iso_8_2,
10852         intern_c_string ("coding-category-iso-8-2"));
10853   ASET (Vcoding_category_table, coding_category_iso_7_else,
10854         intern_c_string ("coding-category-iso-7-else"));
10855   ASET (Vcoding_category_table, coding_category_iso_8_else,
10856         intern_c_string ("coding-category-iso-8-else"));
10857   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10858         intern_c_string ("coding-category-utf-8-auto"));
10859   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10860         intern_c_string ("coding-category-utf-8"));
10861   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10862         intern_c_string ("coding-category-utf-8-sig"));
10863   ASET (Vcoding_category_table, coding_category_utf_16_be,
10864         intern_c_string ("coding-category-utf-16-be"));
10865   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10866         intern_c_string ("coding-category-utf-16-auto"));
10867   ASET (Vcoding_category_table, coding_category_utf_16_le,
10868         intern_c_string ("coding-category-utf-16-le"));
10869   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10870         intern_c_string ("coding-category-utf-16-be-nosig"));
10871   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10872         intern_c_string ("coding-category-utf-16-le-nosig"));
10873   ASET (Vcoding_category_table, coding_category_charset,
10874         intern_c_string ("coding-category-charset"));
10875   ASET (Vcoding_category_table, coding_category_sjis,
10876         intern_c_string ("coding-category-sjis"));
10877   ASET (Vcoding_category_table, coding_category_big5,
10878         intern_c_string ("coding-category-big5"));
10879   ASET (Vcoding_category_table, coding_category_ccl,
10880         intern_c_string ("coding-category-ccl"));
10881   ASET (Vcoding_category_table, coding_category_emacs_mule,
10882         intern_c_string ("coding-category-emacs-mule"));
10883   /* Followings are NOT target of code detection.  */
10884   ASET (Vcoding_category_table, coding_category_raw_text,
10885         intern_c_string ("coding-category-raw-text"));
10886   ASET (Vcoding_category_table, coding_category_undecided,
10887         intern_c_string ("coding-category-undecided"));
10888
10889   DEFSYM (Qinsufficient_source, "insufficient-source");
10890   DEFSYM (Qinvalid_source, "invalid-source");
10891   DEFSYM (Qinterrupted, "interrupted");
10892   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10893
10894   defsubr (&Scoding_system_p);
10895   defsubr (&Sread_coding_system);
10896   defsubr (&Sread_non_nil_coding_system);
10897   defsubr (&Scheck_coding_system);
10898   defsubr (&Sdetect_coding_region);
10899   defsubr (&Sdetect_coding_string);
10900   defsubr (&Sfind_coding_systems_region_internal);
10901   defsubr (&Sunencodable_char_position);
10902   defsubr (&Scheck_coding_systems_region);
10903   defsubr (&Sdecode_coding_region);
10904   defsubr (&Sencode_coding_region);
10905   defsubr (&Sdecode_coding_string);
10906   defsubr (&Sencode_coding_string);
10907   defsubr (&Sdecode_sjis_char);
10908   defsubr (&Sencode_sjis_char);
10909   defsubr (&Sdecode_big5_char);
10910   defsubr (&Sencode_big5_char);
10911   defsubr (&Sset_terminal_coding_system_internal);
10912   defsubr (&Sset_safe_terminal_coding_system_internal);
10913   defsubr (&Sterminal_coding_system);
10914   defsubr (&Sset_keyboard_coding_system_internal);
10915   defsubr (&Skeyboard_coding_system);
10916   defsubr (&Sfind_operation_coding_system);
10917   defsubr (&Sset_coding_system_priority);
10918   defsubr (&Sdefine_coding_system_internal);
10919   defsubr (&Sdefine_coding_system_alias);
10920   defsubr (&Scoding_system_put);
10921   defsubr (&Scoding_system_base);
10922   defsubr (&Scoding_system_plist);
10923   defsubr (&Scoding_system_aliases);
10924   defsubr (&Scoding_system_eol_type);
10925   defsubr (&Scoding_system_priority_list);
10926
10927   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10928                doc: /* List of coding systems.
10929
10930 Do not alter the value of this variable manually.  This variable should be
10931 updated by the functions `define-coding-system' and
10932 `define-coding-system-alias'.  */);
10933   Vcoding_system_list = Qnil;
10934
10935   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10936                doc: /* Alist of coding system names.
10937 Each element is one element list of coding system name.
10938 This variable is given to `completing-read' as COLLECTION argument.
10939
10940 Do not alter the value of this variable manually.  This variable should be
10941 updated by the functions `make-coding-system' and
10942 `define-coding-system-alias'.  */);
10943   Vcoding_system_alist = Qnil;
10944
10945   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10946                doc: /* List of coding-categories (symbols) ordered by priority.
10947
10948 On detecting a coding system, Emacs tries code detection algorithms
10949 associated with each coding-category one by one in this order.  When
10950 one algorithm agrees with a byte sequence of source text, the coding
10951 system bound to the corresponding coding-category is selected.
10952
10953 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10954   {
10955     int i;
10956
10957     Vcoding_category_list = Qnil;
10958     for (i = coding_category_max - 1; i >= 0; i--)
10959       Vcoding_category_list
10960         = Fcons (AREF (Vcoding_category_table, i),
10961                  Vcoding_category_list);
10962   }
10963
10964   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10965                doc: /* Specify the coding system for read operations.
10966 It is useful to bind this variable with `let', but do not set it globally.
10967 If the value is a coding system, it is used for decoding on read operation.
10968 If not, an appropriate element is used from one of the coding system alists.
10969 There are three such tables: `file-coding-system-alist',
10970 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10971   Vcoding_system_for_read = Qnil;
10972
10973   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10974                doc: /* Specify the coding system for write operations.
10975 Programs bind this variable with `let', but you should not set it globally.
10976 If the value is a coding system, it is used for encoding of output,
10977 when writing it to a file and when sending it to a file or subprocess.
10978
10979 If this does not specify a coding system, an appropriate element
10980 is used from one of the coding system alists.
10981 There are three such tables: `file-coding-system-alist',
10982 `process-coding-system-alist', and `network-coding-system-alist'.
10983 For output to files, if the above procedure does not specify a coding system,
10984 the value of `buffer-file-coding-system' is used.  */);
10985   Vcoding_system_for_write = Qnil;
10986
10987   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10988                doc: /*
10989 Coding system used in the latest file or process I/O.  */);
10990   Vlast_coding_system_used = Qnil;
10991
10992   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10993                doc: /*
10994 Error status of the last code conversion.
10995
10996 When an error was detected in the last code conversion, this variable
10997 is set to one of the following symbols.
10998   `insufficient-source'
10999   `inconsistent-eol'
11000   `invalid-source'
11001   `interrupted'
11002   `insufficient-memory'
11003 When no error was detected, the value doesn't change.  So, to check
11004 the error status of a code conversion by this variable, you must
11005 explicitly set this variable to nil before performing code
11006 conversion.  */);
11007   Vlast_code_conversion_error = Qnil;
11008
11009   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11010                doc: /*
11011 *Non-nil means always inhibit code conversion of end-of-line format.
11012 See info node `Coding Systems' and info node `Text and Binary' concerning
11013 such conversion.  */);
11014   inhibit_eol_conversion = 0;
11015
11016   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11017                doc: /*
11018 Non-nil means process buffer inherits coding system of process output.
11019 Bind it to t if the process output is to be treated as if it were a file
11020 read from some filesystem.  */);
11021   inherit_process_coding_system = 0;
11022
11023   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11024                doc: /*
11025 Alist to decide a coding system to use for a file I/O operation.
11026 The format is ((PATTERN . VAL) ...),
11027 where PATTERN is a regular expression matching a file name,
11028 VAL is a coding system, a cons of coding systems, or a function symbol.
11029 If VAL is a coding system, it is used for both decoding and encoding
11030 the file contents.
11031 If VAL is a cons of coding systems, the car part is used for decoding,
11032 and the cdr part is used for encoding.
11033 If VAL is a function symbol, the function must return a coding system
11034 or a cons of coding systems which are used as above.  The function is
11035 called with an argument that is a list of the arguments with which
11036 `find-operation-coding-system' was called.  If the function can't decide
11037 a coding system, it can return `undecided' so that the normal
11038 code-detection is performed.
11039
11040 See also the function `find-operation-coding-system'
11041 and the variable `auto-coding-alist'.  */);
11042   Vfile_coding_system_alist = Qnil;
11043
11044   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11045                doc: /*
11046 Alist to decide a coding system to use for a process I/O operation.
11047 The format is ((PATTERN . VAL) ...),
11048 where PATTERN is a regular expression matching a program name,
11049 VAL is a coding system, a cons of coding systems, or a function symbol.
11050 If VAL is a coding system, it is used for both decoding what received
11051 from the program and encoding what sent to the program.
11052 If VAL is a cons of coding systems, the car part is used for decoding,
11053 and the cdr part is used for encoding.
11054 If VAL is a function symbol, the function must return a coding system
11055 or a cons of coding systems which are used as above.
11056
11057 See also the function `find-operation-coding-system'.  */);
11058   Vprocess_coding_system_alist = Qnil;
11059
11060   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11061                doc: /*
11062 Alist to decide a coding system to use for a network I/O operation.
11063 The format is ((PATTERN . VAL) ...),
11064 where PATTERN is a regular expression matching a network service name
11065 or is a port number to connect to,
11066 VAL is a coding system, a cons of coding systems, or a function symbol.
11067 If VAL is a coding system, it is used for both decoding what received
11068 from the network stream and encoding what sent to the network stream.
11069 If VAL is a cons of coding systems, the car part is used for decoding,
11070 and the cdr part is used for encoding.
11071 If VAL is a function symbol, the function must return a coding system
11072 or a cons of coding systems which are used as above.
11073
11074 See also the function `find-operation-coding-system'.  */);
11075   Vnetwork_coding_system_alist = Qnil;
11076
11077   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11078                doc: /* Coding system to use with system messages.
11079 Also used for decoding keyboard input on X Window system.  */);
11080   Vlocale_coding_system = Qnil;
11081
11082   /* The eol mnemonics are reset in startup.el system-dependently.  */
11083   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11084                doc: /*
11085 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11086   eol_mnemonic_unix = build_pure_c_string (":");
11087
11088   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11089                doc: /*
11090 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11091   eol_mnemonic_dos = build_pure_c_string ("\\");
11092
11093   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11094                doc: /*
11095 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11096   eol_mnemonic_mac = build_pure_c_string ("/");
11097
11098   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11099                doc: /*
11100 *String displayed in mode line when end-of-line format is not yet determined.  */);
11101   eol_mnemonic_undecided = build_pure_c_string (":");
11102
11103   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11104                doc: /*
11105 *Non-nil enables character translation while encoding and decoding.  */);
11106   Venable_character_translation = Qt;
11107
11108   DEFVAR_LISP ("standard-translation-table-for-decode",
11109                Vstandard_translation_table_for_decode,
11110                doc: /* Table for translating characters while decoding.  */);
11111   Vstandard_translation_table_for_decode = Qnil;
11112
11113   DEFVAR_LISP ("standard-translation-table-for-encode",
11114                Vstandard_translation_table_for_encode,
11115                doc: /* Table for translating characters while encoding.  */);
11116   Vstandard_translation_table_for_encode = Qnil;
11117
11118   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11119                doc: /* Alist of charsets vs revision numbers.
11120 While encoding, if a charset (car part of an element) is found,
11121 designate it with the escape sequence identifying revision (cdr part
11122 of the element).  */);
11123   Vcharset_revision_table = Qnil;
11124
11125   DEFVAR_LISP ("default-process-coding-system",
11126                Vdefault_process_coding_system,
11127                doc: /* Cons of coding systems used for process I/O by default.
11128 The car part is used for decoding a process output,
11129 the cdr part is used for encoding a text to be sent to a process.  */);
11130   Vdefault_process_coding_system = Qnil;
11131
11132   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11133                doc: /*
11134 Table of extra Latin codes in the range 128..159 (inclusive).
11135 This is a vector of length 256.
11136 If Nth element is non-nil, the existence of code N in a file
11137 \(or output of subprocess) doesn't prevent it to be detected as
11138 a coding system of ISO 2022 variant which has a flag
11139 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11140 or reading output of a subprocess.
11141 Only 128th through 159th elements have a meaning.  */);
11142   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11143
11144   DEFVAR_LISP ("select-safe-coding-system-function",
11145                Vselect_safe_coding_system_function,
11146                doc: /*
11147 Function to call to select safe coding system for encoding a text.
11148
11149 If set, this function is called to force a user to select a proper
11150 coding system which can encode the text in the case that a default
11151 coding system used in each operation can't encode the text.  The
11152 function should take care that the buffer is not modified while
11153 the coding system is being selected.
11154
11155 The default value is `select-safe-coding-system' (which see).  */);
11156   Vselect_safe_coding_system_function = Qnil;
11157
11158   DEFVAR_BOOL ("coding-system-require-warning",
11159                coding_system_require_warning,
11160                doc: /* Internal use only.
11161 If non-nil, on writing a file, `select-safe-coding-system-function' is
11162 called even if `coding-system-for-write' is non-nil.  The command
11163 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11164   coding_system_require_warning = 0;
11165
11166
11167   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11168                inhibit_iso_escape_detection,
11169                doc: /*
11170 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11171
11172 When Emacs reads text, it tries to detect how the text is encoded.
11173 This code detection is sensitive to escape sequences.  If Emacs sees
11174 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11175 of the ISO2022 encodings, and decodes text by the corresponding coding
11176 system (e.g. `iso-2022-7bit').
11177
11178 However, there may be a case that you want to read escape sequences in
11179 a file as is.  In such a case, you can set this variable to non-nil.
11180 Then the code detection will ignore any escape sequences, and no text is
11181 detected as encoded in some ISO-2022 encoding.  The result is that all
11182 escape sequences become visible in a buffer.
11183
11184 The default value is nil, and it is strongly recommended not to change
11185 it.  That is because many Emacs Lisp source files that contain
11186 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11187 in Emacs's distribution, and they won't be decoded correctly on
11188 reading if you suppress escape sequence detection.
11189
11190 The other way to read escape sequences in a file without decoding is
11191 to explicitly specify some coding system that doesn't use ISO-2022
11192 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11193   inhibit_iso_escape_detection = 0;
11194
11195   DEFVAR_BOOL ("inhibit-null-byte-detection",
11196                inhibit_null_byte_detection,
11197                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11198 By default, Emacs treats it as binary data, and does not attempt to
11199 decode it.  The effect is as if you specified `no-conversion' for
11200 reading that text.
11201
11202 Set this to non-nil when a regular text happens to include null bytes.
11203 Examples are Index nodes of Info files and null-byte delimited output
11204 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11205 decode text as usual.  */);
11206   inhibit_null_byte_detection = 0;
11207
11208   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11209                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11210 Internal use only.  Removed after the experimental optimizer gets stable. */);
11211   disable_ascii_optimization = 0;
11212
11213   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11214                doc: /* Char table for translating self-inserting characters.
11215 This is applied to the result of input methods, not their input.
11216 See also `keyboard-translate-table'.
11217
11218 Use of this variable for character code unification was rendered
11219 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11220 internal character representation.  */);
11221     Vtranslation_table_for_input = Qnil;
11222
11223   {
11224     Lisp_Object args[coding_arg_undecided_max];
11225     Lisp_Object plist[16];
11226     int i;
11227
11228     for (i = 0; i < coding_arg_undecided_max; i++)
11229       args[i] = Qnil;
11230
11231     plist[0] = intern_c_string (":name");
11232     plist[1] = args[coding_arg_name] = Qno_conversion;
11233     plist[2] = intern_c_string (":mnemonic");
11234     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11235     plist[4] = intern_c_string (":coding-type");
11236     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11237     plist[6] = intern_c_string (":ascii-compatible-p");
11238     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11239     plist[8] = intern_c_string (":default-char");
11240     plist[9] = args[coding_arg_default_char] = make_number (0);
11241     plist[10] = intern_c_string (":for-unibyte");
11242     plist[11] = args[coding_arg_for_unibyte] = Qt;
11243     plist[12] = intern_c_string (":docstring");
11244     plist[13] = build_pure_c_string ("Do no conversion.\n\
11245 \n\
11246 When you visit a file with this coding, the file is read into a\n\
11247 unibyte buffer as is, thus each byte of a file is treated as a\n\
11248 character.");
11249     plist[14] = intern_c_string (":eol-type");
11250     plist[15] = args[coding_arg_eol_type] = Qunix;
11251     args[coding_arg_plist] = Flist (16, plist);
11252     Fdefine_coding_system_internal (coding_arg_max, args);
11253
11254     plist[1] = args[coding_arg_name] = Qundecided;
11255     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11256     plist[5] = args[coding_arg_coding_type] = Qundecided;
11257     /* This is already set.
11258        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11259     plist[8] = intern_c_string (":charset-list");
11260     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11261     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11262     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11263     plist[15] = args[coding_arg_eol_type] = Qnil;
11264     args[coding_arg_plist] = Flist (16, plist);
11265     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11266     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11267     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11268   }
11269
11270   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11271
11272   {
11273     int i;
11274
11275     for (i = 0; i < coding_category_max; i++)
11276       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11277   }
11278 #if defined (DOS_NT)
11279   system_eol_type = Qdos;
11280 #else
11281   system_eol_type = Qunix;
11282 #endif
11283   staticpro (&system_eol_type);
11284 }
11285
11286 char *
11287 emacs_strerror (int error_number)
11288 {
11289   char *str;
11290
11291   synchronize_system_messages_locale ();
11292   str = strerror (error_number);
11293
11294   if (! NILP (Vlocale_coding_system))
11295     {
11296       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11297                                                       Vlocale_coding_system,
11298                                                       0);
11299       str = SSDATA (dec);
11300     }
11301
11302   return str;
11303 }
11304
11305 #endif /* emacs */