src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 497
 498 /* A character to be produced on output if encoding of the original
 499    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 501
 502 /* UTF-8 section */
 503 #define CODING_UTF_8_BOM(coding)        \
 504   ((coding)->spec.utf_8_bom)
 505
 506 /* UTF-16 section */
 507 #define CODING_UTF_16_BOM(coding)       \
 508   ((coding)->spec.utf_16.bom)
 509
 510 #define CODING_UTF_16_ENDIAN(coding)    \
 511   ((coding)->spec.utf_16.endian)
 512
 513 #define CODING_UTF_16_SURROGATE(coding) \
 514   ((coding)->spec.utf_16.surrogate)
 515
 516
 517 /* CCL section */
 518 #define CODING_CCL_DECODER(coding)      \
 519   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 520 #define CODING_CCL_ENCODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 522 #define CODING_CCL_VALIDS(coding)                                          \
 523   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 524
 525 /* Index for each coding category in `coding_categories' */
 526
 527 enum coding_category
 528   {
 529     coding_category_iso_7,
 530     coding_category_iso_7_tight,
 531     coding_category_iso_8_1,
 532     coding_category_iso_8_2,
 533     coding_category_iso_7_else,
 534     coding_category_iso_8_else,
 535     coding_category_utf_8_auto,
 536     coding_category_utf_8_nosig,
 537     coding_category_utf_8_sig,
 538     coding_category_utf_16_auto,
 539     coding_category_utf_16_be,
 540     coding_category_utf_16_le,
 541     coding_category_utf_16_be_nosig,
 542     coding_category_utf_16_le_nosig,
 543     coding_category_charset,
 544     coding_category_sjis,
 545     coding_category_big5,
 546     coding_category_ccl,
 547     coding_category_emacs_mule,
 548     /* All above are targets of code detection.  */
 549     coding_category_raw_text,
 550     coding_category_undecided,
 551     coding_category_max
 552   };
 553
 554 /* Definitions of flag bits used in detect_coding_XXXX.  */
 555 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 556 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 557 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 558 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 559 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 560 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 561 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 562 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 563 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 564 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 565 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 566 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 567 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 568 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 569 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 570 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 571 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 572 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 573 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 574 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 575
 576 /* This value is returned if detect_coding_mask () find nothing other
 577    than ASCII characters.  */
 578 #define CATEGORY_MASK_ANY               \
 579   (CATEGORY_MASK_ISO_7                  \
 580    | CATEGORY_MASK_ISO_7_TIGHT          \
 581    | CATEGORY_MASK_ISO_8_1              \
 582    | CATEGORY_MASK_ISO_8_2              \
 583    | CATEGORY_MASK_ISO_7_ELSE           \
 584    | CATEGORY_MASK_ISO_8_ELSE           \
 585    | CATEGORY_MASK_UTF_8_AUTO           \
 586    | CATEGORY_MASK_UTF_8_NOSIG          \
 587    | CATEGORY_MASK_UTF_8_SIG            \
 588    | CATEGORY_MASK_UTF_16_AUTO          \
 589    | CATEGORY_MASK_UTF_16_BE            \
 590    | CATEGORY_MASK_UTF_16_LE            \
 591    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 592    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 593    | CATEGORY_MASK_CHARSET              \
 594    | CATEGORY_MASK_SJIS                 \
 595    | CATEGORY_MASK_BIG5                 \
 596    | CATEGORY_MASK_CCL                  \
 597    | CATEGORY_MASK_EMACS_MULE)
 598
 599
 600 #define CATEGORY_MASK_ISO_7BIT \
 601   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 602
 603 #define CATEGORY_MASK_ISO_8BIT \
 604   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 605
 606 #define CATEGORY_MASK_ISO_ELSE \
 607   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 608
 609 #define CATEGORY_MASK_ISO_ESCAPE        \
 610   (CATEGORY_MASK_ISO_7                  \
 611    | CATEGORY_MASK_ISO_7_TIGHT          \
 612    | CATEGORY_MASK_ISO_7_ELSE           \
 613    | CATEGORY_MASK_ISO_8_ELSE)
 614
 615 #define CATEGORY_MASK_ISO       \
 616   (  CATEGORY_MASK_ISO_7BIT     \
 617      | CATEGORY_MASK_ISO_8BIT   \
 618      | CATEGORY_MASK_ISO_ELSE)
 619
 620 #define CATEGORY_MASK_UTF_16            \
 621   (CATEGORY_MASK_UTF_16_AUTO            \
 622    | CATEGORY_MASK_UTF_16_BE            \
 623    | CATEGORY_MASK_UTF_16_LE            \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 625    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 626
 627 #define CATEGORY_MASK_UTF_8     \
 628   (CATEGORY_MASK_UTF_8_AUTO     \
 629    | CATEGORY_MASK_UTF_8_NOSIG  \
 630    | CATEGORY_MASK_UTF_8_SIG)
 631
 632 /* Table of coding categories (Lisp symbols).  This variable is for
 633    internal use only.  */
 634 static Lisp_Object Vcoding_category_table;
 635
 636 /* Table of coding-categories ordered by priority.  */
 637 static enum coding_category coding_priorities[coding_category_max];
 638
 639 /* Nth element is a coding context for the coding system bound to the
 640    Nth coding category.  */
 641 static struct coding_system coding_categories[coding_category_max];
 642
 643 /*** Commonly used macros and functions ***/
 644
 645 #ifndef min
 646 #define min(a, b) ((a) < (b) ? (a) : (b))
 647 #endif
 648 #ifndef max
 649 #define max(a, b) ((a) > (b) ? (a) : (b))
 650 #endif
 651
 652 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 653
 654 static int
 655 encode_inhibit_flag (Lisp_Object flag)
 656 {
 657   return NILP (flag) ? -1 : EQ (flag, Qt);
 658 }
 659
 660 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 661    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 662
 663 static bool
 664 inhibit_flag (int encoded_flag, bool var)
 665 {
 666   return 0 < encoded_flag + var;
 667 }
 668
 669 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 670   do {                                                  \
 671     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 672     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 673   } while (0)
 674
 675 static void
 676 CHECK_NATNUM_CAR (Lisp_Object x)
 677 {
 678   Lisp_Object tmp = XCAR (x);
 679   CHECK_NATNUM (tmp);
 680   XSETCAR (x, tmp);
 681 }
 682
 683 static void
 684 CHECK_NATNUM_CDR (Lisp_Object x)
 685 {
 686   Lisp_Object tmp = XCDR (x);
 687   CHECK_NATNUM (tmp);
 688   XSETCDR (x, tmp);
 689 }
 690
 691
 692 /* Safely get one byte from the source text pointed by SRC which ends
 693    at SRC_END, and set C to that byte.  If there are not enough bytes
 694    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 695    and a multibyte character is found at SRC, set C to the
 696    negative value of the character code.  The caller should declare
 697    and set these variables appropriately in advance:
 698         src, src_end, multibytep */
 699
 700 #define ONE_MORE_BYTE(c)                                \
 701   do {                                                  \
 702     if (src == src_end)                                 \
 703       {                                                 \
 704         if (src_base < src)                             \
 705           record_conversion_result                      \
 706             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 707         goto no_more_source;                            \
 708       }                                                 \
 709     c = *src++;                                         \
 710     if (multibytep && (c & 0x80))                       \
 711       {                                                 \
 712         if ((c & 0xFE) == 0xC0)                         \
 713           c = ((c & 1) << 6) | *src++;                  \
 714         else                                            \
 715           {                                             \
 716             src--;                                      \
 717             c = - string_char (src, &src, NULL);        \
 718             record_conversion_result                    \
 719               (coding, CODING_RESULT_INVALID_SRC);      \
 720           }                                             \
 721       }                                                 \
 722     consumed_chars++;                                   \
 723   } while (0)
 724
 725 /* Safely get two bytes from the source text pointed by SRC which ends
 726    at SRC_END, and set C1 and C2 to those bytes while skipping the
 727    heading multibyte characters.  If there are not enough bytes in the
 728    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 729    a multibyte character is found for C2, set C2 to the negative value
 730    of the character code.  The caller should declare and set these
 731    variables appropriately in advance:
 732         src, src_end, multibytep
 733    It is intended that this macro is used in detect_coding_utf_16.  */
 734
 735 #define TWO_MORE_BYTES(c1, c2)                          \
 736   do {                                                  \
 737     do {                                                \
 738       if (src == src_end)                               \
 739         goto no_more_source;                            \
 740       c1 = *src++;                                      \
 741       if (multibytep && (c1 & 0x80))                    \
 742         {                                               \
 743           if ((c1 & 0xFE) == 0xC0)                      \
 744             c1 = ((c1 & 1) << 6) | *src++;              \
 745           else                                          \
 746             {                                           \
 747               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 748               c1 = -1;                                  \
 749             }                                           \
 750         }                                               \
 751     } while (c1 < 0);                                   \
 752     if (src == src_end)                                 \
 753       goto no_more_source;                              \
 754     c2 = *src++;                                        \
 755     if (multibytep && (c2 & 0x80))                      \
 756       {                                                 \
 757         if ((c2 & 0xFE) == 0xC0)                        \
 758           c2 = ((c2 & 1) << 6) | *src++;                \
 759         else                                            \
 760           c2 = -1;                                      \
 761       }                                                 \
 762   } while (0)
 763
 764
 765 /* Store a byte C in the place pointed by DST and increment DST to the
 766    next free point, and increment PRODUCED_CHARS.  The caller should
 767    assure that C is 0..127, and declare and set the variable `dst'
 768    appropriately in advance.
 769 */
 770
 771
 772 #define EMIT_ONE_ASCII_BYTE(c)  \
 773   do {                          \
 774     produced_chars++;           \
 775     *dst++ = (c);               \
 776   } while (0)
 777
 778
 779 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 780
 781 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 782   do {                                  \
 783     produced_chars += 2;                \
 784     *dst++ = (c1), *dst++ = (c2);       \
 785   } while (0)
 786
 787
 788 /* Store a byte C in the place pointed by DST and increment DST to the
 789    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 790    store in an appropriate multibyte form.  The caller should
 791    declare and set the variables `dst' and `multibytep' appropriately
 792    in advance.  */
 793
 794 #define EMIT_ONE_BYTE(c)                \
 795   do {                                  \
 796     produced_chars++;                   \
 797     if (multibytep)                     \
 798       {                                 \
 799         unsigned ch = (c);              \
 800         if (ch >= 0x80)                 \
 801           ch = BYTE8_TO_CHAR (ch);      \
 802         CHAR_STRING_ADVANCE (ch, dst);  \
 803       }                                 \
 804     else                                \
 805       *dst++ = (c);                     \
 806   } while (0)
 807
 808
 809 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 810
 811 #define EMIT_TWO_BYTES(c1, c2)          \
 812   do {                                  \
 813     produced_chars += 2;                \
 814     if (multibytep)                     \
 815       {                                 \
 816         unsigned ch;                    \
 817                                         \
 818         ch = (c1);                      \
 819         if (ch >= 0x80)                 \
 820           ch = BYTE8_TO_CHAR (ch);      \
 821         CHAR_STRING_ADVANCE (ch, dst);  \
 822         ch = (c2);                      \
 823         if (ch >= 0x80)                 \
 824           ch = BYTE8_TO_CHAR (ch);      \
 825         CHAR_STRING_ADVANCE (ch, dst);  \
 826       }                                 \
 827     else                                \
 828       {                                 \
 829         *dst++ = (c1);                  \
 830         *dst++ = (c2);                  \
 831       }                                 \
 832   } while (0)
 833
 834
 835 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 836   do {                                  \
 837     EMIT_ONE_BYTE (c1);                 \
 838     EMIT_TWO_BYTES (c2, c3);            \
 839   } while (0)
 840
 841
 842 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 843   do {                                          \
 844     EMIT_TWO_BYTES (c1, c2);                    \
 845     EMIT_TWO_BYTES (c3, c4);                    \
 846   } while (0)
 847
 848
 849 static void
 850 record_conversion_result (struct coding_system *coding,
 851                           enum coding_result_code result)
 852 {
 853   coding->result = result;
 854   switch (result)
 855     {
 856     case CODING_RESULT_INSUFFICIENT_SRC:
 857       Vlast_code_conversion_error = Qinsufficient_source;
 858       break;
 859     case CODING_RESULT_INVALID_SRC:
 860       Vlast_code_conversion_error = Qinvalid_source;
 861       break;
 862     case CODING_RESULT_INTERRUPT:
 863       Vlast_code_conversion_error = Qinterrupted;
 864       break;
 865     case CODING_RESULT_INSUFFICIENT_DST:
 866       /* Don't record this error in Vlast_code_conversion_error
 867          because it happens just temporarily and is resolved when the
 868          whole conversion is finished.  */
 869       break;
 870     case CODING_RESULT_SUCCESS:
 871       break;
 872     default:
 873       Vlast_code_conversion_error = intern ("Unknown error");
 874     }
 875 }
 876
 877 /* These wrapper macros are used to preserve validity of pointers into
 878    buffer text across calls to decode_char, encode_char, etc, which
 879    could cause relocation of buffers if it loads a charset map,
 880    because loading a charset map allocates large structures.  */
 881
 882 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 883   do {                                                                       \
 884     ptrdiff_t offset;                                                        \
 885                                                                              \
 886     charset_map_loaded = 0;                                                  \
 887     c = DECODE_CHAR (charset, code);                                         \
 888     if (charset_map_loaded                                                   \
 889         && (offset = coding_change_source (coding)))                         \
 890       {                                                                      \
 891         src += offset;                                                       \
 892         src_base += offset;                                                  \
 893         src_end += offset;                                                   \
 894       }                                                                      \
 895   } while (0)
 896
 897 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 898   do {                                                                  \
 899     ptrdiff_t offset;                                                   \
 900                                                                         \
 901     charset_map_loaded = 0;                                             \
 902     code = ENCODE_CHAR (charset, c);                                    \
 903     if (charset_map_loaded                                              \
 904         && (offset = coding_change_destination (coding)))               \
 905       {                                                                 \
 906         dst += offset;                                                  \
 907         dst_end += offset;                                              \
 908       }                                                                 \
 909   } while (0)
 910
 911 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 912   do {                                                                  \
 913     ptrdiff_t offset;                                                   \
 914                                                                         \
 915     charset_map_loaded = 0;                                             \
 916     charset = char_charset (c, charset_list, code_return);              \
 917     if (charset_map_loaded                                              \
 918         && (offset = coding_change_destination (coding)))               \
 919       {                                                                 \
 920         dst += offset;                                                  \
 921         dst_end += offset;                                              \
 922       }                                                                 \
 923   } while (0)
 924
 925 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 926   do {                                                                  \
 927     ptrdiff_t offset;                                                   \
 928                                                                         \
 929     charset_map_loaded = 0;                                             \
 930     result = CHAR_CHARSET_P (c, charset);                               \
 931     if (charset_map_loaded                                              \
 932         && (offset = coding_change_destination (coding)))               \
 933       {                                                                 \
 934         dst += offset;                                                  \
 935         dst_end += offset;                                              \
 936       }                                                                 \
 937   } while (0)
 938
 939
 940 /* If there are at least BYTES length of room at dst, allocate memory
 941    for coding->destination and update dst and dst_end.  We don't have
 942    to take care of coding->source which will be relocated.  It is
 943    handled by calling coding_set_source in encode_coding.  */
 944
 945 #define ASSURE_DESTINATION(bytes)                               \
 946   do {                                                          \
 947     if (dst + (bytes) >= dst_end)                               \
 948       {                                                         \
 949         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 950                                                                 \
 951         dst = alloc_destination (coding, more_bytes, dst);      \
 952         dst_end = coding->destination + coding->dst_bytes;      \
 953       }                                                         \
 954   } while (0)
 955
 956
 957 /* Store multibyte form of the character C in P, and advance P to the
 958    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 959    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 960    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 961
 962 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 963
 964 /* Return the character code of character whose multibyte form is at
 965    P, and advance P to the end of the multibyte form.  This used to be
 966    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 967    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 968
 969 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 970
 971 /* Set coding->source from coding->src_object.  */
 972
 973 static void
 974 coding_set_source (struct coding_system *coding)
 975 {
 976   if (BUFFERP (coding->src_object))
 977     {
 978       struct buffer *buf = XBUFFER (coding->src_object);
 979
 980       if (coding->src_pos < 0)
 981         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 982       else
 983         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 984     }
 985   else if (STRINGP (coding->src_object))
 986     {
 987       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 988     }
 989   else
 990     {
 991       /* Otherwise, the source is C string and is never relocated
 992          automatically.  Thus we don't have to update anything.  */
 993     }
 994 }
 995
 996
 997 /* Set coding->source from coding->src_object, and return how many
 998    bytes coding->source was changed.  */
 999
1000 static ptrdiff_t
1001 coding_change_source (struct coding_system *coding)
1002 {
1003   const unsigned char *orig = coding->source;
1004   coding_set_source (coding);
1005   return coding->source - orig;
1006 }
1007
1008
1009 /* Set coding->destination from coding->dst_object.  */
1010
1011 static void
1012 coding_set_destination (struct coding_system *coding)
1013 {
1014   if (BUFFERP (coding->dst_object))
1015     {
1016       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1017         {
1018           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1019           coding->dst_bytes = (GAP_END_ADDR
1020                                - (coding->src_bytes - coding->consumed)
1021                                - coding->destination);
1022         }
1023       else
1024         {
1025           /* We are sure that coding->dst_pos_byte is before the gap
1026              of the buffer. */
1027           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1028                                  + coding->dst_pos_byte - BEG_BYTE);
1029           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1030                                - coding->destination);
1031         }
1032     }
1033   else
1034     {
1035       /* Otherwise, the destination is C string and is never relocated
1036          automatically.  Thus we don't have to update anything.  */
1037     }
1038 }
1039
1040
1041 /* Set coding->destination from coding->dst_object, and return how
1042    many bytes coding->destination was changed.  */
1043
1044 static ptrdiff_t
1045 coding_change_destination (struct coding_system *coding)
1046 {
1047   const unsigned char *orig = coding->destination;
1048   coding_set_destination (coding);
1049   return coding->destination - orig;
1050 }
1051
1052
1053 static void
1054 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1055 {
1056   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1057     string_overflow ();
1058   coding->destination = xrealloc (coding->destination,
1059                                   coding->dst_bytes + bytes);
1060   coding->dst_bytes += bytes;
1061 }
1062
1063 static void
1064 coding_alloc_by_making_gap (struct coding_system *coding,
1065                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1066 {
1067   if (EQ (coding->src_object, coding->dst_object))
1068     {
1069       /* The gap may contain the produced data at the head and not-yet
1070          consumed data at the tail.  To preserve those data, we at
1071          first make the gap size to zero, then increase the gap
1072          size.  */
1073       ptrdiff_t add = GAP_SIZE;
1074
1075       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1076       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1077       make_gap (bytes);
1078       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1079       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1080     }
1081   else
1082     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1083 }
1084
1085
1086 static unsigned char *
1087 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1088                    unsigned char *dst)
1089 {
1090   ptrdiff_t offset = dst - coding->destination;
1091
1092   if (BUFFERP (coding->dst_object))
1093     {
1094       struct buffer *buf = XBUFFER (coding->dst_object);
1095
1096       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1097     }
1098   else
1099     coding_alloc_by_realloc (coding, nbytes);
1100   coding_set_destination (coding);
1101   dst = coding->destination + offset;
1102   return dst;
1103 }
1104
1105 /** Macros for annotations.  */
1106
1107 /* An annotation data is stored in the array coding->charbuf in this
1108    format:
1109      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1110    LENGTH is the number of elements in the annotation.
1111    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1112    NCHARS is the number of characters in the text annotated.
1113
1114    The format of the following elements depend on ANNOTATION_MASK.
1115
1116    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1117    follows:
1118      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1119
1120    NBYTES is the number of bytes specified in the header part of
1121    old-style emacs-mule encoding, or 0 for the other kind of
1122    composition.
1123
1124    METHOD is one of enum composition_method.
1125
1126    Optional COMPOSITION-COMPONENTS are characters and composition
1127    rules.
1128
1129    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1130    follows.
1131
1132    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1133    recover from an invalid annotation, and should be skipped by
1134    produce_annotation.  */
1135
1136 /* Maximum length of the header of annotation data.  */
1137 #define MAX_ANNOTATION_LENGTH 5
1138
1139 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1140   do {                                                  \
1141     *(buf)++ = -(len);                                  \
1142     *(buf)++ = (mask);                                  \
1143     *(buf)++ = (nchars);                                \
1144     coding->annotated = 1;                              \
1145   } while (0);
1146
1147 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1148   do {                                                                      \
1149     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1150     *buf++ = nbytes;                                                        \
1151     *buf++ = method;                                                        \
1152   } while (0)
1153
1154
1155 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1156   do {                                                                  \
1157     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1158     *buf++ = id;                                                        \
1159   } while (0)
1160
1161
1162 /* Bitmasks for coding->eol_seen.  */
1163
1164 #define EOL_SEEN_NONE   0
1165 #define EOL_SEEN_LF     1
1166 #define EOL_SEEN_CR     2
1167 #define EOL_SEEN_CRLF   4
1168
1169 \f
1170 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1171
1172
1173
1174 \f
1175 /*** 3. UTF-8 ***/
1176
1177 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1178    Return true if a text is encoded in UTF-8.  */
1179
1180 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1181 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1182 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1183 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1184 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1185 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1186
1187 #define UTF_8_BOM_1 0xEF
1188 #define UTF_8_BOM_2 0xBB
1189 #define UTF_8_BOM_3 0xBF
1190
1191 /* Unlike the other detect_coding_XXX, this function counts number of
1192    characters and check EOL format.  */
1193
1194 static bool
1195 detect_coding_utf_8 (struct coding_system *coding,
1196                      struct coding_detection_info *detect_info)
1197 {
1198   const unsigned char *src = coding->source, *src_base;
1199   const unsigned char *src_end = coding->source + coding->src_bytes;
1200   bool multibytep = coding->src_multibyte;
1201   ptrdiff_t consumed_chars = 0;
1202   bool bom_found = 0;
1203   int nchars = coding->head_ascii;
1204   int eol_seen = coding->eol_seen;
1205
1206   detect_info->checked |= CATEGORY_MASK_UTF_8;
1207   /* A coding system of this category is always ASCII compatible.  */
1208   src += nchars;
1209
1210   if (src == coding->source     /* BOM should be at the head.  */
1211       && src + 3 < src_end      /* BOM is 3-byte long.  */
1212       && src[0] == UTF_8_BOM_1
1213       && src[1] == UTF_8_BOM_2
1214       && src[2] == UTF_8_BOM_3)
1215     {
1216       bom_found = 1;
1217       src += 3;
1218       nchars++;
1219     }
1220
1221   while (1)
1222     {
1223       int c, c1, c2, c3, c4;
1224
1225       src_base = src;
1226       ONE_MORE_BYTE (c);
1227       if (c < 0 || UTF_8_1_OCTET_P (c))
1228         {
1229           nchars++;
1230           if (c == '\r')
1231             {
1232               if (src < src_end && *src == '\n')
1233                 {
1234                   eol_seen |= EOL_SEEN_CRLF;
1235                   src++;
1236                   nchars++;
1237                 }
1238               else
1239                 eol_seen |= EOL_SEEN_CR;
1240             }
1241           else if (c == '\n')
1242             eol_seen |= EOL_SEEN_LF;
1243           continue;
1244         }
1245       ONE_MORE_BYTE (c1);
1246       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1247         break;
1248       if (UTF_8_2_OCTET_LEADING_P (c))
1249         {
1250           nchars++;
1251           continue;
1252         }
1253       ONE_MORE_BYTE (c2);
1254       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1255         break;
1256       if (UTF_8_3_OCTET_LEADING_P (c))
1257         {
1258           nchars++;
1259           continue;
1260         }
1261       ONE_MORE_BYTE (c3);
1262       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1263         break;
1264       if (UTF_8_4_OCTET_LEADING_P (c))
1265         {
1266           nchars++;
1267           continue;
1268         }
1269       ONE_MORE_BYTE (c4);
1270       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1271         break;
1272       if (UTF_8_5_OCTET_LEADING_P (c))
1273         {
1274           nchars++;
1275           continue;
1276         }
1277       break;
1278     }
1279   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1280   return 0;
1281
1282  no_more_source:
1283   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1284     {
1285       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1286       return 0;
1287     }
1288   if (bom_found)
1289     {
1290       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1291       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1292     }
1293   else
1294     {
1295       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1296       if (nchars < src_end - coding->source)
1297         /* The found characters are less than source bytes, which
1298            means that we found a valid non-ASCII characters.  */
1299         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1300     }
1301   coding->detected_utf8_chars = nchars;
1302   return 1;
1303 }
1304
1305
1306 static void
1307 decode_coding_utf_8 (struct coding_system *coding)
1308 {
1309   const unsigned char *src = coding->source + coding->consumed;
1310   const unsigned char *src_end = coding->source + coding->src_bytes;
1311   const unsigned char *src_base;
1312   int *charbuf = coding->charbuf + coding->charbuf_used;
1313   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1314   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1315   bool multibytep = coding->src_multibyte;
1316   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1317   bool eol_dos
1318     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1319   int byte_after_cr = -1;
1320
1321   if (bom != utf_without_bom)
1322     {
1323       int c1, c2, c3;
1324
1325       src_base = src;
1326       ONE_MORE_BYTE (c1);
1327       if (! UTF_8_3_OCTET_LEADING_P (c1))
1328         src = src_base;
1329       else
1330         {
1331           ONE_MORE_BYTE (c2);
1332           if (! UTF_8_EXTRA_OCTET_P (c2))
1333             src = src_base;
1334           else
1335             {
1336               ONE_MORE_BYTE (c3);
1337               if (! UTF_8_EXTRA_OCTET_P (c3))
1338                 src = src_base;
1339               else
1340                 {
1341                   if ((c1 != UTF_8_BOM_1)
1342                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1343                     src = src_base;
1344                   else
1345                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1346                 }
1347             }
1348         }
1349     }
1350   CODING_UTF_8_BOM (coding) = utf_without_bom;
1351
1352   while (1)
1353     {
1354       int c, c1, c2, c3, c4, c5;
1355
1356       src_base = src;
1357       consumed_chars_base = consumed_chars;
1358
1359       if (charbuf >= charbuf_end)
1360         {
1361           if (byte_after_cr >= 0)
1362             src_base--;
1363           break;
1364         }
1365
1366       if (byte_after_cr >= 0)
1367         c1 = byte_after_cr, byte_after_cr = -1;
1368       else
1369         ONE_MORE_BYTE (c1);
1370       if (c1 < 0)
1371         {
1372           c = - c1;
1373         }
1374       else if (UTF_8_1_OCTET_P (c1))
1375         {
1376           if (eol_dos && c1 == '\r')
1377             ONE_MORE_BYTE (byte_after_cr);
1378           c = c1;
1379         }
1380       else
1381         {
1382           ONE_MORE_BYTE (c2);
1383           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1384             goto invalid_code;
1385           if (UTF_8_2_OCTET_LEADING_P (c1))
1386             {
1387               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1388               /* Reject overlong sequences here and below.  Encoders
1389                  producing them are incorrect, they can be misleading,
1390                  and they mess up read/write invariance.  */
1391               if (c < 128)
1392                 goto invalid_code;
1393             }
1394           else
1395             {
1396               ONE_MORE_BYTE (c3);
1397               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1398                 goto invalid_code;
1399               if (UTF_8_3_OCTET_LEADING_P (c1))
1400                 {
1401                   c = (((c1 & 0xF) << 12)
1402                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1403                   if (c < 0x800
1404                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1405                     goto invalid_code;
1406                 }
1407               else
1408                 {
1409                   ONE_MORE_BYTE (c4);
1410                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1411                     goto invalid_code;
1412                   if (UTF_8_4_OCTET_LEADING_P (c1))
1413                     {
1414                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1415                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1416                     if (c < 0x10000)
1417                       goto invalid_code;
1418                     }
1419                   else
1420                     {
1421                       ONE_MORE_BYTE (c5);
1422                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1423                         goto invalid_code;
1424                       if (UTF_8_5_OCTET_LEADING_P (c1))
1425                         {
1426                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1427                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1428                                | (c5 & 0x3F));
1429                           if ((c > MAX_CHAR) || (c < 0x200000))
1430                             goto invalid_code;
1431                         }
1432                       else
1433                         goto invalid_code;
1434                     }
1435                 }
1436             }
1437         }
1438
1439       *charbuf++ = c;
1440       continue;
1441
1442     invalid_code:
1443       src = src_base;
1444       consumed_chars = consumed_chars_base;
1445       ONE_MORE_BYTE (c);
1446       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1447       coding->errors++;
1448     }
1449
1450  no_more_source:
1451   coding->consumed_char += consumed_chars_base;
1452   coding->consumed = src_base - coding->source;
1453   coding->charbuf_used = charbuf - coding->charbuf;
1454 }
1455
1456
1457 static bool
1458 encode_coding_utf_8 (struct coding_system *coding)
1459 {
1460   bool multibytep = coding->dst_multibyte;
1461   int *charbuf = coding->charbuf;
1462   int *charbuf_end = charbuf + coding->charbuf_used;
1463   unsigned char *dst = coding->destination + coding->produced;
1464   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465   ptrdiff_t produced_chars = 0;
1466   int c;
1467
1468   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1469     {
1470       ASSURE_DESTINATION (3);
1471       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1472       CODING_UTF_8_BOM (coding) = utf_without_bom;
1473     }
1474
1475   if (multibytep)
1476     {
1477       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1478
1479       while (charbuf < charbuf_end)
1480         {
1481           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1482
1483           ASSURE_DESTINATION (safe_room);
1484           c = *charbuf++;
1485           if (CHAR_BYTE8_P (c))
1486             {
1487               c = CHAR_TO_BYTE8 (c);
1488               EMIT_ONE_BYTE (c);
1489             }
1490           else
1491             {
1492               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1493               for (p = str; p < pend; p++)
1494                 EMIT_ONE_BYTE (*p);
1495             }
1496         }
1497     }
1498   else
1499     {
1500       int safe_room = MAX_MULTIBYTE_LENGTH;
1501
1502       while (charbuf < charbuf_end)
1503         {
1504           ASSURE_DESTINATION (safe_room);
1505           c = *charbuf++;
1506           if (CHAR_BYTE8_P (c))
1507             *dst++ = CHAR_TO_BYTE8 (c);
1508           else
1509             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1510           produced_chars++;
1511         }
1512     }
1513   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1514   coding->produced_char += produced_chars;
1515   coding->produced = dst - coding->destination;
1516   return 0;
1517 }
1518
1519
1520 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1521    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1522
1523 #define UTF_16_HIGH_SURROGATE_P(val) \
1524   (((val) & 0xFC00) == 0xD800)
1525
1526 #define UTF_16_LOW_SURROGATE_P(val) \
1527   (((val) & 0xFC00) == 0xDC00)
1528
1529
1530 static bool
1531 detect_coding_utf_16 (struct coding_system *coding,
1532                       struct coding_detection_info *detect_info)
1533 {
1534   const unsigned char *src = coding->source;
1535   const unsigned char *src_end = coding->source + coding->src_bytes;
1536   bool multibytep = coding->src_multibyte;
1537   int c1, c2;
1538
1539   detect_info->checked |= CATEGORY_MASK_UTF_16;
1540   if (coding->mode & CODING_MODE_LAST_BLOCK
1541       && (coding->src_chars & 1))
1542     {
1543       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1544       return 0;
1545     }
1546
1547   TWO_MORE_BYTES (c1, c2);
1548   if ((c1 == 0xFF) && (c2 == 0xFE))
1549     {
1550       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1551                              | CATEGORY_MASK_UTF_16_AUTO);
1552       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1553                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1554                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1555     }
1556   else if ((c1 == 0xFE) && (c2 == 0xFF))
1557     {
1558       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1559                              | CATEGORY_MASK_UTF_16_AUTO);
1560       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1561                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1562                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1563     }
1564   else if (c2 < 0)
1565     {
1566       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1567       return 0;
1568     }
1569   else
1570     {
1571       /* We check the dispersion of Eth and Oth bytes where E is even and
1572          O is odd.  If both are high, we assume binary data.*/
1573       unsigned char e[256], o[256];
1574       unsigned e_num = 1, o_num = 1;
1575
1576       memset (e, 0, 256);
1577       memset (o, 0, 256);
1578       e[c1] = 1;
1579       o[c2] = 1;
1580
1581       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1582                                 |CATEGORY_MASK_UTF_16_BE
1583                                 | CATEGORY_MASK_UTF_16_LE);
1584
1585       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1586              != CATEGORY_MASK_UTF_16)
1587         {
1588           TWO_MORE_BYTES (c1, c2);
1589           if (c2 < 0)
1590             break;
1591           if (! e[c1])
1592             {
1593               e[c1] = 1;
1594               e_num++;
1595               if (e_num >= 128)
1596                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1597             }
1598           if (! o[c2])
1599             {
1600               o[c2] = 1;
1601               o_num++;
1602               if (o_num >= 128)
1603                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1604             }
1605         }
1606       return 0;
1607     }
1608
1609  no_more_source:
1610   return 1;
1611 }
1612
1613 static void
1614 decode_coding_utf_16 (struct coding_system *coding)
1615 {
1616   const unsigned char *src = coding->source + coding->consumed;
1617   const unsigned char *src_end = coding->source + coding->src_bytes;
1618   const unsigned char *src_base;
1619   int *charbuf = coding->charbuf + coding->charbuf_used;
1620   /* We may produces at most 3 chars in one loop.  */
1621   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1622   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1623   bool multibytep = coding->src_multibyte;
1624   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1625   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1626   int surrogate = CODING_UTF_16_SURROGATE (coding);
1627   bool eol_dos
1628     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1629   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1630
1631   if (bom == utf_with_bom)
1632     {
1633       int c, c1, c2;
1634
1635       src_base = src;
1636       ONE_MORE_BYTE (c1);
1637       ONE_MORE_BYTE (c2);
1638       c = (c1 << 8) | c2;
1639
1640       if (endian == utf_16_big_endian
1641           ? c != 0xFEFF : c != 0xFFFE)
1642         {
1643           /* The first two bytes are not BOM.  Treat them as bytes
1644              for a normal character.  */
1645           src = src_base;
1646           coding->errors++;
1647         }
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650   else if (bom == utf_detect_bom)
1651     {
1652       /* We have already tried to detect BOM and failed in
1653          detect_coding.  */
1654       CODING_UTF_16_BOM (coding) = utf_without_bom;
1655     }
1656
1657   while (1)
1658     {
1659       int c, c1, c2;
1660
1661       src_base = src;
1662       consumed_chars_base = consumed_chars;
1663
1664       if (charbuf >= charbuf_end)
1665         {
1666           if (byte_after_cr1 >= 0)
1667             src_base -= 2;
1668           break;
1669         }
1670
1671       if (byte_after_cr1 >= 0)
1672         c1 = byte_after_cr1, byte_after_cr1 = -1;
1673       else
1674         ONE_MORE_BYTE (c1);
1675       if (c1 < 0)
1676         {
1677           *charbuf++ = -c1;
1678           continue;
1679         }
1680       if (byte_after_cr2 >= 0)
1681         c2 = byte_after_cr2, byte_after_cr2 = -1;
1682       else
1683         ONE_MORE_BYTE (c2);
1684       if (c2 < 0)
1685         {
1686           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1687           *charbuf++ = -c2;
1688           continue;
1689         }
1690       c = (endian == utf_16_big_endian
1691            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1692
1693       if (surrogate)
1694         {
1695           if (! UTF_16_LOW_SURROGATE_P (c))
1696             {
1697               if (endian == utf_16_big_endian)
1698                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1699               else
1700                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1701               *charbuf++ = c1;
1702               *charbuf++ = c2;
1703               coding->errors++;
1704               if (UTF_16_HIGH_SURROGATE_P (c))
1705                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1706               else
1707                 *charbuf++ = c;
1708             }
1709           else
1710             {
1711               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1712               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1713               *charbuf++ = 0x10000 + c;
1714             }
1715         }
1716       else
1717         {
1718           if (UTF_16_HIGH_SURROGATE_P (c))
1719             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1720           else
1721             {
1722               if (eol_dos && c == '\r')
1723                 {
1724                   ONE_MORE_BYTE (byte_after_cr1);
1725                   ONE_MORE_BYTE (byte_after_cr2);
1726                 }
1727               *charbuf++ = c;
1728             }
1729         }
1730     }
1731
1732  no_more_source:
1733   coding->consumed_char += consumed_chars_base;
1734   coding->consumed = src_base - coding->source;
1735   coding->charbuf_used = charbuf - coding->charbuf;
1736 }
1737
1738 static bool
1739 encode_coding_utf_16 (struct coding_system *coding)
1740 {
1741   bool multibytep = coding->dst_multibyte;
1742   int *charbuf = coding->charbuf;
1743   int *charbuf_end = charbuf + coding->charbuf_used;
1744   unsigned char *dst = coding->destination + coding->produced;
1745   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1746   int safe_room = 8;
1747   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1748   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1749   ptrdiff_t produced_chars = 0;
1750   int c;
1751
1752   if (bom != utf_without_bom)
1753     {
1754       ASSURE_DESTINATION (safe_room);
1755       if (big_endian)
1756         EMIT_TWO_BYTES (0xFE, 0xFF);
1757       else
1758         EMIT_TWO_BYTES (0xFF, 0xFE);
1759       CODING_UTF_16_BOM (coding) = utf_without_bom;
1760     }
1761
1762   while (charbuf < charbuf_end)
1763     {
1764       ASSURE_DESTINATION (safe_room);
1765       c = *charbuf++;
1766       if (c > MAX_UNICODE_CHAR)
1767         c = coding->default_char;
1768
1769       if (c < 0x10000)
1770         {
1771           if (big_endian)
1772             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1773           else
1774             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1775         }
1776       else
1777         {
1778           int c1, c2;
1779
1780           c -= 0x10000;
1781           c1 = (c >> 10) + 0xD800;
1782           c2 = (c & 0x3FF) + 0xDC00;
1783           if (big_endian)
1784             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1785           else
1786             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1787         }
1788     }
1789   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1790   coding->produced = dst - coding->destination;
1791   coding->produced_char += produced_chars;
1792   return 0;
1793 }
1794
1795 \f
1796 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1797
1798 /* Emacs' internal format for representation of multiple character
1799    sets is a kind of multi-byte encoding, i.e. characters are
1800    represented by variable-length sequences of one-byte codes.
1801
1802    ASCII characters and control characters (e.g. `tab', `newline') are
1803    represented by one-byte sequences which are their ASCII codes, in
1804    the range 0x00 through 0x7F.
1805
1806    8-bit characters of the range 0x80..0x9F are represented by
1807    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1808    code + 0x20).
1809
1810    8-bit characters of the range 0xA0..0xFF are represented by
1811    one-byte sequences which are their 8-bit code.
1812
1813    The other characters are represented by a sequence of `base
1814    leading-code', optional `extended leading-code', and one or two
1815    `position-code's.  The length of the sequence is determined by the
1816    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1817    whereas extended leading-code and position-code take the range 0xA0
1818    through 0xFF.  See `charset.h' for more details about leading-code
1819    and position-code.
1820
1821    --- CODE RANGE of Emacs' internal format ---
1822    character set        range
1823    -------------        -----
1824    ascii                0x00..0x7F
1825    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1826    eight-bit-graphic    0xA0..0xBF
1827    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1828    ---------------------------------------------
1829
1830    As this is the internal character representation, the format is
1831    usually not used externally (i.e. in a file or in a data sent to a
1832    process).  But, it is possible to have a text externally in this
1833    format (i.e. by encoding by the coding system `emacs-mule').
1834
1835    In that case, a sequence of one-byte codes has a slightly different
1836    form.
1837
1838    At first, all characters in eight-bit-control are represented by
1839    one-byte sequences which are their 8-bit code.
1840
1841    Next, character composition data are represented by the byte
1842    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1843    where,
1844         METHOD is 0xF2 plus one of composition method (enum
1845         composition_method),
1846
1847         BYTES is 0xA0 plus a byte length of this composition data,
1848
1849         CHARS is 0xA0 plus a number of characters composed by this
1850         data,
1851
1852         COMPONENTs are characters of multibyte form or composition
1853         rules encoded by two-byte of ASCII codes.
1854
1855    In addition, for backward compatibility, the following formats are
1856    also recognized as composition data on decoding.
1857
1858    0x80 MSEQ ...
1859    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1860
1861    Here,
1862         MSEQ is a multibyte form but in these special format:
1863           ASCII: 0xA0 ASCII_CODE+0x80,
1864           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1865         RULE is a one byte code of the range 0xA0..0xF0 that
1866         represents a composition rule.
1867   */
1868
1869 char emacs_mule_bytes[256];
1870
1871
1872 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1873    Return true if a text is encoded in 'emacs-mule'.  */
1874
1875 static bool
1876 detect_coding_emacs_mule (struct coding_system *coding,
1877                           struct coding_detection_info *detect_info)
1878 {
1879   const unsigned char *src = coding->source, *src_base;
1880   const unsigned char *src_end = coding->source + coding->src_bytes;
1881   bool multibytep = coding->src_multibyte;
1882   ptrdiff_t consumed_chars = 0;
1883   int c;
1884   int found = 0;
1885
1886   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1887   /* A coding system of this category is always ASCII compatible.  */
1888   src += coding->head_ascii;
1889
1890   while (1)
1891     {
1892       src_base = src;
1893       ONE_MORE_BYTE (c);
1894       if (c < 0)
1895         continue;
1896       if (c == 0x80)
1897         {
1898           /* Perhaps the start of composite character.  We simply skip
1899              it because analyzing it is too heavy for detecting.  But,
1900              at least, we check that the composite character
1901              constitutes of more than 4 bytes.  */
1902           const unsigned char *src_start;
1903
1904         repeat:
1905           src_start = src;
1906           do
1907             {
1908               ONE_MORE_BYTE (c);
1909             }
1910           while (c >= 0xA0);
1911
1912           if (src - src_start <= 4)
1913             break;
1914           found = CATEGORY_MASK_EMACS_MULE;
1915           if (c == 0x80)
1916             goto repeat;
1917         }
1918
1919       if (c < 0x80)
1920         {
1921           if (c < 0x20
1922               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1923             break;
1924         }
1925       else
1926         {
1927           int more_bytes = emacs_mule_bytes[c] - 1;
1928
1929           while (more_bytes > 0)
1930             {
1931               ONE_MORE_BYTE (c);
1932               if (c < 0xA0)
1933                 {
1934                   src--;        /* Unread the last byte.  */
1935                   break;
1936                 }
1937               more_bytes--;
1938             }
1939           if (more_bytes != 0)
1940             break;
1941           found = CATEGORY_MASK_EMACS_MULE;
1942         }
1943     }
1944   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1945   return 0;
1946
1947  no_more_source:
1948   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1949     {
1950       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1951       return 0;
1952     }
1953   detect_info->found |= found;
1954   return 1;
1955 }
1956
1957
1958 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1959    character.  If CMP_STATUS indicates that we must expect MSEQ or
1960    RULE described above, decode it and return the negative value of
1961    the decoded character or rule.  If an invalid byte is found, return
1962    -1.  If SRC is too short, return -2.  */
1963
1964 static int
1965 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1966                  int *nbytes, int *nchars, int *id,
1967                  struct composition_status *cmp_status)
1968 {
1969   const unsigned char *src_end = coding->source + coding->src_bytes;
1970   const unsigned char *src_base = src;
1971   bool multibytep = coding->src_multibyte;
1972   int charset_ID;
1973   unsigned code;
1974   int c;
1975   int consumed_chars = 0;
1976   bool mseq_found = 0;
1977
1978   ONE_MORE_BYTE (c);
1979   if (c < 0)
1980     {
1981       c = -c;
1982       charset_ID = emacs_mule_charset[0];
1983     }
1984   else
1985     {
1986       if (c >= 0xA0)
1987         {
1988           if (cmp_status->state != COMPOSING_NO
1989               && cmp_status->old_form)
1990             {
1991               if (cmp_status->state == COMPOSING_CHAR)
1992                 {
1993                   if (c == 0xA0)
1994                     {
1995                       ONE_MORE_BYTE (c);
1996                       c -= 0x80;
1997                       if (c < 0)
1998                         goto invalid_code;
1999                     }
2000                   else
2001                     c -= 0x20;
2002                   mseq_found = 1;
2003                 }
2004               else
2005                 {
2006                   *nbytes = src - src_base;
2007                   *nchars = consumed_chars;
2008                   return -c;
2009                 }
2010             }
2011           else
2012             goto invalid_code;
2013         }
2014
2015       switch (emacs_mule_bytes[c])
2016         {
2017         case 2:
2018           if ((charset_ID = emacs_mule_charset[c]) < 0)
2019             goto invalid_code;
2020           ONE_MORE_BYTE (c);
2021           if (c < 0xA0)
2022             goto invalid_code;
2023           code = c & 0x7F;
2024           break;
2025
2026         case 3:
2027           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2028               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2029             {
2030               ONE_MORE_BYTE (c);
2031               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2032                 goto invalid_code;
2033               ONE_MORE_BYTE (c);
2034               if (c < 0xA0)
2035                 goto invalid_code;
2036               code = c & 0x7F;
2037             }
2038           else
2039             {
2040               if ((charset_ID = emacs_mule_charset[c]) < 0)
2041                 goto invalid_code;
2042               ONE_MORE_BYTE (c);
2043               if (c < 0xA0)
2044                 goto invalid_code;
2045               code = (c & 0x7F) << 8;
2046               ONE_MORE_BYTE (c);
2047               if (c < 0xA0)
2048                 goto invalid_code;
2049               code |= c & 0x7F;
2050             }
2051           break;
2052
2053         case 4:
2054           ONE_MORE_BYTE (c);
2055           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2056             goto invalid_code;
2057           ONE_MORE_BYTE (c);
2058           if (c < 0xA0)
2059             goto invalid_code;
2060           code = (c & 0x7F) << 8;
2061           ONE_MORE_BYTE (c);
2062           if (c < 0xA0)
2063             goto invalid_code;
2064           code |= c & 0x7F;
2065           break;
2066
2067         case 1:
2068           code = c;
2069           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2070           break;
2071
2072         default:
2073           emacs_abort ();
2074         }
2075       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2076                           CHARSET_FROM_ID (charset_ID), code, c);
2077       if (c < 0)
2078         goto invalid_code;
2079     }
2080   *nbytes = src - src_base;
2081   *nchars = consumed_chars;
2082   if (id)
2083     *id = charset_ID;
2084   return (mseq_found ? -c : c);
2085
2086  no_more_source:
2087   return -2;
2088
2089  invalid_code:
2090   return -1;
2091 }
2092
2093
2094 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2095
2096 /* Handle these composition sequence ('|': the end of header elements,
2097    BYTES and CHARS >= 0xA0):
2098
2099    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2100    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2101    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2102
2103    and these old form:
2104
2105    (4) relative composition: 0x80 | MSEQ ... MSEQ
2106    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2107
2108    When the starter 0x80 and the following header elements are found,
2109    this annotation header is produced.
2110
2111         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2112
2113    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2114    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2115
2116    Then, upon reading the following elements, these codes are produced
2117    until the composition end is found:
2118
2119    (1) CHAR ... CHAR
2120    (2) ALT ... ALT CHAR ... CHAR
2121    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2122    (4) CHAR ... CHAR
2123    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2124
2125    When the composition end is found, LENGTH and NCHARS in the
2126    annotation header is updated as below:
2127
2128    (1) LENGTH: unchanged, NCHARS: unchanged
2129    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2130    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2131    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2132    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2133
2134    If an error is found while composing, the annotation header is
2135    changed to the original composition header (plus filler -1s) as
2136    below:
2137
2138    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2139    (5)          [ 0x80 0xFF -1 -1- -1 ]
2140
2141    and the sequence [ -2 DECODED-RULE ] is changed to the original
2142    byte sequence as below:
2143         o the original byte sequence is B: [ B -1 ]
2144         o the original byte sequence is B1 B2: [ B1 B2 ]
2145
2146    Most of the routines are implemented by macros because many
2147    variables and labels in the caller decode_coding_emacs_mule must be
2148    accessible, and they are usually called just once (thus doesn't
2149    increase the size of compiled object).  */
2150
2151 /* Decode a composition rule represented by C as a component of
2152    composition sequence of Emacs 20 style.  Set RULE to the decoded
2153    rule. */
2154
2155 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2156   do {                                                  \
2157     int gref, nref;                                     \
2158                                                         \
2159     c -= 0xA0;                                          \
2160     if (c < 0 || c >= 81)                               \
2161       goto invalid_code;                                \
2162     gref = c / 9, nref = c % 9;                         \
2163     if (gref == 4) gref = 10;                           \
2164     if (nref == 4) nref = 10;                           \
2165     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2166   } while (0)
2167
2168
2169 /* Decode a composition rule represented by C and the following byte
2170    at SRC as a component of composition sequence of Emacs 21 style.
2171    Set RULE to the decoded rule.  */
2172
2173 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2174   do {                                                  \
2175     int gref, nref;                                     \
2176                                                         \
2177     gref = c - 0x20;                                    \
2178     if (gref < 0 || gref >= 81)                         \
2179       goto invalid_code;                                \
2180     ONE_MORE_BYTE (c);                                  \
2181     nref = c - 0x20;                                    \
2182     if (nref < 0 || nref >= 81)                         \
2183       goto invalid_code;                                \
2184     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2185   } while (0)
2186
2187
2188 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2189    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2190    byte length of this composition information, CHARS is the number of
2191    characters composed by this composition.  */
2192
2193 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2194   do {                                                                  \
2195     enum composition_method method = c - 0xF2;                          \
2196     int nbytes, nchars;                                                 \
2197                                                                         \
2198     ONE_MORE_BYTE (c);                                                  \
2199     if (c < 0)                                                          \
2200       goto invalid_code;                                                \
2201     nbytes = c - 0xA0;                                                  \
2202     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2203       goto invalid_code;                                                \
2204     ONE_MORE_BYTE (c);                                                  \
2205     nchars = c - 0xA0;                                                  \
2206     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2207       goto invalid_code;                                                \
2208     cmp_status->old_form = 0;                                           \
2209     cmp_status->method = method;                                        \
2210     if (method == COMPOSITION_RELATIVE)                                 \
2211       cmp_status->state = COMPOSING_CHAR;                               \
2212     else                                                                \
2213       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2214     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2215     cmp_status->nchars = nchars;                                        \
2216     cmp_status->ncomps = nbytes - 4;                                    \
2217     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2218   } while (0)
2219
2220
2221 /* Start of Emacs 20 style format for relative composition.  */
2222
2223 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2224   do {                                                          \
2225     cmp_status->old_form = 1;                                   \
2226     cmp_status->method = COMPOSITION_RELATIVE;                  \
2227     cmp_status->state = COMPOSING_CHAR;                         \
2228     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2229     cmp_status->nchars = cmp_status->ncomps = 0;                \
2230     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2231   } while (0)
2232
2233
2234 /* Start of Emacs 20 style format for rule-base composition.  */
2235
2236 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2237   do {                                                          \
2238     cmp_status->old_form = 1;                                   \
2239     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2240     cmp_status->state = COMPOSING_CHAR;                         \
2241     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2242     cmp_status->nchars = cmp_status->ncomps = 0;                \
2243     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2244   } while (0)
2245
2246
2247 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2248   do {                                                  \
2249     const unsigned char *current_src = src;             \
2250                                                         \
2251     ONE_MORE_BYTE (c);                                  \
2252     if (c < 0)                                          \
2253       goto invalid_code;                                \
2254     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2255         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2256       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2257     else if (c < 0xA0)                                  \
2258       goto invalid_code;                                \
2259     else if (c < 0xC0)                                  \
2260       {                                                 \
2261         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2262         /* Re-read C as a composition component.  */    \
2263         src = current_src;                              \
2264       }                                                 \
2265     else if (c == 0xFF)                                 \
2266       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2267     else                                                \
2268       goto invalid_code;                                \
2269   } while (0)
2270
2271 #define EMACS_MULE_COMPOSITION_END()                            \
2272   do {                                                          \
2273     int idx = - cmp_status->length;                             \
2274                                                                 \
2275     if (cmp_status->old_form)                                   \
2276       charbuf[idx + 2] = cmp_status->nchars;                    \
2277     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2278       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2279     cmp_status->state = COMPOSING_NO;                           \
2280   } while (0)
2281
2282
2283 static int
2284 emacs_mule_finish_composition (int *charbuf,
2285                                struct composition_status *cmp_status)
2286 {
2287   int idx = - cmp_status->length;
2288   int new_chars;
2289
2290   if (cmp_status->old_form && cmp_status->nchars > 0)
2291     {
2292       charbuf[idx + 2] = cmp_status->nchars;
2293       new_chars = 0;
2294       if (cmp_status->method == COMPOSITION_WITH_RULE
2295           && cmp_status->state == COMPOSING_CHAR)
2296         {
2297           /* The last rule was invalid.  */
2298           int rule = charbuf[-1] + 0xA0;
2299
2300           charbuf[-2] = BYTE8_TO_CHAR (rule);
2301           charbuf[-1] = -1;
2302           new_chars = 1;
2303         }
2304     }
2305   else
2306     {
2307       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2308
2309       if (cmp_status->method == COMPOSITION_WITH_RULE)
2310         {
2311           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2312           charbuf[idx++] = -3;
2313           charbuf[idx++] = 0;
2314           new_chars = 1;
2315         }
2316       else
2317         {
2318           int nchars = charbuf[idx + 1] + 0xA0;
2319           int nbytes = charbuf[idx + 2] + 0xA0;
2320
2321           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2322           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2323           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2324           charbuf[idx++] = -1;
2325           new_chars = 4;
2326         }
2327     }
2328   cmp_status->state = COMPOSING_NO;
2329   return new_chars;
2330 }
2331
2332 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2333   do {                                                                    \
2334     if (cmp_status->state != COMPOSING_NO)                                \
2335       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2336   } while (0)
2337
2338
2339 static void
2340 decode_coding_emacs_mule (struct coding_system *coding)
2341 {
2342   const unsigned char *src = coding->source + coding->consumed;
2343   const unsigned char *src_end = coding->source + coding->src_bytes;
2344   const unsigned char *src_base;
2345   int *charbuf = coding->charbuf + coding->charbuf_used;
2346   /* We may produce two annotations (charset and composition) in one
2347      loop and one more charset annotation at the end.  */
2348   int *charbuf_end
2349     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2350       /* We can produce up to 2 characters in a loop.  */
2351       - 1;
2352   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2353   bool multibytep = coding->src_multibyte;
2354   ptrdiff_t char_offset = coding->produced_char;
2355   ptrdiff_t last_offset = char_offset;
2356   int last_id = charset_ascii;
2357   bool eol_dos
2358     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2359   int byte_after_cr = -1;
2360   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2361
2362   if (cmp_status->state != COMPOSING_NO)
2363     {
2364       int i;
2365
2366       if (charbuf_end - charbuf < cmp_status->length)
2367         emacs_abort ();
2368       for (i = 0; i < cmp_status->length; i++)
2369         *charbuf++ = cmp_status->carryover[i];
2370       coding->annotated = 1;
2371     }
2372
2373   while (1)
2374     {
2375       int c, id IF_LINT (= 0);
2376
2377       src_base = src;
2378       consumed_chars_base = consumed_chars;
2379
2380       if (charbuf >= charbuf_end)
2381         {
2382           if (byte_after_cr >= 0)
2383             src_base--;
2384           break;
2385         }
2386
2387       if (byte_after_cr >= 0)
2388         c = byte_after_cr, byte_after_cr = -1;
2389       else
2390         ONE_MORE_BYTE (c);
2391
2392       if (c < 0 || c == 0x80)
2393         {
2394           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2395           if (c < 0)
2396             {
2397               *charbuf++ = -c;
2398               char_offset++;
2399             }
2400           else
2401             DECODE_EMACS_MULE_COMPOSITION_START ();
2402           continue;
2403         }
2404
2405       if (c < 0x80)
2406         {
2407           if (eol_dos && c == '\r')
2408             ONE_MORE_BYTE (byte_after_cr);
2409           id = charset_ascii;
2410           if (cmp_status->state != COMPOSING_NO)
2411             {
2412               if (cmp_status->old_form)
2413                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2414               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2415                 cmp_status->ncomps--;
2416             }
2417         }
2418       else
2419         {
2420           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2421           /* emacs_mule_char can load a charset map from a file, which
2422              allocates a large structure and might cause buffer text
2423              to be relocated as result.  Thus, we need to remember the
2424              original pointer to buffer text, and fix up all related
2425              pointers after the call.  */
2426           const unsigned char *orig = coding->source;
2427           ptrdiff_t offset;
2428
2429           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2430                                cmp_status);
2431           offset = coding->source - orig;
2432           if (offset)
2433             {
2434               src += offset;
2435               src_base += offset;
2436               src_end += offset;
2437             }
2438           if (c < 0)
2439             {
2440               if (c == -1)
2441                 goto invalid_code;
2442               if (c == -2)
2443                 break;
2444             }
2445           src = src_base + nbytes;
2446           consumed_chars = consumed_chars_base + nchars;
2447           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2448             cmp_status->ncomps -= nchars;
2449         }
2450
2451       /* Now if C >= 0, we found a normally encoded character, if C <
2452          0, we found an old-style composition component character or
2453          rule.  */
2454
2455       if (cmp_status->state == COMPOSING_NO)
2456         {
2457           if (last_id != id)
2458             {
2459               if (last_id != charset_ascii)
2460                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2461                                   last_id);
2462               last_id = id;
2463               last_offset = char_offset;
2464             }
2465           *charbuf++ = c;
2466           char_offset++;
2467         }
2468       else if (cmp_status->state == COMPOSING_CHAR)
2469         {
2470           if (cmp_status->old_form)
2471             {
2472               if (c >= 0)
2473                 {
2474                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2475                   *charbuf++ = c;
2476                   char_offset++;
2477                 }
2478               else
2479                 {
2480                   *charbuf++ = -c;
2481                   cmp_status->nchars++;
2482                   cmp_status->length++;
2483                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2484                     EMACS_MULE_COMPOSITION_END ();
2485                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2486                     cmp_status->state = COMPOSING_RULE;
2487                 }
2488             }
2489           else
2490             {
2491               *charbuf++ = c;
2492               cmp_status->length++;
2493               cmp_status->nchars--;
2494               if (cmp_status->nchars == 0)
2495                 EMACS_MULE_COMPOSITION_END ();
2496             }
2497         }
2498       else if (cmp_status->state == COMPOSING_RULE)
2499         {
2500           int rule;
2501
2502           if (c >= 0)
2503             {
2504               EMACS_MULE_COMPOSITION_END ();
2505               *charbuf++ = c;
2506               char_offset++;
2507             }
2508           else
2509             {
2510               c = -c;
2511               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2512               if (rule < 0)
2513                 goto invalid_code;
2514               *charbuf++ = -2;
2515               *charbuf++ = rule;
2516               cmp_status->length += 2;
2517               cmp_status->state = COMPOSING_CHAR;
2518             }
2519         }
2520       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2521         {
2522           *charbuf++ = c;
2523           cmp_status->length++;
2524           if (cmp_status->ncomps == 0)
2525             cmp_status->state = COMPOSING_CHAR;
2526           else if (cmp_status->ncomps > 0)
2527             {
2528               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2529                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2530             }
2531           else
2532             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2533         }
2534       else                      /* COMPOSING_COMPONENT_RULE */
2535         {
2536           int rule;
2537
2538           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2539           if (rule < 0)
2540             goto invalid_code;
2541           *charbuf++ = -2;
2542           *charbuf++ = rule;
2543           cmp_status->length += 2;
2544           cmp_status->ncomps--;
2545           if (cmp_status->ncomps > 0)
2546             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2547           else
2548             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2549         }
2550       continue;
2551
2552     invalid_code:
2553       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2554       src = src_base;
2555       consumed_chars = consumed_chars_base;
2556       ONE_MORE_BYTE (c);
2557       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2558       char_offset++;
2559       coding->errors++;
2560     }
2561
2562  no_more_source:
2563   if (cmp_status->state != COMPOSING_NO)
2564     {
2565       if (coding->mode & CODING_MODE_LAST_BLOCK)
2566         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2567       else
2568         {
2569           int i;
2570
2571           charbuf -= cmp_status->length;
2572           for (i = 0; i < cmp_status->length; i++)
2573             cmp_status->carryover[i] = charbuf[i];
2574         }
2575     }
2576   if (last_id != charset_ascii)
2577     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2578   coding->consumed_char += consumed_chars_base;
2579   coding->consumed = src_base - coding->source;
2580   coding->charbuf_used = charbuf - coding->charbuf;
2581 }
2582
2583
2584 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2585   do {                                          \
2586     if (id < 0xA0)                              \
2587       codes[0] = id, codes[1] = 0;              \
2588     else if (id < 0xE0)                         \
2589       codes[0] = 0x9A, codes[1] = id;           \
2590     else if (id < 0xF0)                         \
2591       codes[0] = 0x9B, codes[1] = id;           \
2592     else if (id < 0xF5)                         \
2593       codes[0] = 0x9C, codes[1] = id;           \
2594     else                                        \
2595       codes[0] = 0x9D, codes[1] = id;           \
2596   } while (0);
2597
2598
2599 static bool
2600 encode_coding_emacs_mule (struct coding_system *coding)
2601 {
2602   bool multibytep = coding->dst_multibyte;
2603   int *charbuf = coding->charbuf;
2604   int *charbuf_end = charbuf + coding->charbuf_used;
2605   unsigned char *dst = coding->destination + coding->produced;
2606   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2607   int safe_room = 8;
2608   ptrdiff_t produced_chars = 0;
2609   Lisp_Object attrs, charset_list;
2610   int c;
2611   int preferred_charset_id = -1;
2612
2613   CODING_GET_INFO (coding, attrs, charset_list);
2614   if (! EQ (charset_list, Vemacs_mule_charset_list))
2615     {
2616       charset_list = Vemacs_mule_charset_list;
2617       ASET (attrs, coding_attr_charset_list, charset_list);
2618     }
2619
2620   while (charbuf < charbuf_end)
2621     {
2622       ASSURE_DESTINATION (safe_room);
2623       c = *charbuf++;
2624
2625       if (c < 0)
2626         {
2627           /* Handle an annotation.  */
2628           switch (*charbuf)
2629             {
2630             case CODING_ANNOTATE_COMPOSITION_MASK:
2631               /* Not yet implemented.  */
2632               break;
2633             case CODING_ANNOTATE_CHARSET_MASK:
2634               preferred_charset_id = charbuf[3];
2635               if (preferred_charset_id >= 0
2636                   && NILP (Fmemq (make_number (preferred_charset_id),
2637                                   charset_list)))
2638                 preferred_charset_id = -1;
2639               break;
2640             default:
2641               emacs_abort ();
2642             }
2643           charbuf += -c - 1;
2644           continue;
2645         }
2646
2647       if (ASCII_CHAR_P (c))
2648         EMIT_ONE_ASCII_BYTE (c);
2649       else if (CHAR_BYTE8_P (c))
2650         {
2651           c = CHAR_TO_BYTE8 (c);
2652           EMIT_ONE_BYTE (c);
2653         }
2654       else
2655         {
2656           struct charset *charset;
2657           unsigned code;
2658           int dimension;
2659           int emacs_mule_id;
2660           unsigned char leading_codes[2];
2661
2662           if (preferred_charset_id >= 0)
2663             {
2664               bool result;
2665
2666               charset = CHARSET_FROM_ID (preferred_charset_id);
2667               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2668               if (result)
2669                 code = ENCODE_CHAR (charset, c);
2670               else
2671                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2672                                      &code, charset);
2673             }
2674           else
2675             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2676                                  &code, charset);
2677           if (! charset)
2678             {
2679               c = coding->default_char;
2680               if (ASCII_CHAR_P (c))
2681                 {
2682                   EMIT_ONE_ASCII_BYTE (c);
2683                   continue;
2684                 }
2685               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2686                                    &code, charset);
2687             }
2688           dimension = CHARSET_DIMENSION (charset);
2689           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2690           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2691           EMIT_ONE_BYTE (leading_codes[0]);
2692           if (leading_codes[1])
2693             EMIT_ONE_BYTE (leading_codes[1]);
2694           if (dimension == 1)
2695             EMIT_ONE_BYTE (code | 0x80);
2696           else
2697             {
2698               code |= 0x8080;
2699               EMIT_ONE_BYTE (code >> 8);
2700               EMIT_ONE_BYTE (code & 0xFF);
2701             }
2702         }
2703     }
2704   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2705   coding->produced_char += produced_chars;
2706   coding->produced = dst - coding->destination;
2707   return 0;
2708 }
2709
2710 \f
2711 /*** 7. ISO2022 handlers ***/
2712
2713 /* The following note describes the coding system ISO2022 briefly.
2714    Since the intention of this note is to help understand the
2715    functions in this file, some parts are NOT ACCURATE or are OVERLY
2716    SIMPLIFIED.  For thorough understanding, please refer to the
2717    original document of ISO2022.  This is equivalent to the standard
2718    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2719
2720    ISO2022 provides many mechanisms to encode several character sets
2721    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2722    is encoded using bytes less than 128.  This may make the encoded
2723    text a little bit longer, but the text passes more easily through
2724    several types of gateway, some of which strip off the MSB (Most
2725    Significant Bit).
2726
2727    There are two kinds of character sets: control character sets and
2728    graphic character sets.  The former contain control characters such
2729    as `newline' and `escape' to provide control functions (control
2730    functions are also provided by escape sequences).  The latter
2731    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2732    two control character sets and many graphic character sets.
2733
2734    Graphic character sets are classified into one of the following
2735    four classes, according to the number of bytes (DIMENSION) and
2736    number of characters in one dimension (CHARS) of the set:
2737    - DIMENSION1_CHARS94
2738    - DIMENSION1_CHARS96
2739    - DIMENSION2_CHARS94
2740    - DIMENSION2_CHARS96
2741
2742    In addition, each character set is assigned an identification tag,
2743    unique for each set, called the "final character" (denoted as <F>
2744    hereafter).  The <F> of each character set is decided by ECMA(*)
2745    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2746    (0x30..0x3F are for private use only).
2747
2748    Note (*): ECMA = European Computer Manufacturers Association
2749
2750    Here are examples of graphic character sets [NAME(<F>)]:
2751         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2752         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2753         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2754         o DIMENSION2_CHARS96 -- none for the moment
2755
2756    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2757         C0 [0x00..0x1F] -- control character plane 0
2758         GL [0x20..0x7F] -- graphic character plane 0
2759         C1 [0x80..0x9F] -- control character plane 1
2760         GR [0xA0..0xFF] -- graphic character plane 1
2761
2762    A control character set is directly designated and invoked to C0 or
2763    C1 by an escape sequence.  The most common case is that:
2764    - ISO646's  control character set is designated/invoked to C0, and
2765    - ISO6429's control character set is designated/invoked to C1,
2766    and usually these designations/invocations are omitted in encoded
2767    text.  In a 7-bit environment, only C0 can be used, and a control
2768    character for C1 is encoded by an appropriate escape sequence to
2769    fit into the environment.  All control characters for C1 are
2770    defined to have corresponding escape sequences.
2771
2772    A graphic character set is at first designated to one of four
2773    graphic registers (G0 through G3), then these graphic registers are
2774    invoked to GL or GR.  These designations and invocations can be
2775    done independently.  The most common case is that G0 is invoked to
2776    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2777    these invocations and designations are omitted in encoded text.
2778    In a 7-bit environment, only GL can be used.
2779
2780    When a graphic character set of CHARS94 is invoked to GL, codes
2781    0x20 and 0x7F of the GL area work as control characters SPACE and
2782    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2783    be used.
2784
2785    There are two ways of invocation: locking-shift and single-shift.
2786    With locking-shift, the invocation lasts until the next different
2787    invocation, whereas with single-shift, the invocation affects the
2788    following character only and doesn't affect the locking-shift
2789    state.  Invocations are done by the following control characters or
2790    escape sequences:
2791
2792    ----------------------------------------------------------------------
2793    abbrev  function                  cntrl escape seq   description
2794    ----------------------------------------------------------------------
2795    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2796    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2797    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2798    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2799    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2800    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2801    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2802    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2803    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2804    ----------------------------------------------------------------------
2805    (*) These are not used by any known coding system.
2806
2807    Control characters for these functions are defined by macros
2808    ISO_CODE_XXX in `coding.h'.
2809
2810    Designations are done by the following escape sequences:
2811    ----------------------------------------------------------------------
2812    escape sequence      description
2813    ----------------------------------------------------------------------
2814    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2815    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2816    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2817    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2818    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2819    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2820    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2821    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2822    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2823    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2824    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2825    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2826    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2827    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2828    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2829    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2830    ----------------------------------------------------------------------
2831
2832    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2833    of dimension 1, chars 94, and final character <F>, etc...
2834
2835    Note (*): Although these designations are not allowed in ISO2022,
2836    Emacs accepts them on decoding, and produces them on encoding
2837    CHARS96 character sets in a coding system which is characterized as
2838    7-bit environment, non-locking-shift, and non-single-shift.
2839
2840    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2841    '(' must be omitted.  We refer to this as "short-form" hereafter.
2842
2843    Now you may notice that there are a lot of ways of encoding the
2844    same multilingual text in ISO2022.  Actually, there exist many
2845    coding systems such as Compound Text (used in X11's inter client
2846    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2847    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2848    localized platforms), and all of these are variants of ISO2022.
2849
2850    In addition to the above, Emacs handles two more kinds of escape
2851    sequences: ISO6429's direction specification and Emacs' private
2852    sequence for specifying character composition.
2853
2854    ISO6429's direction specification takes the following form:
2855         o CSI ']'      -- end of the current direction
2856         o CSI '0' ']'  -- end of the current direction
2857         o CSI '1' ']'  -- start of left-to-right text
2858         o CSI '2' ']'  -- start of right-to-left text
2859    The control character CSI (0x9B: control sequence introducer) is
2860    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2861
2862    Character composition specification takes the following form:
2863         o ESC '0' -- start relative composition
2864         o ESC '1' -- end composition
2865         o ESC '2' -- start rule-base composition (*)
2866         o ESC '3' -- start relative composition with alternate chars  (**)
2867         o ESC '4' -- start rule-base composition with alternate chars  (**)
2868   Since these are not standard escape sequences of any ISO standard,
2869   the use of them with these meanings is restricted to Emacs only.
2870
2871   (*) This form is used only in Emacs 20.7 and older versions,
2872   but newer versions can safely decode it.
2873   (**) This form is used only in Emacs 21.1 and newer versions,
2874   and older versions can't decode it.
2875
2876   Here's a list of example usages of these composition escape
2877   sequences (categorized by `enum composition_method').
2878
2879   COMPOSITION_RELATIVE:
2880         ESC 0 CHAR [ CHAR ] ESC 1
2881   COMPOSITION_WITH_RULE:
2882         ESC 2 CHAR [ RULE CHAR ] ESC 1
2883   COMPOSITION_WITH_ALTCHARS:
2884         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2885   COMPOSITION_WITH_RULE_ALTCHARS:
2886         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2887
2888 static enum iso_code_class_type iso_code_class[256];
2889
2890 #define SAFE_CHARSET_P(coding, id)      \
2891   ((id) <= (coding)->max_charset_id     \
2892    && (coding)->safe_charsets[id] != 255)
2893
2894 static void
2895 setup_iso_safe_charsets (Lisp_Object attrs)
2896 {
2897   Lisp_Object charset_list, safe_charsets;
2898   Lisp_Object request;
2899   Lisp_Object reg_usage;
2900   Lisp_Object tail;
2901   EMACS_INT reg94, reg96;
2902   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2903   int max_charset_id;
2904
2905   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2906   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2907       && ! EQ (charset_list, Viso_2022_charset_list))
2908     {
2909       charset_list = Viso_2022_charset_list;
2910       ASET (attrs, coding_attr_charset_list, charset_list);
2911       ASET (attrs, coding_attr_safe_charsets, Qnil);
2912     }
2913
2914   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2915     return;
2916
2917   max_charset_id = 0;
2918   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2919     {
2920       int id = XINT (XCAR (tail));
2921       if (max_charset_id < id)
2922         max_charset_id = id;
2923     }
2924
2925   safe_charsets = make_uninit_string (max_charset_id + 1);
2926   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2927   request = AREF (attrs, coding_attr_iso_request);
2928   reg_usage = AREF (attrs, coding_attr_iso_usage);
2929   reg94 = XINT (XCAR (reg_usage));
2930   reg96 = XINT (XCDR (reg_usage));
2931
2932   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2933     {
2934       Lisp_Object id;
2935       Lisp_Object reg;
2936       struct charset *charset;
2937
2938       id = XCAR (tail);
2939       charset = CHARSET_FROM_ID (XINT (id));
2940       reg = Fcdr (Fassq (id, request));
2941       if (! NILP (reg))
2942         SSET (safe_charsets, XINT (id), XINT (reg));
2943       else if (charset->iso_chars_96)
2944         {
2945           if (reg96 < 4)
2946             SSET (safe_charsets, XINT (id), reg96);
2947         }
2948       else
2949         {
2950           if (reg94 < 4)
2951             SSET (safe_charsets, XINT (id), reg94);
2952         }
2953     }
2954   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2955 }
2956
2957
2958 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2959    Return true if a text is encoded in one of ISO-2022 based coding
2960    systems.  */
2961
2962 static bool
2963 detect_coding_iso_2022 (struct coding_system *coding,
2964                         struct coding_detection_info *detect_info)
2965 {
2966   const unsigned char *src = coding->source, *src_base = src;
2967   const unsigned char *src_end = coding->source + coding->src_bytes;
2968   bool multibytep = coding->src_multibyte;
2969   bool single_shifting = 0;
2970   int id;
2971   int c, c1;
2972   ptrdiff_t consumed_chars = 0;
2973   int i;
2974   int rejected = 0;
2975   int found = 0;
2976   int composition_count = -1;
2977
2978   detect_info->checked |= CATEGORY_MASK_ISO;
2979
2980   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2981     {
2982       struct coding_system *this = &(coding_categories[i]);
2983       Lisp_Object attrs, val;
2984
2985       if (this->id < 0)
2986         continue;
2987       attrs = CODING_ID_ATTRS (this->id);
2988       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2989           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2990         setup_iso_safe_charsets (attrs);
2991       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2992       this->max_charset_id = SCHARS (val) - 1;
2993       this->safe_charsets = SDATA (val);
2994     }
2995
2996   /* A coding system of this category is always ASCII compatible.  */
2997   src += coding->head_ascii;
2998
2999   while (rejected != CATEGORY_MASK_ISO)
3000     {
3001       src_base = src;
3002       ONE_MORE_BYTE (c);
3003       switch (c)
3004         {
3005         case ISO_CODE_ESC:
3006           if (inhibit_iso_escape_detection)
3007             break;
3008           single_shifting = 0;
3009           ONE_MORE_BYTE (c);
3010           if (c == 'N' || c == 'O')
3011             {
3012               /* ESC <Fe> for SS2 or SS3.  */
3013               single_shifting = 1;
3014               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3015             }
3016           else if (c == '1')
3017             {
3018               /* End of composition.  */
3019               if (composition_count < 0
3020                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3021                 /* Invalid */
3022                 break;
3023               composition_count = -1;
3024               found |= CATEGORY_MASK_ISO;
3025             }
3026           else if (c >= '0' && c <= '4')
3027             {
3028               /* ESC <Fp> for start/end composition.  */
3029               composition_count = 0;
3030             }
3031           else
3032             {
3033               if (c >= '(' && c <= '/')
3034                 {
3035                   /* Designation sequence for a charset of dimension 1.  */
3036                   ONE_MORE_BYTE (c1);
3037                   if (c1 < ' ' || c1 >= 0x80
3038                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3039                     /* Invalid designation sequence.  Just ignore.  */
3040                     break;
3041                 }
3042               else if (c == '$')
3043                 {
3044                   /* Designation sequence for a charset of dimension 2.  */
3045                   ONE_MORE_BYTE (c);
3046                   if (c >= '@' && c <= 'B')
3047                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3048                     id = iso_charset_table[1][0][c];
3049                   else if (c >= '(' && c <= '/')
3050                     {
3051                       ONE_MORE_BYTE (c1);
3052                       if (c1 < ' ' || c1 >= 0x80
3053                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3054                         /* Invalid designation sequence.  Just ignore.  */
3055                         break;
3056                     }
3057                   else
3058                     /* Invalid designation sequence.  Just ignore it.  */
3059                     break;
3060                 }
3061               else
3062                 {
3063                   /* Invalid escape sequence.  Just ignore it.  */
3064                   break;
3065                 }
3066
3067               /* We found a valid designation sequence for CHARSET.  */
3068               rejected |= CATEGORY_MASK_ISO_8BIT;
3069               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3070                                   id))
3071                 found |= CATEGORY_MASK_ISO_7;
3072               else
3073                 rejected |= CATEGORY_MASK_ISO_7;
3074               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3075                                   id))
3076                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3077               else
3078                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3079               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3080                                   id))
3081                 found |= CATEGORY_MASK_ISO_7_ELSE;
3082               else
3083                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3084               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3085                                   id))
3086                 found |= CATEGORY_MASK_ISO_8_ELSE;
3087               else
3088                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3089             }
3090           break;
3091
3092         case ISO_CODE_SO:
3093         case ISO_CODE_SI:
3094           /* Locking shift out/in.  */
3095           if (inhibit_iso_escape_detection)
3096             break;
3097           single_shifting = 0;
3098           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3099           break;
3100
3101         case ISO_CODE_CSI:
3102           /* Control sequence introducer.  */
3103           single_shifting = 0;
3104           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3105           found |= CATEGORY_MASK_ISO_8_ELSE;
3106           goto check_extra_latin;
3107
3108         case ISO_CODE_SS2:
3109         case ISO_CODE_SS3:
3110           /* Single shift.   */
3111           if (inhibit_iso_escape_detection)
3112             break;
3113           single_shifting = 0;
3114           rejected |= CATEGORY_MASK_ISO_7BIT;
3115           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116               & CODING_ISO_FLAG_SINGLE_SHIFT)
3117             {
3118               found |= CATEGORY_MASK_ISO_8_1;
3119               single_shifting = 1;
3120             }
3121           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3122               & CODING_ISO_FLAG_SINGLE_SHIFT)
3123             {
3124               found |= CATEGORY_MASK_ISO_8_2;
3125               single_shifting = 1;
3126             }
3127           if (single_shifting)
3128             break;
3129           goto check_extra_latin;
3130
3131         default:
3132           if (c < 0)
3133             continue;
3134           if (c < 0x80)
3135             {
3136               if (composition_count >= 0)
3137                 composition_count++;
3138               single_shifting = 0;
3139               break;
3140             }
3141           if (c >= 0xA0)
3142             {
3143               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3144               found |= CATEGORY_MASK_ISO_8_1;
3145               /* Check the length of succeeding codes of the range
3146                  0xA0..0FF.  If the byte length is even, we include
3147                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3148                  only when we are not single shifting.  */
3149               if (! single_shifting
3150                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3151                 {
3152                   int len = 1;
3153                   while (src < src_end)
3154                     {
3155                       src_base = src;
3156                       ONE_MORE_BYTE (c);
3157                       if (c < 0xA0)
3158                         {
3159                           src = src_base;
3160                           break;
3161                         }
3162                       len++;
3163                     }
3164
3165                   if (len & 1 && src < src_end)
3166                     {
3167                       rejected |= CATEGORY_MASK_ISO_8_2;
3168                       if (composition_count >= 0)
3169                         composition_count += len;
3170                     }
3171                   else
3172                     {
3173                       found |= CATEGORY_MASK_ISO_8_2;
3174                       if (composition_count >= 0)
3175                         composition_count += len / 2;
3176                     }
3177                 }
3178               break;
3179             }
3180         check_extra_latin:
3181           if (! VECTORP (Vlatin_extra_code_table)
3182               || NILP (AREF (Vlatin_extra_code_table, c)))
3183             {
3184               rejected = CATEGORY_MASK_ISO;
3185               break;
3186             }
3187           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3188               & CODING_ISO_FLAG_LATIN_EXTRA)
3189             found |= CATEGORY_MASK_ISO_8_1;
3190           else
3191             rejected |= CATEGORY_MASK_ISO_8_1;
3192           rejected |= CATEGORY_MASK_ISO_8_2;
3193           break;
3194         }
3195     }
3196   detect_info->rejected |= CATEGORY_MASK_ISO;
3197   return 0;
3198
3199  no_more_source:
3200   detect_info->rejected |= rejected;
3201   detect_info->found |= (found & ~rejected);
3202   return 1;
3203 }
3204
3205
3206 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3207    escape sequence should be kept.  */
3208 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3209   do {                                                                  \
3210     int id, prev;                                                       \
3211                                                                         \
3212     if (final < '0' || final >= 128                                     \
3213         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3214         || !SAFE_CHARSET_P (coding, id))                                \
3215       {                                                                 \
3216         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3217         chars_96 = -1;                                                  \
3218         break;                                                          \
3219       }                                                                 \
3220     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3221     if (id == charset_jisx0201_roman)                                   \
3222       {                                                                 \
3223         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3224           id = charset_ascii;                                           \
3225       }                                                                 \
3226     else if (id == charset_jisx0208_1978)                               \
3227       {                                                                 \
3228         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3229           id = charset_jisx0208;                                        \
3230       }                                                                 \
3231     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3232     /* If there was an invalid designation to REG previously, and this  \
3233        designation is ASCII to REG, we should keep this designation     \
3234        sequence.  */                                                    \
3235     if (prev == -2 && id == charset_ascii)                              \
3236       chars_96 = -1;                                                    \
3237   } while (0)
3238
3239
3240 /* Handle these composition sequence (ALT: alternate char):
3241
3242    (1) relative composition: ESC 0 CHAR ... ESC 1
3243    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3244    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3245    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3246
3247    When the start sequence (ESC 0/2/3/4) is found, this annotation
3248    header is produced.
3249
3250         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3251
3252    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3253    produced until the end sequence (ESC 1) is found:
3254
3255    (1) CHAR ... CHAR
3256    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3257    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3258    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3259
3260    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3261    annotation header is updated as below:
3262
3263    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3264    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3265    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3266    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3267
3268    If an error is found while composing, the annotation header is
3269    changed to:
3270
3271         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3272
3273    and the sequence [ -2 DECODED-RULE ] is changed to the original
3274    byte sequence as below:
3275         o the original byte sequence is B: [ B -1 ]
3276         o the original byte sequence is B1 B2: [ B1 B2 ]
3277    and the sequence [ -1 -1 ] is changed to the original byte
3278    sequence:
3279         [ ESC '0' ]
3280 */
3281
3282 /* Decode a composition rule C1 and maybe one more byte from the
3283    source, and set RULE to the encoded composition rule.  If the rule
3284    is invalid, goto invalid_code.  */
3285
3286 #define DECODE_COMPOSITION_RULE(rule)                                   \
3287   do {                                                                  \
3288     rule = c1 - 32;                                                     \
3289     if (rule < 0)                                                       \
3290       goto invalid_code;                                                \
3291     if (rule < 81)              /* old format (before ver.21) */        \
3292       {                                                                 \
3293         int gref = (rule) / 9;                                          \
3294         int nref = (rule) % 9;                                          \
3295         if (gref == 4) gref = 10;                                       \
3296         if (nref == 4) nref = 10;                                       \
3297         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3298       }                                                                 \
3299     else                        /* new format (after ver.21) */         \
3300       {                                                                 \
3301         int b;                                                          \
3302                                                                         \
3303         ONE_MORE_BYTE (b);                                              \
3304         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3305           goto invalid_code;                                            \
3306         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3307         rule += 0x100;   /* Distinguish it from the old format.  */     \
3308       }                                                                 \
3309   } while (0)
3310
3311 #define ENCODE_COMPOSITION_RULE(rule)                           \
3312   do {                                                          \
3313     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3314                                                                 \
3315     if (rule < 0x100)           /* old format */                \
3316       {                                                         \
3317         if (gref == 10) gref = 4;                               \
3318         if (nref == 10) nref = 4;                               \
3319         charbuf[idx] = 32 + gref * 9 + nref;                    \
3320         charbuf[idx + 1] = -1;                                  \
3321         new_chars++;                                            \
3322       }                                                         \
3323     else                                /* new format */        \
3324       {                                                         \
3325         charbuf[idx] = 32 + 81 + gref;                          \
3326         charbuf[idx + 1] = 32 + nref;                           \
3327         new_chars += 2;                                         \
3328       }                                                         \
3329   } while (0)
3330
3331 /* Finish the current composition as invalid.  */
3332
3333 static int
3334 finish_composition (int *charbuf, struct composition_status *cmp_status)
3335 {
3336   int idx = - cmp_status->length;
3337   int new_chars;
3338
3339   /* Recover the original ESC sequence */
3340   charbuf[idx++] = ISO_CODE_ESC;
3341   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3342                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3343                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3344                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3345                     : '4');
3346   charbuf[idx++] = -2;
3347   charbuf[idx++] = 0;
3348   charbuf[idx++] = -1;
3349   new_chars = cmp_status->nchars;
3350   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3351     for (; idx < 0; idx++)
3352       {
3353         int elt = charbuf[idx];
3354
3355         if (elt == -2)
3356           {
3357             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3358             idx++;
3359           }
3360         else if (elt == -1)
3361           {
3362             charbuf[idx++] = ISO_CODE_ESC;
3363             charbuf[idx] = '0';
3364             new_chars += 2;
3365           }
3366       }
3367   cmp_status->state = COMPOSING_NO;
3368   return new_chars;
3369 }
3370
3371 /* If characters are under composition, finish the composition.  */
3372 #define MAYBE_FINISH_COMPOSITION()                              \
3373   do {                                                          \
3374     if (cmp_status->state != COMPOSING_NO)                      \
3375       char_offset += finish_composition (charbuf, cmp_status);  \
3376   } while (0)
3377
3378 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3379
3380    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3381    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3382    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3383    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3384
3385    Produce this annotation sequence now:
3386
3387    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3388 */
3389
3390 #define DECODE_COMPOSITION_START(c1)                                       \
3391   do {                                                                     \
3392     if (c1 == '0'                                                          \
3393         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3394              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3395             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3396                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3397       {                                                                    \
3398         *charbuf++ = -1;                                                   \
3399         *charbuf++= -1;                                                    \
3400         cmp_status->state = COMPOSING_CHAR;                                \
3401         cmp_status->length += 2;                                           \
3402       }                                                                    \
3403     else                                                                   \
3404       {                                                                    \
3405         MAYBE_FINISH_COMPOSITION ();                                       \
3406         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3407                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3408                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3409                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3410         cmp_status->state                                                  \
3411           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3412         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3413         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3414         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3415         coding->annotated = 1;                                             \
3416       }                                                                    \
3417   } while (0)
3418
3419
3420 /* Handle composition end sequence ESC 1.  */
3421
3422 #define DECODE_COMPOSITION_END()                                        \
3423   do {                                                                  \
3424     if (cmp_status->nchars == 0                                         \
3425         || ((cmp_status->state == COMPOSING_CHAR)                       \
3426             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3427       {                                                                 \
3428         MAYBE_FINISH_COMPOSITION ();                                    \
3429         goto invalid_code;                                              \
3430       }                                                                 \
3431     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3432       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3433     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3434       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3435     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3436     char_offset += cmp_status->nchars;                                  \
3437     cmp_status->state = COMPOSING_NO;                                   \
3438   } while (0)
3439
3440 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3441
3442 #define STORE_COMPOSITION_RULE(rule)    \
3443   do {                                  \
3444     *charbuf++ = -2;                    \
3445     *charbuf++ = rule;                  \
3446     cmp_status->length += 2;            \
3447     cmp_status->state--;                \
3448   } while (0)
3449
3450 /* Store a composed char or a component char C in charbuf, and update
3451    cmp_status.  */
3452
3453 #define STORE_COMPOSITION_CHAR(c)                                       \
3454   do {                                                                  \
3455     *charbuf++ = (c);                                                   \
3456     cmp_status->length++;                                               \
3457     if (cmp_status->state == COMPOSING_CHAR)                            \
3458       cmp_status->nchars++;                                             \
3459     else                                                                \
3460       cmp_status->ncomps++;                                             \
3461     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3462         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3463             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3464       cmp_status->state++;                                              \
3465   } while (0)
3466
3467
3468 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3469
3470 static void
3471 decode_coding_iso_2022 (struct coding_system *coding)
3472 {
3473   const unsigned char *src = coding->source + coding->consumed;
3474   const unsigned char *src_end = coding->source + coding->src_bytes;
3475   const unsigned char *src_base;
3476   int *charbuf = coding->charbuf + coding->charbuf_used;
3477   /* We may produce two annotations (charset and composition) in one
3478      loop and one more charset annotation at the end.  */
3479   int *charbuf_end
3480     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3481   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3482   bool multibytep = coding->src_multibyte;
3483   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3484   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3485   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3486   int charset_id_2, charset_id_3;
3487   struct charset *charset;
3488   int c;
3489   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3490   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3491   ptrdiff_t char_offset = coding->produced_char;
3492   ptrdiff_t last_offset = char_offset;
3493   int last_id = charset_ascii;
3494   bool eol_dos
3495     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3496   int byte_after_cr = -1;
3497   int i;
3498
3499   setup_iso_safe_charsets (attrs);
3500   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3501
3502   if (cmp_status->state != COMPOSING_NO)
3503     {
3504       if (charbuf_end - charbuf < cmp_status->length)
3505         emacs_abort ();
3506       for (i = 0; i < cmp_status->length; i++)
3507         *charbuf++ = cmp_status->carryover[i];
3508       coding->annotated = 1;
3509     }
3510
3511   while (1)
3512     {
3513       int c1, c2, c3;
3514
3515       src_base = src;
3516       consumed_chars_base = consumed_chars;
3517
3518       if (charbuf >= charbuf_end)
3519         {
3520           if (byte_after_cr >= 0)
3521             src_base--;
3522           break;
3523         }
3524
3525       if (byte_after_cr >= 0)
3526         c1 = byte_after_cr, byte_after_cr = -1;
3527       else
3528         ONE_MORE_BYTE (c1);
3529       if (c1 < 0)
3530         goto invalid_code;
3531
3532       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3533         {
3534           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3535           char_offset++;
3536           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3537           continue;
3538         }
3539
3540       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3541         {
3542           if (c1 == ISO_CODE_ESC)
3543             {
3544               if (src + 1 >= src_end)
3545                 goto no_more_source;
3546               *charbuf++ = ISO_CODE_ESC;
3547               char_offset++;
3548               if (src[0] == '%' && src[1] == '@')
3549                 {
3550                   src += 2;
3551                   consumed_chars += 2;
3552                   char_offset += 2;
3553                   /* We are sure charbuf can contain two more chars. */
3554                   *charbuf++ = '%';
3555                   *charbuf++ = '@';
3556                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3557                 }
3558             }
3559           else
3560             {
3561               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3562               char_offset++;
3563             }
3564           continue;
3565         }
3566
3567       if ((cmp_status->state == COMPOSING_RULE
3568            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3569           && c1 != ISO_CODE_ESC)
3570         {
3571           int rule;
3572
3573           DECODE_COMPOSITION_RULE (rule);
3574           STORE_COMPOSITION_RULE (rule);
3575           continue;
3576         }
3577
3578       /* We produce at most one character.  */
3579       switch (iso_code_class [c1])
3580         {
3581         case ISO_0x20_or_0x7F:
3582           if (charset_id_0 < 0
3583               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3584             /* This is SPACE or DEL.  */
3585             charset = CHARSET_FROM_ID (charset_ascii);
3586           else
3587             charset = CHARSET_FROM_ID (charset_id_0);
3588           break;
3589
3590         case ISO_graphic_plane_0:
3591           if (charset_id_0 < 0)
3592             charset = CHARSET_FROM_ID (charset_ascii);
3593           else
3594             charset = CHARSET_FROM_ID (charset_id_0);
3595           break;
3596
3597         case ISO_0xA0_or_0xFF:
3598           if (charset_id_1 < 0
3599               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3600               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3601             goto invalid_code;
3602           /* This is a graphic character, we fall down ... */
3603
3604         case ISO_graphic_plane_1:
3605           if (charset_id_1 < 0)
3606             goto invalid_code;
3607           charset = CHARSET_FROM_ID (charset_id_1);
3608           break;
3609
3610         case ISO_control_0:
3611           if (eol_dos && c1 == '\r')
3612             ONE_MORE_BYTE (byte_after_cr);
3613           MAYBE_FINISH_COMPOSITION ();
3614           charset = CHARSET_FROM_ID (charset_ascii);
3615           break;
3616
3617         case ISO_control_1:
3618           goto invalid_code;
3619
3620         case ISO_shift_out:
3621           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3622               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3623             goto invalid_code;
3624           CODING_ISO_INVOCATION (coding, 0) = 1;
3625           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3626           continue;
3627
3628         case ISO_shift_in:
3629           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3630             goto invalid_code;
3631           CODING_ISO_INVOCATION (coding, 0) = 0;
3632           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633           continue;
3634
3635         case ISO_single_shift_2_7:
3636           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3637             goto invalid_code;
3638         case ISO_single_shift_2:
3639           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3640             goto invalid_code;
3641           /* SS2 is handled as an escape sequence of ESC 'N' */
3642           c1 = 'N';
3643           goto label_escape_sequence;
3644
3645         case ISO_single_shift_3:
3646           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3647             goto invalid_code;
3648           /* SS2 is handled as an escape sequence of ESC 'O' */
3649           c1 = 'O';
3650           goto label_escape_sequence;
3651
3652         case ISO_control_sequence_introducer:
3653           /* CSI is handled as an escape sequence of ESC '[' ...  */
3654           c1 = '[';
3655           goto label_escape_sequence;
3656
3657         case ISO_escape:
3658           ONE_MORE_BYTE (c1);
3659         label_escape_sequence:
3660           /* Escape sequences handled here are invocation,
3661              designation, direction specification, and character
3662              composition specification.  */
3663           switch (c1)
3664             {
3665             case '&':           /* revision of following character set */
3666               ONE_MORE_BYTE (c1);
3667               if (!(c1 >= '@' && c1 <= '~'))
3668                 goto invalid_code;
3669               ONE_MORE_BYTE (c1);
3670               if (c1 != ISO_CODE_ESC)
3671                 goto invalid_code;
3672               ONE_MORE_BYTE (c1);
3673               goto label_escape_sequence;
3674
3675             case '$':           /* designation of 2-byte character set */
3676               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3677                 goto invalid_code;
3678               {
3679                 int reg, chars96;
3680
3681                 ONE_MORE_BYTE (c1);
3682                 if (c1 >= '@' && c1 <= 'B')
3683                   {     /* designation of JISX0208.1978, GB2312.1980,
3684                            or JISX0208.1980 */
3685                     reg = 0, chars96 = 0;
3686                   }
3687                 else if (c1 >= 0x28 && c1 <= 0x2B)
3688                   { /* designation of DIMENSION2_CHARS94 character set */
3689                     reg = c1 - 0x28, chars96 = 0;
3690                     ONE_MORE_BYTE (c1);
3691                   }
3692                 else if (c1 >= 0x2C && c1 <= 0x2F)
3693                   { /* designation of DIMENSION2_CHARS96 character set */
3694                     reg = c1 - 0x2C, chars96 = 1;
3695                     ONE_MORE_BYTE (c1);
3696                   }
3697                 else
3698                   goto invalid_code;
3699                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3700                 /* We must update these variables now.  */
3701                 if (reg == 0)
3702                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3703                 else if (reg == 1)
3704                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3705                 if (chars96 < 0)
3706                   goto invalid_code;
3707               }
3708               continue;
3709
3710             case 'n':           /* invocation of locking-shift-2 */
3711               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3712                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3713                 goto invalid_code;
3714               CODING_ISO_INVOCATION (coding, 0) = 2;
3715               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3716               continue;
3717
3718             case 'o':           /* invocation of locking-shift-3 */
3719               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3720                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3721                 goto invalid_code;
3722               CODING_ISO_INVOCATION (coding, 0) = 3;
3723               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3724               continue;
3725
3726             case 'N':           /* invocation of single-shift-2 */
3727               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3728                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3729                 goto invalid_code;
3730               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3731               if (charset_id_2 < 0)
3732                 charset = CHARSET_FROM_ID (charset_ascii);
3733               else
3734                 charset = CHARSET_FROM_ID (charset_id_2);
3735               ONE_MORE_BYTE (c1);
3736               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3737                 goto invalid_code;
3738               break;
3739
3740             case 'O':           /* invocation of single-shift-3 */
3741               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3742                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3743                 goto invalid_code;
3744               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3745               if (charset_id_3 < 0)
3746                 charset = CHARSET_FROM_ID (charset_ascii);
3747               else
3748                 charset = CHARSET_FROM_ID (charset_id_3);
3749               ONE_MORE_BYTE (c1);
3750               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3751                 goto invalid_code;
3752               break;
3753
3754             case '0': case '2': case '3': case '4': /* start composition */
3755               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3756                 goto invalid_code;
3757               if (last_id != charset_ascii)
3758                 {
3759                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3760                   last_id = charset_ascii;
3761                   last_offset = char_offset;
3762                 }
3763               DECODE_COMPOSITION_START (c1);
3764               continue;
3765
3766             case '1':           /* end composition */
3767               if (cmp_status->state == COMPOSING_NO)
3768                 goto invalid_code;
3769               DECODE_COMPOSITION_END ();
3770               continue;
3771
3772             case '[':           /* specification of direction */
3773               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3774                 goto invalid_code;
3775               /* For the moment, nested direction is not supported.
3776                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3777                  left-to-right, and nonzero means right-to-left.  */
3778               ONE_MORE_BYTE (c1);
3779               switch (c1)
3780                 {
3781                 case ']':       /* end of the current direction */
3782                   coding->mode &= ~CODING_MODE_DIRECTION;
3783
3784                 case '0':       /* end of the current direction */
3785                 case '1':       /* start of left-to-right direction */
3786                   ONE_MORE_BYTE (c1);
3787                   if (c1 == ']')
3788                     coding->mode &= ~CODING_MODE_DIRECTION;
3789                   else
3790                     goto invalid_code;
3791                   break;
3792
3793                 case '2':       /* start of right-to-left direction */
3794                   ONE_MORE_BYTE (c1);
3795                   if (c1 == ']')
3796                     coding->mode |= CODING_MODE_DIRECTION;
3797                   else
3798                     goto invalid_code;
3799                   break;
3800
3801                 default:
3802                   goto invalid_code;
3803                 }
3804               continue;
3805
3806             case '%':
3807               ONE_MORE_BYTE (c1);
3808               if (c1 == '/')
3809                 {
3810                   /* CTEXT extended segment:
3811                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3812                      We keep these bytes as is for the moment.
3813                      They may be decoded by post-read-conversion.  */
3814                   int dim, M, L;
3815                   int size;
3816
3817                   ONE_MORE_BYTE (dim);
3818                   if (dim < '0' || dim > '4')
3819                     goto invalid_code;
3820                   ONE_MORE_BYTE (M);
3821                   if (M < 128)
3822                     goto invalid_code;
3823                   ONE_MORE_BYTE (L);
3824                   if (L < 128)
3825                     goto invalid_code;
3826                   size = ((M - 128) * 128) + (L - 128);
3827                   if (charbuf + 6 > charbuf_end)
3828                     goto break_loop;
3829                   *charbuf++ = ISO_CODE_ESC;
3830                   *charbuf++ = '%';
3831                   *charbuf++ = '/';
3832                   *charbuf++ = dim;
3833                   *charbuf++ = BYTE8_TO_CHAR (M);
3834                   *charbuf++ = BYTE8_TO_CHAR (L);
3835                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3836                 }
3837               else if (c1 == 'G')
3838                 {
3839                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3840                      ESC % G --UTF-8-BYTES-- ESC % @
3841                      We keep these bytes as is for the moment.
3842                      They may be decoded by post-read-conversion.  */
3843                   if (charbuf + 3 > charbuf_end)
3844                     goto break_loop;
3845                   *charbuf++ = ISO_CODE_ESC;
3846                   *charbuf++ = '%';
3847                   *charbuf++ = 'G';
3848                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3849                 }
3850               else
3851                 goto invalid_code;
3852               continue;
3853               break;
3854
3855             default:
3856               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3857                 goto invalid_code;
3858               {
3859                 int reg, chars96;
3860
3861                 if (c1 >= 0x28 && c1 <= 0x2B)
3862                   { /* designation of DIMENSION1_CHARS94 character set */
3863                     reg = c1 - 0x28, chars96 = 0;
3864                     ONE_MORE_BYTE (c1);
3865                   }
3866                 else if (c1 >= 0x2C && c1 <= 0x2F)
3867                   { /* designation of DIMENSION1_CHARS96 character set */
3868                     reg = c1 - 0x2C, chars96 = 1;
3869                     ONE_MORE_BYTE (c1);
3870                   }
3871                 else
3872                   goto invalid_code;
3873                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3874                 /* We must update these variables now.  */
3875                 if (reg == 0)
3876                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3877                 else if (reg == 1)
3878                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3879                 if (chars96 < 0)
3880                   goto invalid_code;
3881               }
3882               continue;
3883             }
3884           break;
3885
3886         default:
3887           emacs_abort ();
3888         }
3889
3890       if (cmp_status->state == COMPOSING_NO
3891           && charset->id != charset_ascii
3892           && last_id != charset->id)
3893         {
3894           if (last_id != charset_ascii)
3895             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3896           last_id = charset->id;
3897           last_offset = char_offset;
3898         }
3899
3900       /* Now we know CHARSET and 1st position code C1 of a character.
3901          Produce a decoded character while getting 2nd and 3rd
3902          position codes C2, C3 if necessary.  */
3903       if (CHARSET_DIMENSION (charset) > 1)
3904         {
3905           ONE_MORE_BYTE (c2);
3906           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3907               || ((c1 & 0x80) != (c2 & 0x80)))
3908             /* C2 is not in a valid range.  */
3909             goto invalid_code;
3910           if (CHARSET_DIMENSION (charset) == 2)
3911             c1 = (c1 << 8) | c2;
3912           else
3913             {
3914               ONE_MORE_BYTE (c3);
3915               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3916                   || ((c1 & 0x80) != (c3 & 0x80)))
3917                 /* C3 is not in a valid range.  */
3918                 goto invalid_code;
3919               c1 = (c1 << 16) | (c2 << 8) | c2;
3920             }
3921         }
3922       c1 &= 0x7F7F7F;
3923       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3924       if (c < 0)
3925         {
3926           MAYBE_FINISH_COMPOSITION ();
3927           for (; src_base < src; src_base++, char_offset++)
3928             {
3929               if (ASCII_BYTE_P (*src_base))
3930                 *charbuf++ = *src_base;
3931               else
3932                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3933             }
3934         }
3935       else if (cmp_status->state == COMPOSING_NO)
3936         {
3937           *charbuf++ = c;
3938           char_offset++;
3939         }
3940       else if ((cmp_status->state == COMPOSING_CHAR
3941                 ? cmp_status->nchars
3942                 : cmp_status->ncomps)
3943                >= MAX_COMPOSITION_COMPONENTS)
3944         {
3945           /* Too long composition.  */
3946           MAYBE_FINISH_COMPOSITION ();
3947           *charbuf++ = c;
3948           char_offset++;
3949         }
3950       else
3951         STORE_COMPOSITION_CHAR (c);
3952       continue;
3953
3954     invalid_code:
3955       MAYBE_FINISH_COMPOSITION ();
3956       src = src_base;
3957       consumed_chars = consumed_chars_base;
3958       ONE_MORE_BYTE (c);
3959       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3960       char_offset++;
3961       coding->errors++;
3962       /* Reset the invocation and designation status to the safest
3963          one; i.e. designate ASCII to the graphic register 0, and
3964          invoke that register to the graphic plane 0.  This typically
3965          helps the case that an designation sequence for ASCII "ESC (
3966          B" is somehow broken (e.g. broken by a newline).  */
3967       CODING_ISO_INVOCATION (coding, 0) = 0;
3968       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3969       charset_id_0 = charset_ascii;
3970       continue;
3971
3972     break_loop:
3973       break;
3974     }
3975
3976  no_more_source:
3977   if (cmp_status->state != COMPOSING_NO)
3978     {
3979       if (coding->mode & CODING_MODE_LAST_BLOCK)
3980         MAYBE_FINISH_COMPOSITION ();
3981       else
3982         {
3983           charbuf -= cmp_status->length;
3984           for (i = 0; i < cmp_status->length; i++)
3985             cmp_status->carryover[i] = charbuf[i];
3986         }
3987     }
3988   else if (last_id != charset_ascii)
3989     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3990   coding->consumed_char += consumed_chars_base;
3991   coding->consumed = src_base - coding->source;
3992   coding->charbuf_used = charbuf - coding->charbuf;
3993 }
3994
3995
3996 /* ISO2022 encoding stuff.  */
3997
3998 /*
3999    It is not enough to say just "ISO2022" on encoding, we have to
4000    specify more details.  In Emacs, each coding system of ISO2022
4001    variant has the following specifications:
4002         1. Initial designation to G0 thru G3.
4003         2. Allows short-form designation?
4004         3. ASCII should be designated to G0 before control characters?
4005         4. ASCII should be designated to G0 at end of line?
4006         5. 7-bit environment or 8-bit environment?
4007         6. Use locking-shift?
4008         7. Use Single-shift?
4009    And the following two are only for Japanese:
4010         8. Use ASCII in place of JIS0201-1976-Roman?
4011         9. Use JISX0208-1983 in place of JISX0208-1978?
4012    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4013    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4014    details.
4015 */
4016
4017 /* Produce codes (escape sequence) for designating CHARSET to graphic
4018    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4019    '@', 'A', or 'B' and the coding system CODING allows, produce
4020    designation sequence of short-form.  */
4021
4022 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4023   do {                                                                  \
4024     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4025     const char *intermediate_char_94 = "()*+";                          \
4026     const char *intermediate_char_96 = ",-./";                          \
4027     int revision = -1;                                                  \
4028                                                                         \
4029     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4030       revision = CHARSET_ISO_REVISION (charset);                        \
4031                                                                         \
4032     if (revision >= 0)                                                  \
4033       {                                                                 \
4034         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4035         EMIT_ONE_BYTE ('@' + revision);                                 \
4036       }                                                                 \
4037     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4038     if (CHARSET_DIMENSION (charset) == 1)                               \
4039       {                                                                 \
4040         int b;                                                          \
4041         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4042           b = intermediate_char_94[reg];                                \
4043         else                                                            \
4044           b = intermediate_char_96[reg];                                \
4045         EMIT_ONE_ASCII_BYTE (b);                                        \
4046       }                                                                 \
4047     else                                                                \
4048       {                                                                 \
4049         EMIT_ONE_ASCII_BYTE ('$');                                      \
4050         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4051           {                                                             \
4052             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4053                 || reg != 0                                             \
4054                 || final_char < '@' || final_char > 'B')                \
4055               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4056           }                                                             \
4057         else                                                            \
4058           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4059       }                                                                 \
4060     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4061                                                                         \
4062     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4063   } while (0)
4064
4065
4066 /* The following two macros produce codes (control character or escape
4067    sequence) for ISO2022 single-shift functions (single-shift-2 and
4068    single-shift-3).  */
4069
4070 #define ENCODE_SINGLE_SHIFT_2                                           \
4071   do {                                                                  \
4072     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4073       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4074     else                                                                \
4075       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4076     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4077   } while (0)
4078
4079
4080 #define ENCODE_SINGLE_SHIFT_3                                           \
4081   do {                                                                  \
4082     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4083       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4084     else                                                                \
4085       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4086     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4087   } while (0)
4088
4089
4090 /* The following four macros produce codes (control character or
4091    escape sequence) for ISO2022 locking-shift functions (shift-in,
4092    shift-out, locking-shift-2, and locking-shift-3).  */
4093
4094 #define ENCODE_SHIFT_IN                                 \
4095   do {                                                  \
4096     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4097     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4098   } while (0)
4099
4100
4101 #define ENCODE_SHIFT_OUT                                \
4102   do {                                                  \
4103     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4104     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4105   } while (0)
4106
4107
4108 #define ENCODE_LOCKING_SHIFT_2                          \
4109   do {                                                  \
4110     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4111     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4112   } while (0)
4113
4114
4115 #define ENCODE_LOCKING_SHIFT_3                          \
4116   do {                                                  \
4117     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4118     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4119   } while (0)
4120
4121
4122 /* Produce codes for a DIMENSION1 character whose character set is
4123    CHARSET and whose position-code is C1.  Designation and invocation
4124    sequences are also produced in advance if necessary.  */
4125
4126 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4127   do {                                                                  \
4128     int id = CHARSET_ID (charset);                                      \
4129                                                                         \
4130     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4131         && id == charset_ascii)                                         \
4132       {                                                                 \
4133         id = charset_jisx0201_roman;                                    \
4134         charset = CHARSET_FROM_ID (id);                                 \
4135       }                                                                 \
4136                                                                         \
4137     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4138       {                                                                 \
4139         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4140           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4141         else                                                            \
4142           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4143         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4144         break;                                                          \
4145       }                                                                 \
4146     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4147       {                                                                 \
4148         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4149         break;                                                          \
4150       }                                                                 \
4151     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4152       {                                                                 \
4153         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4154         break;                                                          \
4155       }                                                                 \
4156     else                                                                \
4157       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4158          must invoke it, or, at first, designate it to some graphic     \
4159          register.  Then repeat the loop to actually produce the        \
4160          character.  */                                                 \
4161       dst = encode_invocation_designation (charset, coding, dst,        \
4162                                            &produced_chars);            \
4163   } while (1)
4164
4165
4166 /* Produce codes for a DIMENSION2 character whose character set is
4167    CHARSET and whose position-codes are C1 and C2.  Designation and
4168    invocation codes are also produced in advance if necessary.  */
4169
4170 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4171   do {                                                                  \
4172     int id = CHARSET_ID (charset);                                      \
4173                                                                         \
4174     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4175         && id == charset_jisx0208)                                      \
4176       {                                                                 \
4177         id = charset_jisx0208_1978;                                     \
4178         charset = CHARSET_FROM_ID (id);                                 \
4179       }                                                                 \
4180                                                                         \
4181     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4182       {                                                                 \
4183         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4184           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4185         else                                                            \
4186           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4187         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4188         break;                                                          \
4189       }                                                                 \
4190     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4191       {                                                                 \
4192         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4193         break;                                                          \
4194       }                                                                 \
4195     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4196       {                                                                 \
4197         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4198         break;                                                          \
4199       }                                                                 \
4200     else                                                                \
4201       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4202          must invoke it, or, at first, designate it to some graphic     \
4203          register.  Then repeat the loop to actually produce the        \
4204          character.  */                                                 \
4205       dst = encode_invocation_designation (charset, coding, dst,        \
4206                                            &produced_chars);            \
4207   } while (1)
4208
4209
4210 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4211   do {                                                                     \
4212     unsigned code;                                                         \
4213     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4214                                                                            \
4215     if (CHARSET_DIMENSION (charset) == 1)                                  \
4216       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4217     else                                                                   \
4218       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4219   } while (0)
4220
4221
4222 /* Produce designation and invocation codes at a place pointed by DST
4223    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4224    Return new DST.  */
4225
4226 static unsigned char *
4227 encode_invocation_designation (struct charset *charset,
4228                                struct coding_system *coding,
4229                                unsigned char *dst, ptrdiff_t *p_nchars)
4230 {
4231   bool multibytep = coding->dst_multibyte;
4232   ptrdiff_t produced_chars = *p_nchars;
4233   int reg;                      /* graphic register number */
4234   int id = CHARSET_ID (charset);
4235
4236   /* At first, check designations.  */
4237   for (reg = 0; reg < 4; reg++)
4238     if (id == CODING_ISO_DESIGNATION (coding, reg))
4239       break;
4240
4241   if (reg >= 4)
4242     {
4243       /* CHARSET is not yet designated to any graphic registers.  */
4244       /* At first check the requested designation.  */
4245       reg = CODING_ISO_REQUEST (coding, id);
4246       if (reg < 0)
4247         /* Since CHARSET requests no special designation, designate it
4248            to graphic register 0.  */
4249         reg = 0;
4250
4251       ENCODE_DESIGNATION (charset, reg, coding);
4252     }
4253
4254   if (CODING_ISO_INVOCATION (coding, 0) != reg
4255       && CODING_ISO_INVOCATION (coding, 1) != reg)
4256     {
4257       /* Since the graphic register REG is not invoked to any graphic
4258          planes, invoke it to graphic plane 0.  */
4259       switch (reg)
4260         {
4261         case 0:                 /* graphic register 0 */
4262           ENCODE_SHIFT_IN;
4263           break;
4264
4265         case 1:                 /* graphic register 1 */
4266           ENCODE_SHIFT_OUT;
4267           break;
4268
4269         case 2:                 /* graphic register 2 */
4270           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4271             ENCODE_SINGLE_SHIFT_2;
4272           else
4273             ENCODE_LOCKING_SHIFT_2;
4274           break;
4275
4276         case 3:                 /* graphic register 3 */
4277           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4278             ENCODE_SINGLE_SHIFT_3;
4279           else
4280             ENCODE_LOCKING_SHIFT_3;
4281           break;
4282         }
4283     }
4284
4285   *p_nchars = produced_chars;
4286   return dst;
4287 }
4288
4289
4290 /* Produce codes for designation and invocation to reset the graphic
4291    planes and registers to initial state.  */
4292 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4293   do {                                                                  \
4294     int reg;                                                            \
4295     struct charset *charset;                                            \
4296                                                                         \
4297     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4298       ENCODE_SHIFT_IN;                                                  \
4299     for (reg = 0; reg < 4; reg++)                                       \
4300       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4301           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4302               != CODING_ISO_INITIAL (coding, reg)))                     \
4303         {                                                               \
4304           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4305           ENCODE_DESIGNATION (charset, reg, coding);                    \
4306         }                                                               \
4307   } while (0)
4308
4309
4310 /* Produce designation sequences of charsets in the line started from
4311    CHARBUF to a place pointed by DST, and return the number of
4312    produced bytes.  DST should not directly point a buffer text area
4313    which may be relocated by char_charset call.
4314
4315    If the current block ends before any end-of-line, we may fail to
4316    find all the necessary designations.  */
4317
4318 static ptrdiff_t
4319 encode_designation_at_bol (struct coding_system *coding,
4320                            int *charbuf, int *charbuf_end,
4321                            unsigned char *dst)
4322 {
4323   unsigned char *orig = dst;
4324   struct charset *charset;
4325   /* Table of charsets to be designated to each graphic register.  */
4326   int r[4];
4327   int c, found = 0, reg;
4328   ptrdiff_t produced_chars = 0;
4329   bool multibytep = coding->dst_multibyte;
4330   Lisp_Object attrs;
4331   Lisp_Object charset_list;
4332
4333   attrs = CODING_ID_ATTRS (coding->id);
4334   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4335   if (EQ (charset_list, Qiso_2022))
4336     charset_list = Viso_2022_charset_list;
4337
4338   for (reg = 0; reg < 4; reg++)
4339     r[reg] = -1;
4340
4341   while (charbuf < charbuf_end && found < 4)
4342     {
4343       int id;
4344
4345       c = *charbuf++;
4346       if (c == '\n')
4347         break;
4348       charset = char_charset (c, charset_list, NULL);
4349       id = CHARSET_ID (charset);
4350       reg = CODING_ISO_REQUEST (coding, id);
4351       if (reg >= 0 && r[reg] < 0)
4352         {
4353           found++;
4354           r[reg] = id;
4355         }
4356     }
4357
4358   if (found)
4359     {
4360       for (reg = 0; reg < 4; reg++)
4361         if (r[reg] >= 0
4362             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4363           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4364     }
4365
4366   return dst - orig;
4367 }
4368
4369 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4370
4371 static bool
4372 encode_coding_iso_2022 (struct coding_system *coding)
4373 {
4374   bool multibytep = coding->dst_multibyte;
4375   int *charbuf = coding->charbuf;
4376   int *charbuf_end = charbuf + coding->charbuf_used;
4377   unsigned char *dst = coding->destination + coding->produced;
4378   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4379   int safe_room = 16;
4380   bool bol_designation
4381     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4382        && CODING_ISO_BOL (coding));
4383   ptrdiff_t produced_chars = 0;
4384   Lisp_Object attrs, eol_type, charset_list;
4385   bool ascii_compatible;
4386   int c;
4387   int preferred_charset_id = -1;
4388
4389   CODING_GET_INFO (coding, attrs, charset_list);
4390   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4391   if (VECTORP (eol_type))
4392     eol_type = Qunix;
4393
4394   setup_iso_safe_charsets (attrs);
4395   /* Charset list may have been changed.  */
4396   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4397   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4398
4399   ascii_compatible
4400     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4401        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4402                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4403
4404   while (charbuf < charbuf_end)
4405     {
4406       ASSURE_DESTINATION (safe_room);
4407
4408       if (bol_designation)
4409         {
4410           /* We have to produce designation sequences if any now.  */
4411           unsigned char desig_buf[16];
4412           int nbytes;
4413           ptrdiff_t offset;
4414
4415           charset_map_loaded = 0;
4416           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4417                                               desig_buf);
4418           if (charset_map_loaded
4419               && (offset = coding_change_destination (coding)))
4420             {
4421               dst += offset;
4422               dst_end += offset;
4423             }
4424           memcpy (dst, desig_buf, nbytes);
4425           dst += nbytes;
4426           /* We are sure that designation sequences are all ASCII bytes.  */
4427           produced_chars += nbytes;
4428           bol_designation = 0;
4429           ASSURE_DESTINATION (safe_room);
4430         }
4431
4432       c = *charbuf++;
4433
4434       if (c < 0)
4435         {
4436           /* Handle an annotation.  */
4437           switch (*charbuf)
4438             {
4439             case CODING_ANNOTATE_COMPOSITION_MASK:
4440               /* Not yet implemented.  */
4441               break;
4442             case CODING_ANNOTATE_CHARSET_MASK:
4443               preferred_charset_id = charbuf[2];
4444               if (preferred_charset_id >= 0
4445                   && NILP (Fmemq (make_number (preferred_charset_id),
4446                                   charset_list)))
4447                 preferred_charset_id = -1;
4448               break;
4449             default:
4450               emacs_abort ();
4451             }
4452           charbuf += -c - 1;
4453           continue;
4454         }
4455
4456       /* Now encode the character C.  */
4457       if (c < 0x20 || c == 0x7F)
4458         {
4459           if (c == '\n'
4460               || (c == '\r' && EQ (eol_type, Qmac)))
4461             {
4462               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4463                 ENCODE_RESET_PLANE_AND_REGISTER ();
4464               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4465                 {
4466                   int i;
4467
4468                   for (i = 0; i < 4; i++)
4469                     CODING_ISO_DESIGNATION (coding, i)
4470                       = CODING_ISO_INITIAL (coding, i);
4471                 }
4472               bol_designation = ((CODING_ISO_FLAGS (coding)
4473                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4474                                  != 0);
4475             }
4476           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4477             ENCODE_RESET_PLANE_AND_REGISTER ();
4478           EMIT_ONE_ASCII_BYTE (c);
4479         }
4480       else if (ASCII_CHAR_P (c))
4481         {
4482           if (ascii_compatible)
4483             EMIT_ONE_ASCII_BYTE (c);
4484           else
4485             {
4486               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4487               ENCODE_ISO_CHARACTER (charset, c);
4488             }
4489         }
4490       else if (CHAR_BYTE8_P (c))
4491         {
4492           c = CHAR_TO_BYTE8 (c);
4493           EMIT_ONE_BYTE (c);
4494         }
4495       else
4496         {
4497           struct charset *charset;
4498
4499           if (preferred_charset_id >= 0)
4500             {
4501               bool result;
4502
4503               charset = CHARSET_FROM_ID (preferred_charset_id);
4504               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4505               if (! result)
4506                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4507                                      NULL, charset);
4508             }
4509           else
4510             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4511                                  NULL, charset);
4512           if (!charset)
4513             {
4514               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4515                 {
4516                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4517                   charset = CHARSET_FROM_ID (charset_ascii);
4518                 }
4519               else
4520                 {
4521                   c = coding->default_char;
4522                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4523                                        charset_list, NULL, charset);
4524                 }
4525             }
4526           ENCODE_ISO_CHARACTER (charset, c);
4527         }
4528     }
4529
4530   if (coding->mode & CODING_MODE_LAST_BLOCK
4531       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4532     {
4533       ASSURE_DESTINATION (safe_room);
4534       ENCODE_RESET_PLANE_AND_REGISTER ();
4535     }
4536   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4537   CODING_ISO_BOL (coding) = bol_designation;
4538   coding->produced_char += produced_chars;
4539   coding->produced = dst - coding->destination;
4540   return 0;
4541 }
4542
4543 \f
4544 /*** 8,9. SJIS and BIG5 handlers ***/
4545
4546 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4547    quite widely.  So, for the moment, Emacs supports them in the bare
4548    C code.  But, in the future, they may be supported only by CCL.  */
4549
4550 /* SJIS is a coding system encoding three character sets: ASCII, right
4551    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4552    as is.  A character of charset katakana-jisx0201 is encoded by
4553    "position-code + 0x80".  A character of charset japanese-jisx0208
4554    is encoded in 2-byte but two position-codes are divided and shifted
4555    so that it fit in the range below.
4556
4557    --- CODE RANGE of SJIS ---
4558    (character set)      (range)
4559    ASCII                0x00 .. 0x7F
4560    KATAKANA-JISX0201    0xA0 .. 0xDF
4561    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4562             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4563    -------------------------------
4564
4565 */
4566
4567 /* BIG5 is a coding system encoding two character sets: ASCII and
4568    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4569    character set and is encoded in two-byte.
4570
4571    --- CODE RANGE of BIG5 ---
4572    (character set)      (range)
4573    ASCII                0x00 .. 0x7F
4574    Big5 (1st byte)      0xA1 .. 0xFE
4575         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4576    --------------------------
4577
4578   */
4579
4580 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4581    Return true if a text is encoded in SJIS.  */
4582
4583 static bool
4584 detect_coding_sjis (struct coding_system *coding,
4585                     struct coding_detection_info *detect_info)
4586 {
4587   const unsigned char *src = coding->source, *src_base;
4588   const unsigned char *src_end = coding->source + coding->src_bytes;
4589   bool multibytep = coding->src_multibyte;
4590   ptrdiff_t consumed_chars = 0;
4591   int found = 0;
4592   int c;
4593   Lisp_Object attrs, charset_list;
4594   int max_first_byte_of_2_byte_code;
4595
4596   CODING_GET_INFO (coding, attrs, charset_list);
4597   max_first_byte_of_2_byte_code
4598     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4599
4600   detect_info->checked |= CATEGORY_MASK_SJIS;
4601   /* A coding system of this category is always ASCII compatible.  */
4602   src += coding->head_ascii;
4603
4604   while (1)
4605     {
4606       src_base = src;
4607       ONE_MORE_BYTE (c);
4608       if (c < 0x80)
4609         continue;
4610       if ((c >= 0x81 && c <= 0x9F)
4611           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4612         {
4613           ONE_MORE_BYTE (c);
4614           if (c < 0x40 || c == 0x7F || c > 0xFC)
4615             break;
4616           found = CATEGORY_MASK_SJIS;
4617         }
4618       else if (c >= 0xA0 && c < 0xE0)
4619         found = CATEGORY_MASK_SJIS;
4620       else
4621         break;
4622     }
4623   detect_info->rejected |= CATEGORY_MASK_SJIS;
4624   return 0;
4625
4626  no_more_source:
4627   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4628     {
4629       detect_info->rejected |= CATEGORY_MASK_SJIS;
4630       return 0;
4631     }
4632   detect_info->found |= found;
4633   return 1;
4634 }
4635
4636 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4637    Return true if a text is encoded in BIG5.  */
4638
4639 static bool
4640 detect_coding_big5 (struct coding_system *coding,
4641                     struct coding_detection_info *detect_info)
4642 {
4643   const unsigned char *src = coding->source, *src_base;
4644   const unsigned char *src_end = coding->source + coding->src_bytes;
4645   bool multibytep = coding->src_multibyte;
4646   ptrdiff_t consumed_chars = 0;
4647   int found = 0;
4648   int c;
4649
4650   detect_info->checked |= CATEGORY_MASK_BIG5;
4651   /* A coding system of this category is always ASCII compatible.  */
4652   src += coding->head_ascii;
4653
4654   while (1)
4655     {
4656       src_base = src;
4657       ONE_MORE_BYTE (c);
4658       if (c < 0x80)
4659         continue;
4660       if (c >= 0xA1)
4661         {
4662           ONE_MORE_BYTE (c);
4663           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4664             return 0;
4665           found = CATEGORY_MASK_BIG5;
4666         }
4667       else
4668         break;
4669     }
4670   detect_info->rejected |= CATEGORY_MASK_BIG5;
4671   return 0;
4672
4673  no_more_source:
4674   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4675     {
4676       detect_info->rejected |= CATEGORY_MASK_BIG5;
4677       return 0;
4678     }
4679   detect_info->found |= found;
4680   return 1;
4681 }
4682
4683 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4684
4685 static void
4686 decode_coding_sjis (struct coding_system *coding)
4687 {
4688   const unsigned char *src = coding->source + coding->consumed;
4689   const unsigned char *src_end = coding->source + coding->src_bytes;
4690   const unsigned char *src_base;
4691   int *charbuf = coding->charbuf + coding->charbuf_used;
4692   /* We may produce one charset annotation in one loop and one more at
4693      the end.  */
4694   int *charbuf_end
4695     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4696   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4697   bool multibytep = coding->src_multibyte;
4698   struct charset *charset_roman, *charset_kanji, *charset_kana;
4699   struct charset *charset_kanji2;
4700   Lisp_Object attrs, charset_list, val;
4701   ptrdiff_t char_offset = coding->produced_char;
4702   ptrdiff_t last_offset = char_offset;
4703   int last_id = charset_ascii;
4704   bool eol_dos
4705     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4706   int byte_after_cr = -1;
4707
4708   CODING_GET_INFO (coding, attrs, charset_list);
4709
4710   val = charset_list;
4711   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4712   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4713   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4714   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4715
4716   while (1)
4717     {
4718       int c, c1;
4719       struct charset *charset;
4720
4721       src_base = src;
4722       consumed_chars_base = consumed_chars;
4723
4724       if (charbuf >= charbuf_end)
4725         {
4726           if (byte_after_cr >= 0)
4727             src_base--;
4728           break;
4729         }
4730
4731       if (byte_after_cr >= 0)
4732         c = byte_after_cr, byte_after_cr = -1;
4733       else
4734         ONE_MORE_BYTE (c);
4735       if (c < 0)
4736         goto invalid_code;
4737       if (c < 0x80)
4738         {
4739           if (eol_dos && c == '\r')
4740             ONE_MORE_BYTE (byte_after_cr);
4741           charset = charset_roman;
4742         }
4743       else if (c == 0x80 || c == 0xA0)
4744         goto invalid_code;
4745       else if (c >= 0xA1 && c <= 0xDF)
4746         {
4747           /* SJIS -> JISX0201-Kana */
4748           c &= 0x7F;
4749           charset = charset_kana;
4750         }
4751       else if (c <= 0xEF)
4752         {
4753           /* SJIS -> JISX0208 */
4754           ONE_MORE_BYTE (c1);
4755           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4756             goto invalid_code;
4757           c = (c << 8) | c1;
4758           SJIS_TO_JIS (c);
4759           charset = charset_kanji;
4760         }
4761       else if (c <= 0xFC && charset_kanji2)
4762         {
4763           /* SJIS -> JISX0213-2 */
4764           ONE_MORE_BYTE (c1);
4765           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4766             goto invalid_code;
4767           c = (c << 8) | c1;
4768           SJIS_TO_JIS2 (c);
4769           charset = charset_kanji2;
4770         }
4771       else
4772         goto invalid_code;
4773       if (charset->id != charset_ascii
4774           && last_id != charset->id)
4775         {
4776           if (last_id != charset_ascii)
4777             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4778           last_id = charset->id;
4779           last_offset = char_offset;
4780         }
4781       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4782       *charbuf++ = c;
4783       char_offset++;
4784       continue;
4785
4786     invalid_code:
4787       src = src_base;
4788       consumed_chars = consumed_chars_base;
4789       ONE_MORE_BYTE (c);
4790       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4791       char_offset++;
4792       coding->errors++;
4793     }
4794
4795  no_more_source:
4796   if (last_id != charset_ascii)
4797     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4798   coding->consumed_char += consumed_chars_base;
4799   coding->consumed = src_base - coding->source;
4800   coding->charbuf_used = charbuf - coding->charbuf;
4801 }
4802
4803 static void
4804 decode_coding_big5 (struct coding_system *coding)
4805 {
4806   const unsigned char *src = coding->source + coding->consumed;
4807   const unsigned char *src_end = coding->source + coding->src_bytes;
4808   const unsigned char *src_base;
4809   int *charbuf = coding->charbuf + coding->charbuf_used;
4810   /* We may produce one charset annotation in one loop and one more at
4811      the end.  */
4812   int *charbuf_end
4813     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4814   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4815   bool multibytep = coding->src_multibyte;
4816   struct charset *charset_roman, *charset_big5;
4817   Lisp_Object attrs, charset_list, val;
4818   ptrdiff_t char_offset = coding->produced_char;
4819   ptrdiff_t last_offset = char_offset;
4820   int last_id = charset_ascii;
4821   bool eol_dos
4822     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4823   int byte_after_cr = -1;
4824
4825   CODING_GET_INFO (coding, attrs, charset_list);
4826   val = charset_list;
4827   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4828   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4829
4830   while (1)
4831     {
4832       int c, c1;
4833       struct charset *charset;
4834
4835       src_base = src;
4836       consumed_chars_base = consumed_chars;
4837
4838       if (charbuf >= charbuf_end)
4839         {
4840           if (byte_after_cr >= 0)
4841             src_base--;
4842           break;
4843         }
4844
4845       if (byte_after_cr >= 0)
4846         c = byte_after_cr, byte_after_cr = -1;
4847       else
4848         ONE_MORE_BYTE (c);
4849
4850       if (c < 0)
4851         goto invalid_code;
4852       if (c < 0x80)
4853         {
4854           if (eol_dos && c == '\r')
4855             ONE_MORE_BYTE (byte_after_cr);
4856           charset = charset_roman;
4857         }
4858       else
4859         {
4860           /* BIG5 -> Big5 */
4861           if (c < 0xA1 || c > 0xFE)
4862             goto invalid_code;
4863           ONE_MORE_BYTE (c1);
4864           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4865             goto invalid_code;
4866           c = c << 8 | c1;
4867           charset = charset_big5;
4868         }
4869       if (charset->id != charset_ascii
4870           && last_id != charset->id)
4871         {
4872           if (last_id != charset_ascii)
4873             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4874           last_id = charset->id;
4875           last_offset = char_offset;
4876         }
4877       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4878       *charbuf++ = c;
4879       char_offset++;
4880       continue;
4881
4882     invalid_code:
4883       src = src_base;
4884       consumed_chars = consumed_chars_base;
4885       ONE_MORE_BYTE (c);
4886       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4887       char_offset++;
4888       coding->errors++;
4889     }
4890
4891  no_more_source:
4892   if (last_id != charset_ascii)
4893     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4894   coding->consumed_char += consumed_chars_base;
4895   coding->consumed = src_base - coding->source;
4896   coding->charbuf_used = charbuf - coding->charbuf;
4897 }
4898
4899 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4900    This function can encode charsets `ascii', `katakana-jisx0201',
4901    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4902    are sure that all these charsets are registered as official charset
4903    (i.e. do not have extended leading-codes).  Characters of other
4904    charsets are produced without any encoding.  */
4905
4906 static bool
4907 encode_coding_sjis (struct coding_system *coding)
4908 {
4909   bool multibytep = coding->dst_multibyte;
4910   int *charbuf = coding->charbuf;
4911   int *charbuf_end = charbuf + coding->charbuf_used;
4912   unsigned char *dst = coding->destination + coding->produced;
4913   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4914   int safe_room = 4;
4915   ptrdiff_t produced_chars = 0;
4916   Lisp_Object attrs, charset_list, val;
4917   bool ascii_compatible;
4918   struct charset *charset_kanji, *charset_kana;
4919   struct charset *charset_kanji2;
4920   int c;
4921
4922   CODING_GET_INFO (coding, attrs, charset_list);
4923   val = XCDR (charset_list);
4924   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4925   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4926   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4927
4928   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4929
4930   while (charbuf < charbuf_end)
4931     {
4932       ASSURE_DESTINATION (safe_room);
4933       c = *charbuf++;
4934       /* Now encode the character C.  */
4935       if (ASCII_CHAR_P (c) && ascii_compatible)
4936         EMIT_ONE_ASCII_BYTE (c);
4937       else if (CHAR_BYTE8_P (c))
4938         {
4939           c = CHAR_TO_BYTE8 (c);
4940           EMIT_ONE_BYTE (c);
4941         }
4942       else
4943         {
4944           unsigned code;
4945           struct charset *charset;
4946           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4947                                &code, charset);
4948
4949           if (!charset)
4950             {
4951               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4952                 {
4953                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4954                   charset = CHARSET_FROM_ID (charset_ascii);
4955                 }
4956               else
4957                 {
4958                   c = coding->default_char;
4959                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4960                                        charset_list, &code, charset);
4961                 }
4962             }
4963           if (code == CHARSET_INVALID_CODE (charset))
4964             emacs_abort ();
4965           if (charset == charset_kanji)
4966             {
4967               int c1, c2;
4968               JIS_TO_SJIS (code);
4969               c1 = code >> 8, c2 = code & 0xFF;
4970               EMIT_TWO_BYTES (c1, c2);
4971             }
4972           else if (charset == charset_kana)
4973             EMIT_ONE_BYTE (code | 0x80);
4974           else if (charset_kanji2 && charset == charset_kanji2)
4975             {
4976               int c1, c2;
4977
4978               c1 = code >> 8;
4979               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4980                   || c1 == 0x28
4981                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4982                 {
4983                   JIS_TO_SJIS2 (code);
4984                   c1 = code >> 8, c2 = code & 0xFF;
4985                   EMIT_TWO_BYTES (c1, c2);
4986                 }
4987               else
4988                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4989             }
4990           else
4991             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4992         }
4993     }
4994   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4995   coding->produced_char += produced_chars;
4996   coding->produced = dst - coding->destination;
4997   return 0;
4998 }
4999
5000 static bool
5001 encode_coding_big5 (struct coding_system *coding)
5002 {
5003   bool multibytep = coding->dst_multibyte;
5004   int *charbuf = coding->charbuf;
5005   int *charbuf_end = charbuf + coding->charbuf_used;
5006   unsigned char *dst = coding->destination + coding->produced;
5007   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5008   int safe_room = 4;
5009   ptrdiff_t produced_chars = 0;
5010   Lisp_Object attrs, charset_list, val;
5011   bool ascii_compatible;
5012   struct charset *charset_big5;
5013   int c;
5014
5015   CODING_GET_INFO (coding, attrs, charset_list);
5016   val = XCDR (charset_list);
5017   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5018   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5019
5020   while (charbuf < charbuf_end)
5021     {
5022       ASSURE_DESTINATION (safe_room);
5023       c = *charbuf++;
5024       /* Now encode the character C.  */
5025       if (ASCII_CHAR_P (c) && ascii_compatible)
5026         EMIT_ONE_ASCII_BYTE (c);
5027       else if (CHAR_BYTE8_P (c))
5028         {
5029           c = CHAR_TO_BYTE8 (c);
5030           EMIT_ONE_BYTE (c);
5031         }
5032       else
5033         {
5034           unsigned code;
5035           struct charset *charset;
5036           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5037                                &code, charset);
5038
5039           if (! charset)
5040             {
5041               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5042                 {
5043                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5044                   charset = CHARSET_FROM_ID (charset_ascii);
5045                 }
5046               else
5047                 {
5048                   c = coding->default_char;
5049                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5050                                        charset_list, &code, charset);
5051                 }
5052             }
5053           if (code == CHARSET_INVALID_CODE (charset))
5054             emacs_abort ();
5055           if (charset == charset_big5)
5056             {
5057               int c1, c2;
5058
5059               c1 = code >> 8, c2 = code & 0xFF;
5060               EMIT_TWO_BYTES (c1, c2);
5061             }
5062           else
5063             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5064         }
5065     }
5066   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5067   coding->produced_char += produced_chars;
5068   coding->produced = dst - coding->destination;
5069   return 0;
5070 }
5071
5072 \f
5073 /*** 10. CCL handlers ***/
5074
5075 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5076    Return true if a text is encoded in a coding system of which
5077    encoder/decoder are written in CCL program.  */
5078
5079 static bool
5080 detect_coding_ccl (struct coding_system *coding,
5081                    struct coding_detection_info *detect_info)
5082 {
5083   const unsigned char *src = coding->source, *src_base;
5084   const unsigned char *src_end = coding->source + coding->src_bytes;
5085   bool multibytep = coding->src_multibyte;
5086   ptrdiff_t consumed_chars = 0;
5087   int found = 0;
5088   unsigned char *valids;
5089   ptrdiff_t head_ascii = coding->head_ascii;
5090   Lisp_Object attrs;
5091
5092   detect_info->checked |= CATEGORY_MASK_CCL;
5093
5094   coding = &coding_categories[coding_category_ccl];
5095   valids = CODING_CCL_VALIDS (coding);
5096   attrs = CODING_ID_ATTRS (coding->id);
5097   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5098     src += head_ascii;
5099
5100   while (1)
5101     {
5102       int c;
5103
5104       src_base = src;
5105       ONE_MORE_BYTE (c);
5106       if (c < 0 || ! valids[c])
5107         break;
5108       if ((valids[c] > 1))
5109         found = CATEGORY_MASK_CCL;
5110     }
5111   detect_info->rejected |= CATEGORY_MASK_CCL;
5112   return 0;
5113
5114  no_more_source:
5115   detect_info->found |= found;
5116   return 1;
5117 }
5118
5119 static void
5120 decode_coding_ccl (struct coding_system *coding)
5121 {
5122   const unsigned char *src = coding->source + coding->consumed;
5123   const unsigned char *src_end = coding->source + coding->src_bytes;
5124   int *charbuf = coding->charbuf + coding->charbuf_used;
5125   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5126   ptrdiff_t consumed_chars = 0;
5127   bool multibytep = coding->src_multibyte;
5128   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5129   int source_charbuf[1024];
5130   int source_byteidx[1025];
5131   Lisp_Object attrs, charset_list;
5132
5133   CODING_GET_INFO (coding, attrs, charset_list);
5134
5135   while (1)
5136     {
5137       const unsigned char *p = src;
5138       ptrdiff_t offset;
5139       int i = 0;
5140
5141       if (multibytep)
5142         {
5143           while (i < 1024 && p < src_end)
5144             {
5145               source_byteidx[i] = p - src;
5146               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5147             }
5148           source_byteidx[i] = p - src;
5149         }
5150       else
5151         while (i < 1024 && p < src_end)
5152           source_charbuf[i++] = *p++;
5153
5154       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5155         ccl->last_block = 1;
5156       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5157       charset_map_loaded = 0;
5158       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5159                   charset_list);
5160       if (charset_map_loaded
5161           && (offset = coding_change_source (coding)))
5162         {
5163           p += offset;
5164           src += offset;
5165           src_end += offset;
5166         }
5167       charbuf += ccl->produced;
5168       if (multibytep)
5169         src += source_byteidx[ccl->consumed];
5170       else
5171         src += ccl->consumed;
5172       consumed_chars += ccl->consumed;
5173       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5174         break;
5175     }
5176
5177   switch (ccl->status)
5178     {
5179     case CCL_STAT_SUSPEND_BY_SRC:
5180       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5181       break;
5182     case CCL_STAT_SUSPEND_BY_DST:
5183       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5184       break;
5185     case CCL_STAT_QUIT:
5186     case CCL_STAT_INVALID_CMD:
5187       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5188       break;
5189     default:
5190       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5191       break;
5192     }
5193   coding->consumed_char += consumed_chars;
5194   coding->consumed = src - coding->source;
5195   coding->charbuf_used = charbuf - coding->charbuf;
5196 }
5197
5198 static bool
5199 encode_coding_ccl (struct coding_system *coding)
5200 {
5201   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5202   bool multibytep = coding->dst_multibyte;
5203   int *charbuf = coding->charbuf;
5204   int *charbuf_end = charbuf + coding->charbuf_used;
5205   unsigned char *dst = coding->destination + coding->produced;
5206   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5207   int destination_charbuf[1024];
5208   ptrdiff_t produced_chars = 0;
5209   int i;
5210   Lisp_Object attrs, charset_list;
5211
5212   CODING_GET_INFO (coding, attrs, charset_list);
5213   if (coding->consumed_char == coding->src_chars
5214       && coding->mode & CODING_MODE_LAST_BLOCK)
5215     ccl->last_block = 1;
5216
5217   do
5218     {
5219       ptrdiff_t offset;
5220
5221       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5222       charset_map_loaded = 0;
5223       ccl_driver (ccl, charbuf, destination_charbuf,
5224                   charbuf_end - charbuf, 1024, charset_list);
5225       if (charset_map_loaded
5226           && (offset = coding_change_destination (coding)))
5227         dst += offset;
5228       if (multibytep)
5229         {
5230           ASSURE_DESTINATION (ccl->produced * 2);
5231           for (i = 0; i < ccl->produced; i++)
5232             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5233         }
5234       else
5235         {
5236           ASSURE_DESTINATION (ccl->produced);
5237           for (i = 0; i < ccl->produced; i++)
5238             *dst++ = destination_charbuf[i] & 0xFF;
5239           produced_chars += ccl->produced;
5240         }
5241       charbuf += ccl->consumed;
5242       if (ccl->status == CCL_STAT_QUIT
5243           || ccl->status == CCL_STAT_INVALID_CMD)
5244         break;
5245     }
5246   while (charbuf < charbuf_end);
5247
5248   switch (ccl->status)
5249     {
5250     case CCL_STAT_SUSPEND_BY_SRC:
5251       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5252       break;
5253     case CCL_STAT_SUSPEND_BY_DST:
5254       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5255       break;
5256     case CCL_STAT_QUIT:
5257     case CCL_STAT_INVALID_CMD:
5258       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5259       break;
5260     default:
5261       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5262       break;
5263     }
5264
5265   coding->produced_char += produced_chars;
5266   coding->produced = dst - coding->destination;
5267   return 0;
5268 }
5269
5270 \f
5271 /*** 10, 11. no-conversion handlers ***/
5272
5273 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5274
5275 static void
5276 decode_coding_raw_text (struct coding_system *coding)
5277 {
5278   bool eol_dos
5279     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5280
5281   coding->chars_at_source = 1;
5282   coding->consumed_char = coding->src_chars;
5283   coding->consumed = coding->src_bytes;
5284   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5285     {
5286       coding->consumed_char--;
5287       coding->consumed--;
5288       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5289     }
5290   else
5291     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5292 }
5293
5294 static bool
5295 encode_coding_raw_text (struct coding_system *coding)
5296 {
5297   bool multibytep = coding->dst_multibyte;
5298   int *charbuf = coding->charbuf;
5299   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5300   unsigned char *dst = coding->destination + coding->produced;
5301   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5302   ptrdiff_t produced_chars = 0;
5303   int c;
5304
5305   if (multibytep)
5306     {
5307       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5308
5309       if (coding->src_multibyte)
5310         while (charbuf < charbuf_end)
5311           {
5312             ASSURE_DESTINATION (safe_room);
5313             c = *charbuf++;
5314             if (ASCII_CHAR_P (c))
5315               EMIT_ONE_ASCII_BYTE (c);
5316             else if (CHAR_BYTE8_P (c))
5317               {
5318                 c = CHAR_TO_BYTE8 (c);
5319                 EMIT_ONE_BYTE (c);
5320               }
5321             else
5322               {
5323                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5324
5325                 CHAR_STRING_ADVANCE (c, p1);
5326                 do
5327                   {
5328                     EMIT_ONE_BYTE (*p0);
5329                     p0++;
5330                   }
5331                 while (p0 < p1);
5332               }
5333           }
5334       else
5335         while (charbuf < charbuf_end)
5336           {
5337             ASSURE_DESTINATION (safe_room);
5338             c = *charbuf++;
5339             EMIT_ONE_BYTE (c);
5340           }
5341     }
5342   else
5343     {
5344       if (coding->src_multibyte)
5345         {
5346           int safe_room = MAX_MULTIBYTE_LENGTH;
5347
5348           while (charbuf < charbuf_end)
5349             {
5350               ASSURE_DESTINATION (safe_room);
5351               c = *charbuf++;
5352               if (ASCII_CHAR_P (c))
5353                 *dst++ = c;
5354               else if (CHAR_BYTE8_P (c))
5355                 *dst++ = CHAR_TO_BYTE8 (c);
5356               else
5357                 CHAR_STRING_ADVANCE (c, dst);
5358             }
5359         }
5360       else
5361         {
5362           ASSURE_DESTINATION (charbuf_end - charbuf);
5363           while (charbuf < charbuf_end && dst < dst_end)
5364             *dst++ = *charbuf++;
5365         }
5366       produced_chars = dst - (coding->destination + coding->produced);
5367     }
5368   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5369   coding->produced_char += produced_chars;
5370   coding->produced = dst - coding->destination;
5371   return 0;
5372 }
5373
5374 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5375    Return true if a text is encoded in a charset-based coding system.  */
5376
5377 static bool
5378 detect_coding_charset (struct coding_system *coding,
5379                        struct coding_detection_info *detect_info)
5380 {
5381   const unsigned char *src = coding->source, *src_base;
5382   const unsigned char *src_end = coding->source + coding->src_bytes;
5383   bool multibytep = coding->src_multibyte;
5384   ptrdiff_t consumed_chars = 0;
5385   Lisp_Object attrs, valids, name;
5386   int found = 0;
5387   ptrdiff_t head_ascii = coding->head_ascii;
5388   bool check_latin_extra = 0;
5389
5390   detect_info->checked |= CATEGORY_MASK_CHARSET;
5391
5392   coding = &coding_categories[coding_category_charset];
5393   attrs = CODING_ID_ATTRS (coding->id);
5394   valids = AREF (attrs, coding_attr_charset_valids);
5395   name = CODING_ID_NAME (coding->id);
5396   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5397                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5398       || strncmp (SSDATA (SYMBOL_NAME (name)),
5399                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5400     check_latin_extra = 1;
5401
5402   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5403     src += head_ascii;
5404
5405   while (1)
5406     {
5407       int c;
5408       Lisp_Object val;
5409       struct charset *charset;
5410       int dim, idx;
5411
5412       src_base = src;
5413       ONE_MORE_BYTE (c);
5414       if (c < 0)
5415         continue;
5416       val = AREF (valids, c);
5417       if (NILP (val))
5418         break;
5419       if (c >= 0x80)
5420         {
5421           if (c < 0xA0
5422               && check_latin_extra
5423               && (!VECTORP (Vlatin_extra_code_table)
5424                   || NILP (AREF (Vlatin_extra_code_table, c))))
5425             break;
5426           found = CATEGORY_MASK_CHARSET;
5427         }
5428       if (INTEGERP (val))
5429         {
5430           charset = CHARSET_FROM_ID (XFASTINT (val));
5431           dim = CHARSET_DIMENSION (charset);
5432           for (idx = 1; idx < dim; idx++)
5433             {
5434               if (src == src_end)
5435                 goto too_short;
5436               ONE_MORE_BYTE (c);
5437               if (c < charset->code_space[(dim - 1 - idx) * 4]
5438                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5439                 break;
5440             }
5441           if (idx < dim)
5442             break;
5443         }
5444       else
5445         {
5446           idx = 1;
5447           for (; CONSP (val); val = XCDR (val))
5448             {
5449               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5450               dim = CHARSET_DIMENSION (charset);
5451               while (idx < dim)
5452                 {
5453                   if (src == src_end)
5454                     goto too_short;
5455                   ONE_MORE_BYTE (c);
5456                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5457                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5458                     break;
5459                   idx++;
5460                 }
5461               if (idx == dim)
5462                 {
5463                   val = Qnil;
5464                   break;
5465                 }
5466             }
5467           if (CONSP (val))
5468             break;
5469         }
5470     }
5471  too_short:
5472   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5473   return 0;
5474
5475  no_more_source:
5476   detect_info->found |= found;
5477   return 1;
5478 }
5479
5480 static void
5481 decode_coding_charset (struct coding_system *coding)
5482 {
5483   const unsigned char *src = coding->source + coding->consumed;
5484   const unsigned char *src_end = coding->source + coding->src_bytes;
5485   const unsigned char *src_base;
5486   int *charbuf = coding->charbuf + coding->charbuf_used;
5487   /* We may produce one charset annotation in one loop and one more at
5488      the end.  */
5489   int *charbuf_end
5490     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5491   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5492   bool multibytep = coding->src_multibyte;
5493   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5494   Lisp_Object valids;
5495   ptrdiff_t char_offset = coding->produced_char;
5496   ptrdiff_t last_offset = char_offset;
5497   int last_id = charset_ascii;
5498   bool eol_dos
5499     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5500   int byte_after_cr = -1;
5501
5502   valids = AREF (attrs, coding_attr_charset_valids);
5503
5504   while (1)
5505     {
5506       int c;
5507       Lisp_Object val;
5508       struct charset *charset;
5509       int dim;
5510       int len = 1;
5511       unsigned code;
5512
5513       src_base = src;
5514       consumed_chars_base = consumed_chars;
5515
5516       if (charbuf >= charbuf_end)
5517         {
5518           if (byte_after_cr >= 0)
5519             src_base--;
5520           break;
5521         }
5522
5523       if (byte_after_cr >= 0)
5524         {
5525           c = byte_after_cr;
5526           byte_after_cr = -1;
5527         }
5528       else
5529         {
5530           ONE_MORE_BYTE (c);
5531           if (eol_dos && c == '\r')
5532             ONE_MORE_BYTE (byte_after_cr);
5533         }
5534       if (c < 0)
5535         goto invalid_code;
5536       code = c;
5537
5538       val = AREF (valids, c);
5539       if (! INTEGERP (val) && ! CONSP (val))
5540         goto invalid_code;
5541       if (INTEGERP (val))
5542         {
5543           charset = CHARSET_FROM_ID (XFASTINT (val));
5544           dim = CHARSET_DIMENSION (charset);
5545           while (len < dim)
5546             {
5547               ONE_MORE_BYTE (c);
5548               code = (code << 8) | c;
5549               len++;
5550             }
5551           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5552                               charset, code, c);
5553         }
5554       else
5555         {
5556           /* VAL is a list of charset IDs.  It is assured that the
5557              list is sorted by charset dimensions (smaller one
5558              comes first).  */
5559           while (CONSP (val))
5560             {
5561               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5562               dim = CHARSET_DIMENSION (charset);
5563               while (len < dim)
5564                 {
5565                   ONE_MORE_BYTE (c);
5566                   code = (code << 8) | c;
5567                   len++;
5568                 }
5569               CODING_DECODE_CHAR (coding, src, src_base,
5570                                   src_end, charset, code, c);
5571               if (c >= 0)
5572                 break;
5573               val = XCDR (val);
5574             }
5575         }
5576       if (c < 0)
5577         goto invalid_code;
5578       if (charset->id != charset_ascii
5579           && last_id != charset->id)
5580         {
5581           if (last_id != charset_ascii)
5582             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5583           last_id = charset->id;
5584           last_offset = char_offset;
5585         }
5586
5587       *charbuf++ = c;
5588       char_offset++;
5589       continue;
5590
5591     invalid_code:
5592       src = src_base;
5593       consumed_chars = consumed_chars_base;
5594       ONE_MORE_BYTE (c);
5595       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5596       char_offset++;
5597       coding->errors++;
5598     }
5599
5600  no_more_source:
5601   if (last_id != charset_ascii)
5602     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5603   coding->consumed_char += consumed_chars_base;
5604   coding->consumed = src_base - coding->source;
5605   coding->charbuf_used = charbuf - coding->charbuf;
5606 }
5607
5608 static bool
5609 encode_coding_charset (struct coding_system *coding)
5610 {
5611   bool multibytep = coding->dst_multibyte;
5612   int *charbuf = coding->charbuf;
5613   int *charbuf_end = charbuf + coding->charbuf_used;
5614   unsigned char *dst = coding->destination + coding->produced;
5615   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5616   int safe_room = MAX_MULTIBYTE_LENGTH;
5617   ptrdiff_t produced_chars = 0;
5618   Lisp_Object attrs, charset_list;
5619   bool ascii_compatible;
5620   int c;
5621
5622   CODING_GET_INFO (coding, attrs, charset_list);
5623   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5624
5625   while (charbuf < charbuf_end)
5626     {
5627       struct charset *charset;
5628       unsigned code;
5629
5630       ASSURE_DESTINATION (safe_room);
5631       c = *charbuf++;
5632       if (ascii_compatible && ASCII_CHAR_P (c))
5633         EMIT_ONE_ASCII_BYTE (c);
5634       else if (CHAR_BYTE8_P (c))
5635         {
5636           c = CHAR_TO_BYTE8 (c);
5637           EMIT_ONE_BYTE (c);
5638         }
5639       else
5640         {
5641           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5642                                &code, charset);
5643
5644           if (charset)
5645             {
5646               if (CHARSET_DIMENSION (charset) == 1)
5647                 EMIT_ONE_BYTE (code);
5648               else if (CHARSET_DIMENSION (charset) == 2)
5649                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5650               else if (CHARSET_DIMENSION (charset) == 3)
5651                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5652               else
5653                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5654                                  (code >> 8) & 0xFF, code & 0xFF);
5655             }
5656           else
5657             {
5658               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5659                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5660               else
5661                 c = coding->default_char;
5662               EMIT_ONE_BYTE (c);
5663             }
5664         }
5665     }
5666
5667   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5668   coding->produced_char += produced_chars;
5669   coding->produced = dst - coding->destination;
5670   return 0;
5671 }
5672
5673 \f
5674 /*** 7. C library functions ***/
5675
5676 /* Setup coding context CODING from information about CODING_SYSTEM.
5677    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5678    CODING_SYSTEM is invalid, signal an error.  */
5679
5680 void
5681 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5682 {
5683   Lisp_Object attrs;
5684   Lisp_Object eol_type;
5685   Lisp_Object coding_type;
5686   Lisp_Object val;
5687
5688   if (NILP (coding_system))
5689     coding_system = Qundecided;
5690
5691   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5692
5693   attrs = CODING_ID_ATTRS (coding->id);
5694   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5695
5696   coding->mode = 0;
5697   if (VECTORP (eol_type))
5698     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5699                             | CODING_REQUIRE_DETECTION_MASK);
5700   else if (! EQ (eol_type, Qunix))
5701     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5702                             | CODING_REQUIRE_ENCODING_MASK);
5703   else
5704     coding->common_flags = 0;
5705   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5706     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5707   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5708     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5709   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5710     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5711
5712   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5713   coding->max_charset_id = SCHARS (val) - 1;
5714   coding->safe_charsets = SDATA (val);
5715   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5716   coding->carryover_bytes = 0;
5717
5718   coding_type = CODING_ATTR_TYPE (attrs);
5719   if (EQ (coding_type, Qundecided))
5720     {
5721       coding->detector = NULL;
5722       coding->decoder = decode_coding_raw_text;
5723       coding->encoder = encode_coding_raw_text;
5724       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5725       coding->spec.undecided.inhibit_nbd
5726         = (encode_inhibit_flag
5727            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5728       coding->spec.undecided.inhibit_ied
5729         = (encode_inhibit_flag
5730            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5731       coding->spec.undecided.prefer_utf_8
5732         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5733     }
5734   else if (EQ (coding_type, Qiso_2022))
5735     {
5736       int i;
5737       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5738
5739       /* Invoke graphic register 0 to plane 0.  */
5740       CODING_ISO_INVOCATION (coding, 0) = 0;
5741       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5742       CODING_ISO_INVOCATION (coding, 1)
5743         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5744       /* Setup the initial status of designation.  */
5745       for (i = 0; i < 4; i++)
5746         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5747       /* Not single shifting initially.  */
5748       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5749       /* Beginning of buffer should also be regarded as bol. */
5750       CODING_ISO_BOL (coding) = 1;
5751       coding->detector = detect_coding_iso_2022;
5752       coding->decoder = decode_coding_iso_2022;
5753       coding->encoder = encode_coding_iso_2022;
5754       if (flags & CODING_ISO_FLAG_SAFE)
5755         coding->mode |= CODING_MODE_SAFE_ENCODING;
5756       coding->common_flags
5757         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5758             | CODING_REQUIRE_FLUSHING_MASK);
5759       if (flags & CODING_ISO_FLAG_COMPOSITION)
5760         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5761       if (flags & CODING_ISO_FLAG_DESIGNATION)
5762         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5763       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5764         {
5765           setup_iso_safe_charsets (attrs);
5766           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5767           coding->max_charset_id = SCHARS (val) - 1;
5768           coding->safe_charsets = SDATA (val);
5769         }
5770       CODING_ISO_FLAGS (coding) = flags;
5771       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5772       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5773       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5774       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5775     }
5776   else if (EQ (coding_type, Qcharset))
5777     {
5778       coding->detector = detect_coding_charset;
5779       coding->decoder = decode_coding_charset;
5780       coding->encoder = encode_coding_charset;
5781       coding->common_flags
5782         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5783     }
5784   else if (EQ (coding_type, Qutf_8))
5785     {
5786       val = AREF (attrs, coding_attr_utf_bom);
5787       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5788                                    : EQ (val, Qt) ? utf_with_bom
5789                                    : utf_without_bom);
5790       coding->detector = detect_coding_utf_8;
5791       coding->decoder = decode_coding_utf_8;
5792       coding->encoder = encode_coding_utf_8;
5793       coding->common_flags
5794         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5795       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5796         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5797     }
5798   else if (EQ (coding_type, Qutf_16))
5799     {
5800       val = AREF (attrs, coding_attr_utf_bom);
5801       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5802                                     : EQ (val, Qt) ? utf_with_bom
5803                                     : utf_without_bom);
5804       val = AREF (attrs, coding_attr_utf_16_endian);
5805       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5806                                        : utf_16_little_endian);
5807       CODING_UTF_16_SURROGATE (coding) = 0;
5808       coding->detector = detect_coding_utf_16;
5809       coding->decoder = decode_coding_utf_16;
5810       coding->encoder = encode_coding_utf_16;
5811       coding->common_flags
5812         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5813       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5814         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5815     }
5816   else if (EQ (coding_type, Qccl))
5817     {
5818       coding->detector = detect_coding_ccl;
5819       coding->decoder = decode_coding_ccl;
5820       coding->encoder = encode_coding_ccl;
5821       coding->common_flags
5822         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5823             | CODING_REQUIRE_FLUSHING_MASK);
5824     }
5825   else if (EQ (coding_type, Qemacs_mule))
5826     {
5827       coding->detector = detect_coding_emacs_mule;
5828       coding->decoder = decode_coding_emacs_mule;
5829       coding->encoder = encode_coding_emacs_mule;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5833           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5834         {
5835           Lisp_Object tail, safe_charsets;
5836           int max_charset_id = 0;
5837
5838           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5839                tail = XCDR (tail))
5840             if (max_charset_id < XFASTINT (XCAR (tail)))
5841               max_charset_id = XFASTINT (XCAR (tail));
5842           safe_charsets = make_uninit_string (max_charset_id + 1);
5843           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5844           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5845                tail = XCDR (tail))
5846             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5847           coding->max_charset_id = max_charset_id;
5848           coding->safe_charsets = SDATA (safe_charsets);
5849         }
5850       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5851       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5852     }
5853   else if (EQ (coding_type, Qshift_jis))
5854     {
5855       coding->detector = detect_coding_sjis;
5856       coding->decoder = decode_coding_sjis;
5857       coding->encoder = encode_coding_sjis;
5858       coding->common_flags
5859         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5860     }
5861   else if (EQ (coding_type, Qbig5))
5862     {
5863       coding->detector = detect_coding_big5;
5864       coding->decoder = decode_coding_big5;
5865       coding->encoder = encode_coding_big5;
5866       coding->common_flags
5867         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5868     }
5869   else                          /* EQ (coding_type, Qraw_text) */
5870     {
5871       coding->detector = NULL;
5872       coding->decoder = decode_coding_raw_text;
5873       coding->encoder = encode_coding_raw_text;
5874       if (! EQ (eol_type, Qunix))
5875         {
5876           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5877           if (! VECTORP (eol_type))
5878             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5879         }
5880
5881     }
5882
5883   return;
5884 }
5885
5886 /* Return a list of charsets supported by CODING.  */
5887
5888 Lisp_Object
5889 coding_charset_list (struct coding_system *coding)
5890 {
5891   Lisp_Object attrs, charset_list;
5892
5893   CODING_GET_INFO (coding, attrs, charset_list);
5894   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5895     {
5896       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5897
5898       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5899         charset_list = Viso_2022_charset_list;
5900     }
5901   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5902     {
5903       charset_list = Vemacs_mule_charset_list;
5904     }
5905   return charset_list;
5906 }
5907
5908
5909 /* Return a list of charsets supported by CODING-SYSTEM.  */
5910
5911 Lisp_Object
5912 coding_system_charset_list (Lisp_Object coding_system)
5913 {
5914   ptrdiff_t id;
5915   Lisp_Object attrs, charset_list;
5916
5917   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5918   attrs = CODING_ID_ATTRS (id);
5919
5920   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5921     {
5922       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5923
5924       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5925         charset_list = Viso_2022_charset_list;
5926       else
5927         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5928     }
5929   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5930     {
5931       charset_list = Vemacs_mule_charset_list;
5932     }
5933   else
5934     {
5935       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5936     }
5937   return charset_list;
5938 }
5939
5940
5941 /* Return raw-text or one of its subsidiaries that has the same
5942    eol_type as CODING-SYSTEM.  */
5943
5944 Lisp_Object
5945 raw_text_coding_system (Lisp_Object coding_system)
5946 {
5947   Lisp_Object spec, attrs;
5948   Lisp_Object eol_type, raw_text_eol_type;
5949
5950   if (NILP (coding_system))
5951     return Qraw_text;
5952   spec = CODING_SYSTEM_SPEC (coding_system);
5953   attrs = AREF (spec, 0);
5954
5955   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5956     return coding_system;
5957
5958   eol_type = AREF (spec, 2);
5959   if (VECTORP (eol_type))
5960     return Qraw_text;
5961   spec = CODING_SYSTEM_SPEC (Qraw_text);
5962   raw_text_eol_type = AREF (spec, 2);
5963   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5964           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5965           : AREF (raw_text_eol_type, 2));
5966 }
5967
5968
5969 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5970    the subsidiary that has the same eol-spec as PARENT (if it is not
5971    nil and specifies end-of-line format) or the system's setting
5972    (system_eol_type).  */
5973
5974 Lisp_Object
5975 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5976 {
5977   Lisp_Object spec, eol_type;
5978
5979   if (NILP (coding_system))
5980     coding_system = Qraw_text;
5981   spec = CODING_SYSTEM_SPEC (coding_system);
5982   eol_type = AREF (spec, 2);
5983   if (VECTORP (eol_type))
5984     {
5985       Lisp_Object parent_eol_type;
5986
5987       if (! NILP (parent))
5988         {
5989           Lisp_Object parent_spec;
5990
5991           parent_spec = CODING_SYSTEM_SPEC (parent);
5992           parent_eol_type = AREF (parent_spec, 2);
5993           if (VECTORP (parent_eol_type))
5994             parent_eol_type = system_eol_type;
5995         }
5996       else
5997         parent_eol_type = system_eol_type;
5998       if (EQ (parent_eol_type, Qunix))
5999         coding_system = AREF (eol_type, 0);
6000       else if (EQ (parent_eol_type, Qdos))
6001         coding_system = AREF (eol_type, 1);
6002       else if (EQ (parent_eol_type, Qmac))
6003         coding_system = AREF (eol_type, 2);
6004     }
6005   return coding_system;
6006 }
6007
6008
6009 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6010    decided for writing to a process.  If not, complement them, and
6011    return a new coding system.  */
6012
6013 Lisp_Object
6014 complement_process_encoding_system (Lisp_Object coding_system)
6015 {
6016   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6017   Lisp_Object spec, attrs;
6018   int i;
6019
6020   for (i = 0; i < 3; i++)
6021     {
6022       if (i == 1)
6023         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6024       else if (i == 2)
6025         coding_system = preferred_coding_system ();
6026       spec = CODING_SYSTEM_SPEC (coding_system);
6027       if (NILP (spec))
6028         continue;
6029       attrs = AREF (spec, 0);
6030       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6031         coding_base = CODING_ATTR_BASE_NAME (attrs);
6032       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6033         eol_base = coding_system;
6034       if (! NILP (coding_base) && ! NILP (eol_base))
6035         break;
6036     }
6037
6038   if (i > 0)
6039     /* The original CODING_SYSTEM didn't specify text-conversion or
6040        eol-conversion.  Be sure that we return a fully complemented
6041        coding system.  */
6042     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6043   return coding_system;
6044 }
6045
6046
6047 /* Emacs has a mechanism to automatically detect a coding system if it
6048    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6049    it's impossible to distinguish some coding systems accurately
6050    because they use the same range of codes.  So, at first, coding
6051    systems are categorized into 7, those are:
6052
6053    o coding-category-emacs-mule
6054
6055         The category for a coding system which has the same code range
6056         as Emacs' internal format.  Assigned the coding-system (Lisp
6057         symbol) `emacs-mule' by default.
6058
6059    o coding-category-sjis
6060
6061         The category for a coding system which has the same code range
6062         as SJIS.  Assigned the coding-system (Lisp
6063         symbol) `japanese-shift-jis' by default.
6064
6065    o coding-category-iso-7
6066
6067         The category for a coding system which has the same code range
6068         as ISO2022 of 7-bit environment.  This doesn't use any locking
6069         shift and single shift functions.  This can encode/decode all
6070         charsets.  Assigned the coding-system (Lisp symbol)
6071         `iso-2022-7bit' by default.
6072
6073    o coding-category-iso-7-tight
6074
6075         Same as coding-category-iso-7 except that this can
6076         encode/decode only the specified charsets.
6077
6078    o coding-category-iso-8-1
6079
6080         The category for a coding system which has the same code range
6081         as ISO2022 of 8-bit environment and graphic plane 1 used only
6082         for DIMENSION1 charset.  This doesn't use any locking shift
6083         and single shift functions.  Assigned the coding-system (Lisp
6084         symbol) `iso-latin-1' by default.
6085
6086    o coding-category-iso-8-2
6087
6088         The category for a coding system which has the same code range
6089         as ISO2022 of 8-bit environment and graphic plane 1 used only
6090         for DIMENSION2 charset.  This doesn't use any locking shift
6091         and single shift functions.  Assigned the coding-system (Lisp
6092         symbol) `japanese-iso-8bit' by default.
6093
6094    o coding-category-iso-7-else
6095
6096         The category for a coding system which has the same code range
6097         as ISO2022 of 7-bit environment but uses locking shift or
6098         single shift functions.  Assigned the coding-system (Lisp
6099         symbol) `iso-2022-7bit-lock' by default.
6100
6101    o coding-category-iso-8-else
6102
6103         The category for a coding system which has the same code range
6104         as ISO2022 of 8-bit environment but uses locking shift or
6105         single shift functions.  Assigned the coding-system (Lisp
6106         symbol) `iso-2022-8bit-ss2' by default.
6107
6108    o coding-category-big5
6109
6110         The category for a coding system which has the same code range
6111         as BIG5.  Assigned the coding-system (Lisp symbol)
6112         `cn-big5' by default.
6113
6114    o coding-category-utf-8
6115
6116         The category for a coding system which has the same code range
6117         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6118         symbol) `utf-8' by default.
6119
6120    o coding-category-utf-16-be
6121
6122         The category for a coding system in which a text has an
6123         Unicode signature (cf. Unicode Standard) in the order of BIG
6124         endian at the head.  Assigned the coding-system (Lisp symbol)
6125         `utf-16-be' by default.
6126
6127    o coding-category-utf-16-le
6128
6129         The category for a coding system in which a text has an
6130         Unicode signature (cf. Unicode Standard) in the order of
6131         LITTLE endian at the head.  Assigned the coding-system (Lisp
6132         symbol) `utf-16-le' by default.
6133
6134    o coding-category-ccl
6135
6136         The category for a coding system of which encoder/decoder is
6137         written in CCL programs.  The default value is nil, i.e., no
6138         coding system is assigned.
6139
6140    o coding-category-binary
6141
6142         The category for a coding system not categorized in any of the
6143         above.  Assigned the coding-system (Lisp symbol)
6144         `no-conversion' by default.
6145
6146    Each of them is a Lisp symbol and the value is an actual
6147    `coding-system's (this is also a Lisp symbol) assigned by a user.
6148    What Emacs does actually is to detect a category of coding system.
6149    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6150    decide only one possible category, it selects a category of the
6151    highest priority.  Priorities of categories are also specified by a
6152    user in a Lisp variable `coding-category-list'.
6153
6154 */
6155
6156 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6157                                            int eol_seen);
6158
6159
6160 /* Return the number of ASCII characters at the head of the source.
6161    By side effects, set coding->head_ascii and update
6162    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6163    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6164    reliable only when all the source bytes are ASCII.  */
6165
6166 static int
6167 check_ascii (struct coding_system *coding)
6168 {
6169   const unsigned char *src, *end;
6170   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6171   int eol_seen = coding->eol_seen;
6172
6173   coding_set_source (coding);
6174   src = coding->source;
6175   end = src + coding->src_bytes;
6176
6177   if (inhibit_eol_conversion
6178       || SYMBOLP (eol_type))
6179     {
6180       /* We don't have to check EOL format.  */
6181       while (src < end && !( *src & 0x80))
6182         {
6183           if (*src++ == '\n')
6184             eol_seen |= EOL_SEEN_LF;
6185         }
6186     }
6187   else
6188     {
6189       end--;                /* We look ahead one byte for "CR LF".  */
6190       while (src < end)
6191         {
6192           int c = *src;
6193
6194           if (c & 0x80)
6195             break;
6196           src++;
6197           if (c == '\r')
6198             {
6199               if (*src == '\n')
6200                 {
6201                   eol_seen |= EOL_SEEN_CRLF;
6202                   src++;
6203                 }
6204               else
6205                 eol_seen |= EOL_SEEN_CR;
6206             }
6207           else if (c == '\n')
6208             eol_seen |= EOL_SEEN_LF;
6209         }
6210       if (src == end)
6211         {
6212           int c = *src;
6213
6214           /* All bytes but the last one C are ASCII.  */
6215           if (! (c & 0x80))
6216             {
6217               if (c == '\r')
6218                 eol_seen |= EOL_SEEN_CR;
6219               else if (c  == '\n')
6220                 eol_seen |= EOL_SEEN_LF;
6221               src++;
6222             }
6223         }
6224     }
6225   coding->head_ascii = src - coding->source;
6226   coding->eol_seen = eol_seen;
6227   return (coding->head_ascii);
6228 }
6229
6230
6231 /* Return the number of characters at the source if all the bytes are
6232    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6233    effects, update coding->eol_seen.  The value of coding->eol_seen is
6234    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6235    the value is reliable only when all the source bytes are valid
6236    UTF-8.  */
6237
6238 static int
6239 check_utf_8 (struct coding_system *coding)
6240 {
6241   const unsigned char *src, *end;
6242   int eol_seen;
6243   int nchars = coding->head_ascii;
6244
6245   if (coding->head_ascii < 0)
6246     check_ascii (coding);
6247   else
6248     coding_set_source (coding);
6249   src = coding->source + coding->head_ascii;
6250   /* We look ahead one byte for CR LF.  */
6251   end = coding->source + coding->src_bytes - 1;
6252   eol_seen = coding->eol_seen;
6253   while (src < end)
6254     {
6255       int c = *src;
6256
6257       if (UTF_8_1_OCTET_P (*src))
6258         {
6259           src++;
6260           if (c < 0x20)
6261             {
6262               if (c == '\r')
6263                 {
6264                   if (*src == '\n')
6265                     {
6266                       eol_seen |= EOL_SEEN_CRLF;
6267                       src++;
6268                       nchars++;
6269                     }
6270                   else
6271                     eol_seen |= EOL_SEEN_CR;
6272                 }
6273               else if (c == '\n')
6274                 eol_seen |= EOL_SEEN_LF;
6275             }
6276         }
6277       else if (UTF_8_2_OCTET_LEADING_P (c))
6278         {
6279           if (c < 0xC2          /* overlong sequence */
6280               || src + 1 >= end
6281               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6282             return -1;
6283           src += 2;
6284         }
6285       else if (UTF_8_3_OCTET_LEADING_P (c))
6286         {
6287           if (src + 2 >= end
6288               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6289                     && UTF_8_EXTRA_OCTET_P (src[2])))
6290             return -1;
6291           c = (((c & 0xF) << 12)
6292                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6293           if (c < 0x800                       /* overlong sequence */
6294               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6295             return -1;
6296           src += 3;
6297         }
6298       else if (UTF_8_4_OCTET_LEADING_P (c))
6299         {
6300           if (src + 3 >= end
6301               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6302                     && UTF_8_EXTRA_OCTET_P (src[2])
6303                     && UTF_8_EXTRA_OCTET_P (src[3])))
6304             return -1;
6305           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6306                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6307           if (c < 0x10000       /* overlong sequence */
6308               || c >= 0x110000) /* non-Unicode character  */
6309             return -1;
6310           src += 4;
6311         }
6312       else
6313         return -1;
6314       nchars++;
6315     }
6316
6317   if (src == end)
6318     {
6319       if (! UTF_8_1_OCTET_P (*src))
6320         return -1;
6321       nchars++;
6322       if (*src == '\r')
6323         eol_seen |= EOL_SEEN_CR;
6324       else if (*src  == '\n')
6325         eol_seen |= EOL_SEEN_LF;
6326     }
6327   coding->eol_seen = eol_seen;
6328   return nchars;
6329 }
6330
6331
6332 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6333    SOURCE is encoded.  If CATEGORY is one of
6334    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6335    two-byte, else they are encoded by one-byte.
6336
6337    Return one of EOL_SEEN_XXX.  */
6338
6339 #define MAX_EOL_CHECK_COUNT 3
6340
6341 static int
6342 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6343             enum coding_category category)
6344 {
6345   const unsigned char *src = source, *src_end = src + src_bytes;
6346   unsigned char c;
6347   int total  = 0;
6348   int eol_seen = EOL_SEEN_NONE;
6349
6350   if ((1 << category) & CATEGORY_MASK_UTF_16)
6351     {
6352       bool msb = category == (coding_category_utf_16_le
6353                               | coding_category_utf_16_le_nosig);
6354       bool lsb = !msb;
6355
6356       while (src + 1 < src_end)
6357         {
6358           c = src[lsb];
6359           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6360             {
6361               int this_eol;
6362
6363               if (c == '\n')
6364                 this_eol = EOL_SEEN_LF;
6365               else if (src + 3 >= src_end
6366                        || src[msb + 2] != 0
6367                        || src[lsb + 2] != '\n')
6368                 this_eol = EOL_SEEN_CR;
6369               else
6370                 {
6371                   this_eol = EOL_SEEN_CRLF;
6372                   src += 2;
6373                 }
6374
6375               if (eol_seen == EOL_SEEN_NONE)
6376                 /* This is the first end-of-line.  */
6377                 eol_seen = this_eol;
6378               else if (eol_seen != this_eol)
6379                 {
6380                   /* The found type is different from what found before.
6381                      Allow for stray ^M characters in DOS EOL files.  */
6382                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6383                       || (eol_seen == EOL_SEEN_CRLF
6384                           && this_eol == EOL_SEEN_CR))
6385                     eol_seen = EOL_SEEN_CRLF;
6386                   else
6387                     {
6388                       eol_seen = EOL_SEEN_LF;
6389                       break;
6390                     }
6391                 }
6392               if (++total == MAX_EOL_CHECK_COUNT)
6393                 break;
6394             }
6395           src += 2;
6396         }
6397     }
6398   else
6399     while (src < src_end)
6400       {
6401         c = *src++;
6402         if (c == '\n' || c == '\r')
6403           {
6404             int this_eol;
6405
6406             if (c == '\n')
6407               this_eol = EOL_SEEN_LF;
6408             else if (src >= src_end || *src != '\n')
6409               this_eol = EOL_SEEN_CR;
6410             else
6411               this_eol = EOL_SEEN_CRLF, src++;
6412
6413             if (eol_seen == EOL_SEEN_NONE)
6414               /* This is the first end-of-line.  */
6415               eol_seen = this_eol;
6416             else if (eol_seen != this_eol)
6417               {
6418                 /* The found type is different from what found before.
6419                    Allow for stray ^M characters in DOS EOL files.  */
6420                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6421                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6422                   eol_seen = EOL_SEEN_CRLF;
6423                 else
6424                   {
6425                     eol_seen = EOL_SEEN_LF;
6426                     break;
6427                   }
6428               }
6429             if (++total == MAX_EOL_CHECK_COUNT)
6430               break;
6431           }
6432       }
6433   return eol_seen;
6434 }
6435
6436
6437 static Lisp_Object
6438 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6439 {
6440   Lisp_Object eol_type;
6441
6442   eol_type = CODING_ID_EOL_TYPE (coding->id);
6443   if (! VECTORP (eol_type))
6444     /* Already adjusted.  */
6445     return eol_type;
6446   if (eol_seen & EOL_SEEN_LF)
6447     {
6448       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6449       eol_type = Qunix;
6450     }
6451   else if (eol_seen & EOL_SEEN_CRLF)
6452     {
6453       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6454       eol_type = Qdos;
6455     }
6456   else if (eol_seen & EOL_SEEN_CR)
6457     {
6458       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6459       eol_type = Qmac;
6460     }
6461   return eol_type;
6462 }
6463
6464 /* Detect how a text specified in CODING is encoded.  If a coding
6465    system is detected, update fields of CODING by the detected coding
6466    system.  */
6467
6468 static void
6469 detect_coding (struct coding_system *coding)
6470 {
6471   const unsigned char *src, *src_end;
6472   unsigned int saved_mode = coding->mode;
6473   Lisp_Object found = Qnil;
6474   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6475
6476   coding->consumed = coding->consumed_char = 0;
6477   coding->produced = coding->produced_char = 0;
6478   coding_set_source (coding);
6479
6480   src_end = coding->source + coding->src_bytes;
6481
6482   coding->eol_seen = EOL_SEEN_NONE;
6483   /* If we have not yet decided the text encoding type, detect it
6484      now.  */
6485   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6486     {
6487       int c, i;
6488       struct coding_detection_info detect_info;
6489       bool null_byte_found = 0, eight_bit_found = 0;
6490       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6491                                        inhibit_null_byte_detection);
6492       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6493                                        inhibit_iso_escape_detection);
6494       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6495
6496       coding->head_ascii = 0;
6497       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6498       for (src = coding->source; src < src_end; src++)
6499         {
6500           c = *src;
6501           if (c & 0x80)
6502             {
6503               eight_bit_found = 1;
6504               if (null_byte_found)
6505                 break;
6506             }
6507           else if (c < 0x20)
6508             {
6509               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6510                   && ! inhibit_ied
6511                   && ! detect_info.checked)
6512                 {
6513                   if (detect_coding_iso_2022 (coding, &detect_info))
6514                     {
6515                       /* We have scanned the whole data.  */
6516                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6517                         {
6518                           /* We didn't find an 8-bit code.  We may
6519                              have found a null-byte, but it's very
6520                              rare that a binary file conforms to
6521                              ISO-2022.  */
6522                           src = src_end;
6523                           coding->head_ascii = src - coding->source;
6524                         }
6525                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6526                       break;
6527                     }
6528                 }
6529               else if (! c && !inhibit_nbd)
6530                 {
6531                   null_byte_found = 1;
6532                   if (eight_bit_found)
6533                     break;
6534                 }
6535               else if (! disable_ascii_optimization
6536                        && ! inhibit_eol_conversion)
6537                 {
6538                   if (c == '\r')
6539                     {
6540                       if (src < src_end && src[1] == '\n')
6541                         {
6542                           coding->eol_seen |= EOL_SEEN_CRLF;
6543                           src++;
6544                           if (! eight_bit_found)
6545                             coding->head_ascii++;
6546                         }
6547                       else
6548                         coding->eol_seen |= EOL_SEEN_CR;
6549                     }
6550                   else if (c == '\n')
6551                     {
6552                       coding->eol_seen |= EOL_SEEN_LF;
6553                     }
6554                 }
6555
6556               if (! eight_bit_found)
6557                 coding->head_ascii++;
6558             }
6559           else if (! eight_bit_found)
6560             coding->head_ascii++;
6561         }
6562
6563       if (null_byte_found || eight_bit_found
6564           || coding->head_ascii < coding->src_bytes
6565           || detect_info.found)
6566         {
6567           enum coding_category category;
6568           struct coding_system *this;
6569
6570           if (coding->head_ascii == coding->src_bytes)
6571             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6572             for (i = 0; i < coding_category_raw_text; i++)
6573               {
6574                 category = coding_priorities[i];
6575                 this = coding_categories + category;
6576                 if (detect_info.found & (1 << category))
6577                   break;
6578               }
6579           else
6580             {
6581               if (null_byte_found)
6582                 {
6583                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6584                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6585                 }
6586               else if (prefer_utf_8
6587                        && detect_coding_utf_8 (coding, &detect_info))
6588                 {
6589                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6590                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6591                 }
6592               for (i = 0; i < coding_category_raw_text; i++)
6593                 {
6594                   category = coding_priorities[i];
6595                   this = coding_categories + category;
6596                   /* Some of this->detector (e.g. detect_coding_sjis)
6597                      require this information.  */
6598                   coding->id = this->id;
6599                   if (this->id < 0)
6600                     {
6601                       /* No coding system of this category is defined.  */
6602                       detect_info.rejected |= (1 << category);
6603                     }
6604                   else if (category >= coding_category_raw_text)
6605                     continue;
6606                   else if (detect_info.checked & (1 << category))
6607                     {
6608                       if (detect_info.found & (1 << category))
6609                         break;
6610                     }
6611                   else if ((*(this->detector)) (coding, &detect_info)
6612                            && detect_info.found & (1 << category))
6613                     break;
6614                 }
6615             }
6616
6617           if (i < coding_category_raw_text)
6618             {
6619               if (category == coding_category_utf_8_auto)
6620                 {
6621                   Lisp_Object coding_systems;
6622
6623                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6624                                          coding_attr_utf_bom);
6625                   if (CONSP (coding_systems))
6626                     {
6627                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6628                         found = XCAR (coding_systems);
6629                       else
6630                         found = XCDR (coding_systems);
6631                     }
6632                   else
6633                     found = CODING_ID_NAME (this->id);
6634                 }
6635               else if (category == coding_category_utf_16_auto)
6636                 {
6637                   Lisp_Object coding_systems;
6638
6639                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6640                                          coding_attr_utf_bom);
6641                   if (CONSP (coding_systems))
6642                     {
6643                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6644                         found = XCAR (coding_systems);
6645                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6646                         found = XCDR (coding_systems);
6647                     }
6648                   else
6649                     found = CODING_ID_NAME (this->id);
6650                 }
6651               else
6652                 found = CODING_ID_NAME (this->id);
6653             }
6654           else if (null_byte_found)
6655             found = Qno_conversion;
6656           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6657                    == CATEGORY_MASK_ANY)
6658             found = Qraw_text;
6659           else if (detect_info.rejected)
6660             for (i = 0; i < coding_category_raw_text; i++)
6661               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6662                 {
6663                   this = coding_categories + coding_priorities[i];
6664                   found = CODING_ID_NAME (this->id);
6665                   break;
6666                 }
6667         }
6668     }
6669   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6670            == coding_category_utf_8_auto)
6671     {
6672       Lisp_Object coding_systems;
6673       struct coding_detection_info detect_info;
6674
6675       coding_systems
6676         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6677       detect_info.found = detect_info.rejected = 0;
6678       if (check_ascii (coding) == coding->src_bytes)
6679         {
6680           if (CONSP (coding_systems))
6681             found = XCDR (coding_systems);
6682         }
6683       else
6684         {
6685           if (CONSP (coding_systems)
6686               && detect_coding_utf_8 (coding, &detect_info))
6687             {
6688               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6689                 found = XCAR (coding_systems);
6690               else
6691                 found = XCDR (coding_systems);
6692             }
6693         }
6694     }
6695   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6696            == coding_category_utf_16_auto)
6697     {
6698       Lisp_Object coding_systems;
6699       struct coding_detection_info detect_info;
6700
6701       coding_systems
6702         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6703       detect_info.found = detect_info.rejected = 0;
6704       coding->head_ascii = 0;
6705       if (CONSP (coding_systems)
6706           && detect_coding_utf_16 (coding, &detect_info))
6707         {
6708           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6709             found = XCAR (coding_systems);
6710           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6711             found = XCDR (coding_systems);
6712         }
6713     }
6714
6715   if (! NILP (found))
6716     {
6717       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6718                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6719                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6720                            : EOL_SEEN_LF);
6721
6722       setup_coding_system (found, coding);
6723       if (specified_eol != EOL_SEEN_NONE)
6724         adjust_coding_eol_type (coding, specified_eol);
6725     }
6726
6727   coding->mode = saved_mode;
6728 }
6729
6730
6731 static void
6732 decode_eol (struct coding_system *coding)
6733 {
6734   Lisp_Object eol_type;
6735   unsigned char *p, *pbeg, *pend;
6736
6737   eol_type = CODING_ID_EOL_TYPE (coding->id);
6738   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6739     return;
6740
6741   if (NILP (coding->dst_object))
6742     pbeg = coding->destination;
6743   else
6744     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6745   pend = pbeg + coding->produced;
6746
6747   if (VECTORP (eol_type))
6748     {
6749       int eol_seen = EOL_SEEN_NONE;
6750
6751       for (p = pbeg; p < pend; p++)
6752         {
6753           if (*p == '\n')
6754             eol_seen |= EOL_SEEN_LF;
6755           else if (*p == '\r')
6756             {
6757               if (p + 1 < pend && *(p + 1) == '\n')
6758                 {
6759                   eol_seen |= EOL_SEEN_CRLF;
6760                   p++;
6761                 }
6762               else
6763                 eol_seen |= EOL_SEEN_CR;
6764             }
6765         }
6766       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6767       if ((eol_seen & EOL_SEEN_CRLF) != 0
6768           && (eol_seen & EOL_SEEN_CR) != 0
6769           && (eol_seen & EOL_SEEN_LF) == 0)
6770         eol_seen = EOL_SEEN_CRLF;
6771       else if (eol_seen != EOL_SEEN_NONE
6772           && eol_seen != EOL_SEEN_LF
6773           && eol_seen != EOL_SEEN_CRLF
6774           && eol_seen != EOL_SEEN_CR)
6775         eol_seen = EOL_SEEN_LF;
6776       if (eol_seen != EOL_SEEN_NONE)
6777         eol_type = adjust_coding_eol_type (coding, eol_seen);
6778     }
6779
6780   if (EQ (eol_type, Qmac))
6781     {
6782       for (p = pbeg; p < pend; p++)
6783         if (*p == '\r')
6784           *p = '\n';
6785     }
6786   else if (EQ (eol_type, Qdos))
6787     {
6788       ptrdiff_t n = 0;
6789
6790       if (NILP (coding->dst_object))
6791         {
6792           /* Start deleting '\r' from the tail to minimize the memory
6793              movement.  */
6794           for (p = pend - 2; p >= pbeg; p--)
6795             if (*p == '\r')
6796               {
6797                 memmove (p, p + 1, pend-- - p - 1);
6798                 n++;
6799               }
6800         }
6801       else
6802         {
6803           ptrdiff_t pos_byte = coding->dst_pos_byte;
6804           ptrdiff_t pos = coding->dst_pos;
6805           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6806
6807           while (pos < pos_end)
6808             {
6809               p = BYTE_POS_ADDR (pos_byte);
6810               if (*p == '\r' && p[1] == '\n')
6811                 {
6812                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6813                   n++;
6814                   pos_end--;
6815                 }
6816               pos++;
6817               if (coding->dst_multibyte)
6818                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6819               else
6820                 pos_byte++;
6821             }
6822         }
6823       coding->produced -= n;
6824       coding->produced_char -= n;
6825     }
6826 }
6827
6828
6829 /* Return a translation table (or list of them) from coding system
6830    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6831    not ENCODEP). */
6832
6833 static Lisp_Object
6834 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6835 {
6836   Lisp_Object standard, translation_table;
6837   Lisp_Object val;
6838
6839   if (NILP (Venable_character_translation))
6840     {
6841       if (max_lookup)
6842         *max_lookup = 0;
6843       return Qnil;
6844     }
6845   if (encodep)
6846     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6847       standard = Vstandard_translation_table_for_encode;
6848   else
6849     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6850       standard = Vstandard_translation_table_for_decode;
6851   if (NILP (translation_table))
6852     translation_table = standard;
6853   else
6854     {
6855       if (SYMBOLP (translation_table))
6856         translation_table = Fget (translation_table, Qtranslation_table);
6857       else if (CONSP (translation_table))
6858         {
6859           translation_table = Fcopy_sequence (translation_table);
6860           for (val = translation_table; CONSP (val); val = XCDR (val))
6861             if (SYMBOLP (XCAR (val)))
6862               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6863         }
6864       if (CHAR_TABLE_P (standard))
6865         {
6866           if (CONSP (translation_table))
6867             translation_table = nconc2 (translation_table, list1 (standard));
6868           else
6869             translation_table = list2 (translation_table, standard);
6870         }
6871     }
6872
6873   if (max_lookup)
6874     {
6875       *max_lookup = 1;
6876       if (CHAR_TABLE_P (translation_table)
6877           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6878         {
6879           val = XCHAR_TABLE (translation_table)->extras[1];
6880           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6881             *max_lookup = XFASTINT (val);
6882         }
6883       else if (CONSP (translation_table))
6884         {
6885           Lisp_Object tail;
6886
6887           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6888             if (CHAR_TABLE_P (XCAR (tail))
6889                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6890               {
6891                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6892                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6893                   *max_lookup = XFASTINT (tailval);
6894               }
6895         }
6896     }
6897   return translation_table;
6898 }
6899
6900 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6901   do {                                                          \
6902     trans = Qnil;                                               \
6903     if (CHAR_TABLE_P (table))                                   \
6904       {                                                         \
6905         trans = CHAR_TABLE_REF (table, c);                      \
6906         if (CHARACTERP (trans))                                 \
6907           c = XFASTINT (trans), trans = Qnil;                   \
6908       }                                                         \
6909     else if (CONSP (table))                                     \
6910       {                                                         \
6911         Lisp_Object tail;                                       \
6912                                                                 \
6913         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6914           if (CHAR_TABLE_P (XCAR (tail)))                       \
6915             {                                                   \
6916               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6917               if (CHARACTERP (trans))                           \
6918                 c = XFASTINT (trans), trans = Qnil;             \
6919               else if (! NILP (trans))                          \
6920                 break;                                          \
6921             }                                                   \
6922       }                                                         \
6923   } while (0)
6924
6925
6926 /* Return a translation of character(s) at BUF according to TRANS.
6927    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6928    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6929    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6930    translation is found, and Qnil if not found..
6931    If BUF is too short to lookup characters in FROM, return Qt.  */
6932
6933 static Lisp_Object
6934 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6935 {
6936
6937   if (INTEGERP (trans))
6938     return trans;
6939   for (; CONSP (trans); trans = XCDR (trans))
6940     {
6941       Lisp_Object val = XCAR (trans);
6942       Lisp_Object from = XCAR (val);
6943       ptrdiff_t len = ASIZE (from);
6944       ptrdiff_t i;
6945
6946       for (i = 0; i < len; i++)
6947         {
6948           if (buf + i == buf_end)
6949             return Qt;
6950           if (XINT (AREF (from, i)) != buf[i])
6951             break;
6952         }
6953       if (i == len)
6954         return val;
6955     }
6956   return Qnil;
6957 }
6958
6959
6960 static int
6961 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6962                bool last_block)
6963 {
6964   unsigned char *dst = coding->destination + coding->produced;
6965   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6966   ptrdiff_t produced;
6967   ptrdiff_t produced_chars = 0;
6968   int carryover = 0;
6969
6970   if (! coding->chars_at_source)
6971     {
6972       /* Source characters are in coding->charbuf.  */
6973       int *buf = coding->charbuf;
6974       int *buf_end = buf + coding->charbuf_used;
6975
6976       if (EQ (coding->src_object, coding->dst_object))
6977         {
6978           coding_set_source (coding);
6979           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6980         }
6981
6982       while (buf < buf_end)
6983         {
6984           int c = *buf;
6985           ptrdiff_t i;
6986
6987           if (c >= 0)
6988             {
6989               ptrdiff_t from_nchars = 1, to_nchars = 1;
6990               Lisp_Object trans = Qnil;
6991
6992               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6993               if (! NILP (trans))
6994                 {
6995                   trans = get_translation (trans, buf, buf_end);
6996                   if (INTEGERP (trans))
6997                     c = XINT (trans);
6998                   else if (CONSP (trans))
6999                     {
7000                       from_nchars = ASIZE (XCAR (trans));
7001                       trans = XCDR (trans);
7002                       if (INTEGERP (trans))
7003                         c = XINT (trans);
7004                       else
7005                         {
7006                           to_nchars = ASIZE (trans);
7007                           c = XINT (AREF (trans, 0));
7008                         }
7009                     }
7010                   else if (EQ (trans, Qt) && ! last_block)
7011                     break;
7012                 }
7013
7014               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7015                 {
7016                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7017                        / MAX_MULTIBYTE_LENGTH)
7018                       < to_nchars)
7019                     memory_full (SIZE_MAX);
7020                   dst = alloc_destination (coding,
7021                                            buf_end - buf
7022                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7023                                            dst);
7024                   if (EQ (coding->src_object, coding->dst_object))
7025                     {
7026                       coding_set_source (coding);
7027                       dst_end = (((unsigned char *) coding->source)
7028                                  + coding->consumed);
7029                     }
7030                   else
7031                     dst_end = coding->destination + coding->dst_bytes;
7032                 }
7033
7034               for (i = 0; i < to_nchars; i++)
7035                 {
7036                   if (i > 0)
7037                     c = XINT (AREF (trans, i));
7038                   if (coding->dst_multibyte
7039                       || ! CHAR_BYTE8_P (c))
7040                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7041                   else
7042                     *dst++ = CHAR_TO_BYTE8 (c);
7043                 }
7044               produced_chars += to_nchars;
7045               buf += from_nchars;
7046             }
7047           else
7048             /* This is an annotation datum.  (-C) is the length.  */
7049             buf += -c;
7050         }
7051       carryover = buf_end - buf;
7052     }
7053   else
7054     {
7055       /* Source characters are at coding->source.  */
7056       const unsigned char *src = coding->source;
7057       const unsigned char *src_end = src + coding->consumed;
7058
7059       if (EQ (coding->dst_object, coding->src_object))
7060         dst_end = (unsigned char *) src;
7061       if (coding->src_multibyte != coding->dst_multibyte)
7062         {
7063           if (coding->src_multibyte)
7064             {
7065               bool multibytep = 1;
7066               ptrdiff_t consumed_chars = 0;
7067
7068               while (1)
7069                 {
7070                   const unsigned char *src_base = src;
7071                   int c;
7072
7073                   ONE_MORE_BYTE (c);
7074                   if (dst == dst_end)
7075                     {
7076                       if (EQ (coding->src_object, coding->dst_object))
7077                         dst_end = (unsigned char *) src;
7078                       if (dst == dst_end)
7079                         {
7080                           ptrdiff_t offset = src - coding->source;
7081
7082                           dst = alloc_destination (coding, src_end - src + 1,
7083                                                    dst);
7084                           dst_end = coding->destination + coding->dst_bytes;
7085                           coding_set_source (coding);
7086                           src = coding->source + offset;
7087                           src_end = coding->source + coding->consumed;
7088                           if (EQ (coding->src_object, coding->dst_object))
7089                             dst_end = (unsigned char *) src;
7090                         }
7091                     }
7092                   *dst++ = c;
7093                   produced_chars++;
7094                 }
7095             no_more_source:
7096               ;
7097             }
7098           else
7099             while (src < src_end)
7100               {
7101                 bool multibytep = 1;
7102                 int c = *src++;
7103
7104                 if (dst >= dst_end - 1)
7105                   {
7106                     if (EQ (coding->src_object, coding->dst_object))
7107                       dst_end = (unsigned char *) src;
7108                     if (dst >= dst_end - 1)
7109                       {
7110                         ptrdiff_t offset = src - coding->source;
7111                         ptrdiff_t more_bytes;
7112
7113                         if (EQ (coding->src_object, coding->dst_object))
7114                           more_bytes = ((src_end - src) / 2) + 2;
7115                         else
7116                           more_bytes = src_end - src + 2;
7117                         dst = alloc_destination (coding, more_bytes, dst);
7118                         dst_end = coding->destination + coding->dst_bytes;
7119                         coding_set_source (coding);
7120                         src = coding->source + offset;
7121                         src_end = coding->source + coding->consumed;
7122                         if (EQ (coding->src_object, coding->dst_object))
7123                           dst_end = (unsigned char *) src;
7124                       }
7125                   }
7126                 EMIT_ONE_BYTE (c);
7127               }
7128         }
7129       else
7130         {
7131           if (!EQ (coding->src_object, coding->dst_object))
7132             {
7133               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7134
7135               if (require > 0)
7136                 {
7137                   ptrdiff_t offset = src - coding->source;
7138
7139                   dst = alloc_destination (coding, require, dst);
7140                   coding_set_source (coding);
7141                   src = coding->source + offset;
7142                   src_end = coding->source + coding->consumed;
7143                 }
7144             }
7145           produced_chars = coding->consumed_char;
7146           while (src < src_end)
7147             *dst++ = *src++;
7148         }
7149     }
7150
7151   produced = dst - (coding->destination + coding->produced);
7152   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7153     insert_from_gap (produced_chars, produced, 0);
7154   coding->produced += produced;
7155   coding->produced_char += produced_chars;
7156   return carryover;
7157 }
7158
7159 /* Compose text in CODING->object according to the annotation data at
7160    CHARBUF.  CHARBUF is an array:
7161      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7162  */
7163
7164 static void
7165 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7166 {
7167   int len;
7168   ptrdiff_t to;
7169   enum composition_method method;
7170   Lisp_Object components;
7171
7172   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7173   to = pos + charbuf[2];
7174   method = (enum composition_method) (charbuf[4]);
7175
7176   if (method == COMPOSITION_RELATIVE)
7177     components = Qnil;
7178   else
7179     {
7180       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7181       int i, j;
7182
7183       if (method == COMPOSITION_WITH_RULE)
7184         len = charbuf[2] * 3 - 2;
7185       charbuf += MAX_ANNOTATION_LENGTH;
7186       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7187       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7188         {
7189           if (charbuf[i] >= 0)
7190             args[j] = make_number (charbuf[i]);
7191           else
7192             {
7193               i++;
7194               args[j] = make_number (charbuf[i] % 0x100);
7195             }
7196         }
7197       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7198     }
7199   compose_text (pos, to, components, Qnil, coding->dst_object);
7200 }
7201
7202
7203 /* Put `charset' property on text in CODING->object according to
7204    the annotation data at CHARBUF.  CHARBUF is an array:
7205      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7206  */
7207
7208 static void
7209 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7210 {
7211   ptrdiff_t from = pos - charbuf[2];
7212   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7213
7214   Fput_text_property (make_number (from), make_number (pos),
7215                       Qcharset, CHARSET_NAME (charset),
7216                       coding->dst_object);
7217 }
7218
7219
7220 #define CHARBUF_SIZE 0x4000
7221
7222 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7223   do {                                                                  \
7224     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
7225     coding->charbuf_size = CHARBUF_SIZE;                                \
7226   } while (0)
7227
7228
7229 static void
7230 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7231 {
7232   int *charbuf = coding->charbuf;
7233   int *charbuf_end = charbuf + coding->charbuf_used;
7234
7235   if (NILP (coding->dst_object))
7236     return;
7237
7238   while (charbuf < charbuf_end)
7239     {
7240       if (*charbuf >= 0)
7241         pos++, charbuf++;
7242       else
7243         {
7244           int len = -*charbuf;
7245
7246           if (len > 2)
7247             switch (charbuf[1])
7248               {
7249               case CODING_ANNOTATE_COMPOSITION_MASK:
7250                 produce_composition (coding, charbuf, pos);
7251                 break;
7252               case CODING_ANNOTATE_CHARSET_MASK:
7253                 produce_charset (coding, charbuf, pos);
7254                 break;
7255               }
7256           charbuf += len;
7257         }
7258     }
7259 }
7260
7261 /* Decode the data at CODING->src_object into CODING->dst_object.
7262    CODING->src_object is a buffer, a string, or nil.
7263    CODING->dst_object is a buffer.
7264
7265    If CODING->src_object is a buffer, it must be the current buffer.
7266    In this case, if CODING->src_pos is positive, it is a position of
7267    the source text in the buffer, otherwise, the source text is in the
7268    gap area of the buffer, and CODING->src_pos specifies the offset of
7269    the text from GPT (which must be the same as PT).  If this is the
7270    same buffer as CODING->dst_object, CODING->src_pos must be
7271    negative.
7272
7273    If CODING->src_object is a string, CODING->src_pos is an index to
7274    that string.
7275
7276    If CODING->src_object is nil, CODING->source must already point to
7277    the non-relocatable memory area.  In this case, CODING->src_pos is
7278    an offset from CODING->source.
7279
7280    The decoded data is inserted at the current point of the buffer
7281    CODING->dst_object.
7282 */
7283
7284 static void
7285 decode_coding (struct coding_system *coding)
7286 {
7287   Lisp_Object attrs;
7288   Lisp_Object undo_list;
7289   Lisp_Object translation_table;
7290   struct ccl_spec cclspec;
7291   int carryover;
7292   int i;
7293
7294   USE_SAFE_ALLOCA;
7295
7296   if (BUFFERP (coding->src_object)
7297       && coding->src_pos > 0
7298       && coding->src_pos < GPT
7299       && coding->src_pos + coding->src_chars > GPT)
7300     move_gap_both (coding->src_pos, coding->src_pos_byte);
7301
7302   undo_list = Qt;
7303   if (BUFFERP (coding->dst_object))
7304     {
7305       set_buffer_internal (XBUFFER (coding->dst_object));
7306       if (GPT != PT)
7307         move_gap_both (PT, PT_BYTE);
7308
7309       /* We must disable undo_list in order to record the whole insert
7310          transaction via record_insert at the end.  But doing so also
7311          disables the recording of the first change to the undo_list.
7312          Therefore we check for first change here and record it via
7313          record_first_change if needed.  */
7314       if (MODIFF <= SAVE_MODIFF)
7315         record_first_change ();
7316
7317       undo_list = BVAR (current_buffer, undo_list);
7318       bset_undo_list (current_buffer, Qt);
7319     }
7320
7321   coding->consumed = coding->consumed_char = 0;
7322   coding->produced = coding->produced_char = 0;
7323   coding->chars_at_source = 0;
7324   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7325   coding->errors = 0;
7326
7327   ALLOC_CONVERSION_WORK_AREA (coding);
7328
7329   attrs = CODING_ID_ATTRS (coding->id);
7330   translation_table = get_translation_table (attrs, 0, NULL);
7331
7332   carryover = 0;
7333   if (coding->decoder == decode_coding_ccl)
7334     {
7335       coding->spec.ccl = &cclspec;
7336       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7337     }
7338   do
7339     {
7340       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7341
7342       coding_set_source (coding);
7343       coding->annotated = 0;
7344       coding->charbuf_used = carryover;
7345       (*(coding->decoder)) (coding);
7346       coding_set_destination (coding);
7347       carryover = produce_chars (coding, translation_table, 0);
7348       if (coding->annotated)
7349         produce_annotation (coding, pos);
7350       for (i = 0; i < carryover; i++)
7351         coding->charbuf[i]
7352           = coding->charbuf[coding->charbuf_used - carryover + i];
7353     }
7354   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7355          || (coding->consumed < coding->src_bytes
7356              && (coding->result == CODING_RESULT_SUCCESS
7357                  || coding->result == CODING_RESULT_INVALID_SRC)));
7358
7359   if (carryover > 0)
7360     {
7361       coding_set_destination (coding);
7362       coding->charbuf_used = carryover;
7363       produce_chars (coding, translation_table, 1);
7364     }
7365
7366   coding->carryover_bytes = 0;
7367   if (coding->consumed < coding->src_bytes)
7368     {
7369       int nbytes = coding->src_bytes - coding->consumed;
7370       const unsigned char *src;
7371
7372       coding_set_source (coding);
7373       coding_set_destination (coding);
7374       src = coding->source + coding->consumed;
7375
7376       if (coding->mode & CODING_MODE_LAST_BLOCK)
7377         {
7378           /* Flush out unprocessed data as binary chars.  We are sure
7379              that the number of data is less than the size of
7380              coding->charbuf.  */
7381           coding->charbuf_used = 0;
7382           coding->chars_at_source = 0;
7383
7384           while (nbytes-- > 0)
7385             {
7386               int c = *src++;
7387
7388               if (c & 0x80)
7389                 c = BYTE8_TO_CHAR (c);
7390               coding->charbuf[coding->charbuf_used++] = c;
7391             }
7392           produce_chars (coding, Qnil, 1);
7393         }
7394       else
7395         {
7396           /* Record unprocessed bytes in coding->carryover.  We are
7397              sure that the number of data is less than the size of
7398              coding->carryover.  */
7399           unsigned char *p = coding->carryover;
7400
7401           if (nbytes > sizeof coding->carryover)
7402             nbytes = sizeof coding->carryover;
7403           coding->carryover_bytes = nbytes;
7404           while (nbytes-- > 0)
7405             *p++ = *src++;
7406         }
7407       coding->consumed = coding->src_bytes;
7408     }
7409
7410   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7411       && !inhibit_eol_conversion)
7412     decode_eol (coding);
7413   if (BUFFERP (coding->dst_object))
7414     {
7415       bset_undo_list (current_buffer, undo_list);
7416       record_insert (coding->dst_pos, coding->produced_char);
7417     }
7418
7419   SAFE_FREE ();
7420 }
7421
7422
7423 /* Extract an annotation datum from a composition starting at POS and
7424    ending before LIMIT of CODING->src_object (buffer or string), store
7425    the data in BUF, set *STOP to a starting position of the next
7426    composition (if any) or to LIMIT, and return the address of the
7427    next element of BUF.
7428
7429    If such an annotation is not found, set *STOP to a starting
7430    position of a composition after POS (if any) or to LIMIT, and
7431    return BUF.  */
7432
7433 static int *
7434 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7435                                struct coding_system *coding, int *buf,
7436                                ptrdiff_t *stop)
7437 {
7438   ptrdiff_t start, end;
7439   Lisp_Object prop;
7440
7441   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7442       || end > limit)
7443     *stop = limit;
7444   else if (start > pos)
7445     *stop = start;
7446   else
7447     {
7448       if (start == pos)
7449         {
7450           /* We found a composition.  Store the corresponding
7451              annotation data in BUF.  */
7452           int *head = buf;
7453           enum composition_method method = COMPOSITION_METHOD (prop);
7454           int nchars = COMPOSITION_LENGTH (prop);
7455
7456           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7457           if (method != COMPOSITION_RELATIVE)
7458             {
7459               Lisp_Object components;
7460               ptrdiff_t i, len, i_byte;
7461
7462               components = COMPOSITION_COMPONENTS (prop);
7463               if (VECTORP (components))
7464                 {
7465                   len = ASIZE (components);
7466                   for (i = 0; i < len; i++)
7467                     *buf++ = XINT (AREF (components, i));
7468                 }
7469               else if (STRINGP (components))
7470                 {
7471                   len = SCHARS (components);
7472                   i = i_byte = 0;
7473                   while (i < len)
7474                     {
7475                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7476                       buf++;
7477                     }
7478                 }
7479               else if (INTEGERP (components))
7480                 {
7481                   len = 1;
7482                   *buf++ = XINT (components);
7483                 }
7484               else if (CONSP (components))
7485                 {
7486                   for (len = 0; CONSP (components);
7487                        len++, components = XCDR (components))
7488                     *buf++ = XINT (XCAR (components));
7489                 }
7490               else
7491                 emacs_abort ();
7492               *head -= len;
7493             }
7494         }
7495
7496       if (find_composition (end, limit, &start, &end, &prop,
7497                             coding->src_object)
7498           && end <= limit)
7499         *stop = start;
7500       else
7501         *stop = limit;
7502     }
7503   return buf;
7504 }
7505
7506
7507 /* Extract an annotation datum from a text property `charset' at POS of
7508    CODING->src_object (buffer of string), store the data in BUF, set
7509    *STOP to the position where the value of `charset' property changes
7510    (limiting by LIMIT), and return the address of the next element of
7511    BUF.
7512
7513    If the property value is nil, set *STOP to the position where the
7514    property value is non-nil (limiting by LIMIT), and return BUF.  */
7515
7516 static int *
7517 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7518                            struct coding_system *coding, int *buf,
7519                            ptrdiff_t *stop)
7520 {
7521   Lisp_Object val, next;
7522   int id;
7523
7524   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7525   if (! NILP (val) && CHARSETP (val))
7526     id = XINT (CHARSET_SYMBOL_ID (val));
7527   else
7528     id = -1;
7529   ADD_CHARSET_DATA (buf, 0, id);
7530   next = Fnext_single_property_change (make_number (pos), Qcharset,
7531                                        coding->src_object,
7532                                        make_number (limit));
7533   *stop = XINT (next);
7534   return buf;
7535 }
7536
7537
7538 static void
7539 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7540                int max_lookup)
7541 {
7542   int *buf = coding->charbuf;
7543   int *buf_end = coding->charbuf + coding->charbuf_size;
7544   const unsigned char *src = coding->source + coding->consumed;
7545   const unsigned char *src_end = coding->source + coding->src_bytes;
7546   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7547   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7548   bool multibytep = coding->src_multibyte;
7549   Lisp_Object eol_type;
7550   int c;
7551   ptrdiff_t stop, stop_composition, stop_charset;
7552   int *lookup_buf = NULL;
7553
7554   if (! NILP (translation_table))
7555     lookup_buf = alloca (sizeof (int) * max_lookup);
7556
7557   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7558   if (VECTORP (eol_type))
7559     eol_type = Qunix;
7560
7561   /* Note: composition handling is not yet implemented.  */
7562   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7563
7564   if (NILP (coding->src_object))
7565     stop = stop_composition = stop_charset = end_pos;
7566   else
7567     {
7568       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7569         stop = stop_composition = pos;
7570       else
7571         stop = stop_composition = end_pos;
7572       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7573         stop = stop_charset = pos;
7574       else
7575         stop_charset = end_pos;
7576     }
7577
7578   /* Compensate for CRLF and conversion.  */
7579   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7580   while (buf < buf_end)
7581     {
7582       Lisp_Object trans;
7583
7584       if (pos == stop)
7585         {
7586           if (pos == end_pos)
7587             break;
7588           if (pos == stop_composition)
7589             buf = handle_composition_annotation (pos, end_pos, coding,
7590                                                  buf, &stop_composition);
7591           if (pos == stop_charset)
7592             buf = handle_charset_annotation (pos, end_pos, coding,
7593                                              buf, &stop_charset);
7594           stop = (stop_composition < stop_charset
7595                   ? stop_composition : stop_charset);
7596         }
7597
7598       if (! multibytep)
7599         {
7600           int bytes;
7601
7602           if (coding->encoder == encode_coding_raw_text
7603               || coding->encoder == encode_coding_ccl)
7604             c = *src++, pos++;
7605           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7606             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7607           else
7608             c = BYTE8_TO_CHAR (*src), src++, pos++;
7609         }
7610       else
7611         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7612       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7613         c = '\n';
7614       if (! EQ (eol_type, Qunix))
7615         {
7616           if (c == '\n')
7617             {
7618               if (EQ (eol_type, Qdos))
7619                 *buf++ = '\r';
7620               else
7621                 c = '\r';
7622             }
7623         }
7624
7625       trans = Qnil;
7626       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7627       if (NILP (trans))
7628         *buf++ = c;
7629       else
7630         {
7631           ptrdiff_t from_nchars = 1, to_nchars = 1;
7632           int *lookup_buf_end;
7633           const unsigned char *p = src;
7634           int i;
7635
7636           lookup_buf[0] = c;
7637           for (i = 1; i < max_lookup && p < src_end; i++)
7638             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7639           lookup_buf_end = lookup_buf + i;
7640           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7641           if (INTEGERP (trans))
7642             c = XINT (trans);
7643           else if (CONSP (trans))
7644             {
7645               from_nchars = ASIZE (XCAR (trans));
7646               trans = XCDR (trans);
7647               if (INTEGERP (trans))
7648                 c = XINT (trans);
7649               else
7650                 {
7651                   to_nchars = ASIZE (trans);
7652                   if (buf_end - buf < to_nchars)
7653                     break;
7654                   c = XINT (AREF (trans, 0));
7655                 }
7656             }
7657           else
7658             break;
7659           *buf++ = c;
7660           for (i = 1; i < to_nchars; i++)
7661             *buf++ = XINT (AREF (trans, i));
7662           for (i = 1; i < from_nchars; i++, pos++)
7663             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7664         }
7665     }
7666
7667   coding->consumed = src - coding->source;
7668   coding->consumed_char = pos - coding->src_pos;
7669   coding->charbuf_used = buf - coding->charbuf;
7670   coding->chars_at_source = 0;
7671 }
7672
7673
7674 /* Encode the text at CODING->src_object into CODING->dst_object.
7675    CODING->src_object is a buffer or a string.
7676    CODING->dst_object is a buffer or nil.
7677
7678    If CODING->src_object is a buffer, it must be the current buffer.
7679    In this case, if CODING->src_pos is positive, it is a position of
7680    the source text in the buffer, otherwise. the source text is in the
7681    gap area of the buffer, and coding->src_pos specifies the offset of
7682    the text from GPT (which must be the same as PT).  If this is the
7683    same buffer as CODING->dst_object, CODING->src_pos must be
7684    negative and CODING should not have `pre-write-conversion'.
7685
7686    If CODING->src_object is a string, CODING should not have
7687    `pre-write-conversion'.
7688
7689    If CODING->dst_object is a buffer, the encoded data is inserted at
7690    the current point of that buffer.
7691
7692    If CODING->dst_object is nil, the encoded data is placed at the
7693    memory area specified by CODING->destination.  */
7694
7695 static void
7696 encode_coding (struct coding_system *coding)
7697 {
7698   Lisp_Object attrs;
7699   Lisp_Object translation_table;
7700   int max_lookup;
7701   struct ccl_spec cclspec;
7702
7703   USE_SAFE_ALLOCA;
7704
7705   attrs = CODING_ID_ATTRS (coding->id);
7706   if (coding->encoder == encode_coding_raw_text)
7707     translation_table = Qnil, max_lookup = 0;
7708   else
7709     translation_table = get_translation_table (attrs, 1, &max_lookup);
7710
7711   if (BUFFERP (coding->dst_object))
7712     {
7713       set_buffer_internal (XBUFFER (coding->dst_object));
7714       coding->dst_multibyte
7715         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7716     }
7717
7718   coding->consumed = coding->consumed_char = 0;
7719   coding->produced = coding->produced_char = 0;
7720   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7721   coding->errors = 0;
7722
7723   ALLOC_CONVERSION_WORK_AREA (coding);
7724
7725   if (coding->encoder == encode_coding_ccl)
7726     {
7727       coding->spec.ccl = &cclspec;
7728       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7729     }
7730   do {
7731     coding_set_source (coding);
7732     consume_chars (coding, translation_table, max_lookup);
7733     coding_set_destination (coding);
7734     (*(coding->encoder)) (coding);
7735   } while (coding->consumed_char < coding->src_chars);
7736
7737   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7738     insert_from_gap (coding->produced_char, coding->produced, 0);
7739
7740   SAFE_FREE ();
7741 }
7742
7743
7744 /* Name (or base name) of work buffer for code conversion.  */
7745 static Lisp_Object Vcode_conversion_workbuf_name;
7746
7747 /* A working buffer used by the top level conversion.  Once it is
7748    created, it is never destroyed.  It has the name
7749    Vcode_conversion_workbuf_name.  The other working buffers are
7750    destroyed after the use is finished, and their names are modified
7751    versions of Vcode_conversion_workbuf_name.  */
7752 static Lisp_Object Vcode_conversion_reused_workbuf;
7753
7754 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7755 static bool reused_workbuf_in_use;
7756
7757
7758 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7759    multibyteness of returning buffer.  */
7760
7761 static Lisp_Object
7762 make_conversion_work_buffer (bool multibyte)
7763 {
7764   Lisp_Object name, workbuf;
7765   struct buffer *current;
7766
7767   if (reused_workbuf_in_use)
7768     {
7769       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7770       workbuf = Fget_buffer_create (name);
7771     }
7772   else
7773     {
7774       reused_workbuf_in_use = 1;
7775       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7776         Vcode_conversion_reused_workbuf
7777           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7778       workbuf = Vcode_conversion_reused_workbuf;
7779     }
7780   current = current_buffer;
7781   set_buffer_internal (XBUFFER (workbuf));
7782   /* We can't allow modification hooks to run in the work buffer.  For
7783      instance, directory_files_internal assumes that file decoding
7784      doesn't compile new regexps.  */
7785   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7786   Ferase_buffer ();
7787   bset_undo_list (current_buffer, Qt);
7788   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7789   set_buffer_internal (current);
7790   return workbuf;
7791 }
7792
7793
7794 static void
7795 code_conversion_restore (Lisp_Object arg)
7796 {
7797   Lisp_Object current, workbuf;
7798   struct gcpro gcpro1;
7799
7800   GCPRO1 (arg);
7801   current = XCAR (arg);
7802   workbuf = XCDR (arg);
7803   if (! NILP (workbuf))
7804     {
7805       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7806         reused_workbuf_in_use = 0;
7807       else
7808         Fkill_buffer (workbuf);
7809     }
7810   set_buffer_internal (XBUFFER (current));
7811   UNGCPRO;
7812 }
7813
7814 Lisp_Object
7815 code_conversion_save (bool with_work_buf, bool multibyte)
7816 {
7817   Lisp_Object workbuf = Qnil;
7818
7819   if (with_work_buf)
7820     workbuf = make_conversion_work_buffer (multibyte);
7821   record_unwind_protect (code_conversion_restore,
7822                          Fcons (Fcurrent_buffer (), workbuf));
7823   return workbuf;
7824 }
7825
7826 void
7827 decode_coding_gap (struct coding_system *coding,
7828                    ptrdiff_t chars, ptrdiff_t bytes)
7829 {
7830   ptrdiff_t count = SPECPDL_INDEX ();
7831   Lisp_Object attrs;
7832
7833   coding->src_object = Fcurrent_buffer ();
7834   coding->src_chars = chars;
7835   coding->src_bytes = bytes;
7836   coding->src_pos = -chars;
7837   coding->src_pos_byte = -bytes;
7838   coding->src_multibyte = chars < bytes;
7839   coding->dst_object = coding->src_object;
7840   coding->dst_pos = PT;
7841   coding->dst_pos_byte = PT_BYTE;
7842   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7843
7844   coding->head_ascii = -1;
7845   coding->detected_utf8_chars = -1;
7846   coding->eol_seen = EOL_SEEN_NONE;
7847   if (CODING_REQUIRE_DETECTION (coding))
7848     detect_coding (coding);
7849   attrs = CODING_ID_ATTRS (coding->id);
7850   if (! disable_ascii_optimization
7851       && ! coding->src_multibyte
7852       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7853       && NILP (CODING_ATTR_POST_READ (attrs))
7854       && NILP (get_translation_table (attrs, 0, NULL)))
7855     {
7856       chars = coding->head_ascii;
7857       if (chars < 0)
7858         chars = check_ascii (coding);
7859       if (chars != bytes)
7860         {
7861           /* There exists a non-ASCII byte.  */
7862           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7863             {
7864               if (coding->detected_utf8_chars >= 0)
7865                 chars = coding->detected_utf8_chars;
7866               else
7867                 chars = check_utf_8 (coding);
7868               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7869                   && coding->head_ascii == 0
7870                   && coding->source[0] == UTF_8_BOM_1
7871                   && coding->source[1] == UTF_8_BOM_2
7872                   && coding->source[2] == UTF_8_BOM_3)
7873                 {
7874                   chars--;
7875                   bytes -= 3;
7876                   coding->src_bytes -= 3;
7877                 }
7878             }
7879           else
7880             chars = -1;
7881         }
7882       if (chars >= 0)
7883         {
7884           Lisp_Object eol_type;
7885
7886           eol_type = CODING_ID_EOL_TYPE (coding->id);
7887           if (VECTORP (eol_type))
7888             {
7889               if (coding->eol_seen != EOL_SEEN_NONE)
7890                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7891             }
7892           if (EQ (eol_type, Qmac))
7893             {
7894               unsigned char *src_end = GAP_END_ADDR;
7895               unsigned char *src = src_end - coding->src_bytes;
7896
7897               while (src < src_end)
7898                 {
7899                   if (*src++ == '\r')
7900                     src[-1] = '\n';
7901                 }
7902             }
7903           else if (EQ (eol_type, Qdos))
7904             {
7905               unsigned char *src = GAP_END_ADDR;
7906               unsigned char *src_beg = src - coding->src_bytes;
7907               unsigned char *dst = src;
7908               ptrdiff_t diff;
7909
7910               while (src_beg < src)
7911                 {
7912                   *--dst = *--src;
7913                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7914                     src--;
7915                 }
7916               diff = dst - src;
7917               bytes -= diff;
7918               chars -= diff;
7919             }
7920           coding->produced = bytes;
7921           coding->produced_char = chars;
7922           insert_from_gap (chars, bytes, 1);
7923           return;
7924         }
7925     }
7926   code_conversion_save (0, 0);
7927
7928   coding->mode |= CODING_MODE_LAST_BLOCK;
7929   current_buffer->text->inhibit_shrinking = 1;
7930   decode_coding (coding);
7931   current_buffer->text->inhibit_shrinking = 0;
7932
7933   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7934     {
7935       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7936       Lisp_Object val;
7937
7938       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7939       val = call1 (CODING_ATTR_POST_READ (attrs),
7940                    make_number (coding->produced_char));
7941       CHECK_NATNUM (val);
7942       coding->produced_char += Z - prev_Z;
7943       coding->produced += Z_BYTE - prev_Z_BYTE;
7944     }
7945
7946   unbind_to (count, Qnil);
7947 }
7948
7949
7950 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7951    SRC_OBJECT into DST_OBJECT by coding context CODING.
7952
7953    SRC_OBJECT is a buffer, a string, or Qnil.
7954
7955    If it is a buffer, the text is at point of the buffer.  FROM and TO
7956    are positions in the buffer.
7957
7958    If it is a string, the text is at the beginning of the string.
7959    FROM and TO are indices to the string.
7960
7961    If it is nil, the text is at coding->source.  FROM and TO are
7962    indices to coding->source.
7963
7964    DST_OBJECT is a buffer, Qt, or Qnil.
7965
7966    If it is a buffer, the decoded text is inserted at point of the
7967    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7968    is deleted.
7969
7970    If it is Qt, a string is made from the decoded text, and
7971    set in CODING->dst_object.
7972
7973    If it is Qnil, the decoded text is stored at CODING->destination.
7974    The caller must allocate CODING->dst_bytes bytes at
7975    CODING->destination by xmalloc.  If the decoded text is longer than
7976    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7977  */
7978
7979 void
7980 decode_coding_object (struct coding_system *coding,
7981                       Lisp_Object src_object,
7982                       ptrdiff_t from, ptrdiff_t from_byte,
7983                       ptrdiff_t to, ptrdiff_t to_byte,
7984                       Lisp_Object dst_object)
7985 {
7986   ptrdiff_t count = SPECPDL_INDEX ();
7987   unsigned char *destination IF_LINT (= NULL);
7988   ptrdiff_t dst_bytes IF_LINT (= 0);
7989   ptrdiff_t chars = to - from;
7990   ptrdiff_t bytes = to_byte - from_byte;
7991   Lisp_Object attrs;
7992   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7993   bool need_marker_adjustment = 0;
7994   Lisp_Object old_deactivate_mark;
7995
7996   old_deactivate_mark = Vdeactivate_mark;
7997
7998   if (NILP (dst_object))
7999     {
8000       destination = coding->destination;
8001       dst_bytes = coding->dst_bytes;
8002     }
8003
8004   coding->src_object = src_object;
8005   coding->src_chars = chars;
8006   coding->src_bytes = bytes;
8007   coding->src_multibyte = chars < bytes;
8008
8009   if (STRINGP (src_object))
8010     {
8011       coding->src_pos = from;
8012       coding->src_pos_byte = from_byte;
8013     }
8014   else if (BUFFERP (src_object))
8015     {
8016       set_buffer_internal (XBUFFER (src_object));
8017       if (from != GPT)
8018         move_gap_both (from, from_byte);
8019       if (EQ (src_object, dst_object))
8020         {
8021           struct Lisp_Marker *tail;
8022
8023           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8024             {
8025               tail->need_adjustment
8026                 = tail->charpos == (tail->insertion_type ? from : to);
8027               need_marker_adjustment |= tail->need_adjustment;
8028             }
8029           saved_pt = PT, saved_pt_byte = PT_BYTE;
8030           TEMP_SET_PT_BOTH (from, from_byte);
8031           current_buffer->text->inhibit_shrinking = 1;
8032           del_range_both (from, from_byte, to, to_byte, 1);
8033           coding->src_pos = -chars;
8034           coding->src_pos_byte = -bytes;
8035         }
8036       else
8037         {
8038           coding->src_pos = from;
8039           coding->src_pos_byte = from_byte;
8040         }
8041     }
8042
8043   if (CODING_REQUIRE_DETECTION (coding))
8044     detect_coding (coding);
8045   attrs = CODING_ID_ATTRS (coding->id);
8046
8047   if (EQ (dst_object, Qt)
8048       || (! NILP (CODING_ATTR_POST_READ (attrs))
8049           && NILP (dst_object)))
8050     {
8051       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8052       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8053       coding->dst_pos = BEG;
8054       coding->dst_pos_byte = BEG_BYTE;
8055     }
8056   else if (BUFFERP (dst_object))
8057     {
8058       code_conversion_save (0, 0);
8059       coding->dst_object = dst_object;
8060       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8061       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8062       coding->dst_multibyte
8063         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8064     }
8065   else
8066     {
8067       code_conversion_save (0, 0);
8068       coding->dst_object = Qnil;
8069       /* Most callers presume this will return a multibyte result, and they
8070          won't use `binary' or `raw-text' anyway, so let's not worry about
8071          CODING_FOR_UNIBYTE.  */
8072       coding->dst_multibyte = 1;
8073     }
8074
8075   decode_coding (coding);
8076
8077   if (BUFFERP (coding->dst_object))
8078     set_buffer_internal (XBUFFER (coding->dst_object));
8079
8080   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8081     {
8082       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8083       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8084       Lisp_Object val;
8085
8086       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8087       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8088               old_deactivate_mark);
8089       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8090                         make_number (coding->produced_char));
8091       UNGCPRO;
8092       CHECK_NATNUM (val);
8093       coding->produced_char += Z - prev_Z;
8094       coding->produced += Z_BYTE - prev_Z_BYTE;
8095     }
8096
8097   if (EQ (dst_object, Qt))
8098     {
8099       coding->dst_object = Fbuffer_string ();
8100     }
8101   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8102     {
8103       set_buffer_internal (XBUFFER (coding->dst_object));
8104       if (dst_bytes < coding->produced)
8105         {
8106           eassert (coding->produced > 0);
8107           destination = xrealloc (destination, coding->produced);
8108           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8109             move_gap_both (BEGV, BEGV_BYTE);
8110           memcpy (destination, BEGV_ADDR, coding->produced);
8111           coding->destination = destination;
8112         }
8113     }
8114
8115   if (saved_pt >= 0)
8116     {
8117       /* This is the case of:
8118          (BUFFERP (src_object) && EQ (src_object, dst_object))
8119          As we have moved PT while replacing the original buffer
8120          contents, we must recover it now.  */
8121       set_buffer_internal (XBUFFER (src_object));
8122       current_buffer->text->inhibit_shrinking = 0;
8123       if (saved_pt < from)
8124         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8125       else if (saved_pt < from + chars)
8126         TEMP_SET_PT_BOTH (from, from_byte);
8127       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8128         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8129                           saved_pt_byte + (coding->produced - bytes));
8130       else
8131         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8132                           saved_pt_byte + (coding->produced - bytes));
8133
8134       if (need_marker_adjustment)
8135         {
8136           struct Lisp_Marker *tail;
8137
8138           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8139             if (tail->need_adjustment)
8140               {
8141                 tail->need_adjustment = 0;
8142                 if (tail->insertion_type)
8143                   {
8144                     tail->bytepos = from_byte;
8145                     tail->charpos = from;
8146                   }
8147                 else
8148                   {
8149                     tail->bytepos = from_byte + coding->produced;
8150                     tail->charpos
8151                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8152                          ? tail->bytepos : from + coding->produced_char);
8153                   }
8154               }
8155         }
8156     }
8157
8158   Vdeactivate_mark = old_deactivate_mark;
8159   unbind_to (count, coding->dst_object);
8160 }
8161
8162
8163 void
8164 encode_coding_object (struct coding_system *coding,
8165                       Lisp_Object src_object,
8166                       ptrdiff_t from, ptrdiff_t from_byte,
8167                       ptrdiff_t to, ptrdiff_t to_byte,
8168                       Lisp_Object dst_object)
8169 {
8170   ptrdiff_t count = SPECPDL_INDEX ();
8171   ptrdiff_t chars = to - from;
8172   ptrdiff_t bytes = to_byte - from_byte;
8173   Lisp_Object attrs;
8174   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8175   bool need_marker_adjustment = 0;
8176   bool kill_src_buffer = 0;
8177   Lisp_Object old_deactivate_mark;
8178
8179   old_deactivate_mark = Vdeactivate_mark;
8180
8181   coding->src_object = src_object;
8182   coding->src_chars = chars;
8183   coding->src_bytes = bytes;
8184   coding->src_multibyte = chars < bytes;
8185
8186   attrs = CODING_ID_ATTRS (coding->id);
8187
8188   if (EQ (src_object, dst_object))
8189     {
8190       struct Lisp_Marker *tail;
8191
8192       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8193         {
8194           tail->need_adjustment
8195             = tail->charpos == (tail->insertion_type ? from : to);
8196           need_marker_adjustment |= tail->need_adjustment;
8197         }
8198     }
8199
8200   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8201     {
8202       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8203       set_buffer_internal (XBUFFER (coding->src_object));
8204       if (STRINGP (src_object))
8205         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8206       else if (BUFFERP (src_object))
8207         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8208       else
8209         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8210
8211       if (EQ (src_object, dst_object))
8212         {
8213           set_buffer_internal (XBUFFER (src_object));
8214           saved_pt = PT, saved_pt_byte = PT_BYTE;
8215           del_range_both (from, from_byte, to, to_byte, 1);
8216           set_buffer_internal (XBUFFER (coding->src_object));
8217         }
8218
8219       {
8220         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8221
8222         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8223                 old_deactivate_mark);
8224         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8225                     make_number (BEG), make_number (Z));
8226         UNGCPRO;
8227       }
8228       if (XBUFFER (coding->src_object) != current_buffer)
8229         kill_src_buffer = 1;
8230       coding->src_object = Fcurrent_buffer ();
8231       if (BEG != GPT)
8232         move_gap_both (BEG, BEG_BYTE);
8233       coding->src_chars = Z - BEG;
8234       coding->src_bytes = Z_BYTE - BEG_BYTE;
8235       coding->src_pos = BEG;
8236       coding->src_pos_byte = BEG_BYTE;
8237       coding->src_multibyte = Z < Z_BYTE;
8238     }
8239   else if (STRINGP (src_object))
8240     {
8241       code_conversion_save (0, 0);
8242       coding->src_pos = from;
8243       coding->src_pos_byte = from_byte;
8244     }
8245   else if (BUFFERP (src_object))
8246     {
8247       code_conversion_save (0, 0);
8248       set_buffer_internal (XBUFFER (src_object));
8249       if (EQ (src_object, dst_object))
8250         {
8251           saved_pt = PT, saved_pt_byte = PT_BYTE;
8252           coding->src_object = del_range_1 (from, to, 1, 1);
8253           coding->src_pos = 0;
8254           coding->src_pos_byte = 0;
8255         }
8256       else
8257         {
8258           if (from < GPT && to >= GPT)
8259             move_gap_both (from, from_byte);
8260           coding->src_pos = from;
8261           coding->src_pos_byte = from_byte;
8262         }
8263     }
8264   else
8265     code_conversion_save (0, 0);
8266
8267   if (BUFFERP (dst_object))
8268     {
8269       coding->dst_object = dst_object;
8270       if (EQ (src_object, dst_object))
8271         {
8272           coding->dst_pos = from;
8273           coding->dst_pos_byte = from_byte;
8274         }
8275       else
8276         {
8277           struct buffer *current = current_buffer;
8278
8279           set_buffer_temp (XBUFFER (dst_object));
8280           coding->dst_pos = PT;
8281           coding->dst_pos_byte = PT_BYTE;
8282           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8283           set_buffer_temp (current);
8284         }
8285       coding->dst_multibyte
8286         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8287     }
8288   else if (EQ (dst_object, Qt))
8289     {
8290       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8291       coding->dst_object = Qnil;
8292       coding->destination = xmalloc (dst_bytes);
8293       coding->dst_bytes = dst_bytes;
8294       coding->dst_multibyte = 0;
8295     }
8296   else
8297     {
8298       coding->dst_object = Qnil;
8299       coding->dst_multibyte = 0;
8300     }
8301
8302   encode_coding (coding);
8303
8304   if (EQ (dst_object, Qt))
8305     {
8306       if (BUFFERP (coding->dst_object))
8307         coding->dst_object = Fbuffer_string ();
8308       else
8309         {
8310           coding->dst_object
8311             = make_unibyte_string ((char *) coding->destination,
8312                                    coding->produced);
8313           xfree (coding->destination);
8314         }
8315     }
8316
8317   if (saved_pt >= 0)
8318     {
8319       /* This is the case of:
8320          (BUFFERP (src_object) && EQ (src_object, dst_object))
8321          As we have moved PT while replacing the original buffer
8322          contents, we must recover it now.  */
8323       set_buffer_internal (XBUFFER (src_object));
8324       if (saved_pt < from)
8325         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8326       else if (saved_pt < from + chars)
8327         TEMP_SET_PT_BOTH (from, from_byte);
8328       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8329         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8330                           saved_pt_byte + (coding->produced - bytes));
8331       else
8332         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8333                           saved_pt_byte + (coding->produced - bytes));
8334
8335       if (need_marker_adjustment)
8336         {
8337           struct Lisp_Marker *tail;
8338
8339           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8340             if (tail->need_adjustment)
8341               {
8342                 tail->need_adjustment = 0;
8343                 if (tail->insertion_type)
8344                   {
8345                     tail->bytepos = from_byte;
8346                     tail->charpos = from;
8347                   }
8348                 else
8349                   {
8350                     tail->bytepos = from_byte + coding->produced;
8351                     tail->charpos
8352                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8353                          ? tail->bytepos : from + coding->produced_char);
8354                   }
8355               }
8356         }
8357     }
8358
8359   if (kill_src_buffer)
8360     Fkill_buffer (coding->src_object);
8361
8362   Vdeactivate_mark = old_deactivate_mark;
8363   unbind_to (count, Qnil);
8364 }
8365
8366
8367 Lisp_Object
8368 preferred_coding_system (void)
8369 {
8370   int id = coding_categories[coding_priorities[0]].id;
8371
8372   return CODING_ID_NAME (id);
8373 }
8374
8375 #if defined (WINDOWSNT) || defined (CYGWIN)
8376
8377 Lisp_Object
8378 from_unicode (Lisp_Object str)
8379 {
8380   CHECK_STRING (str);
8381   if (!STRING_MULTIBYTE (str) &&
8382       SBYTES (str) & 1)
8383     {
8384       str = Fsubstring (str, make_number (0), make_number (-1));
8385     }
8386
8387   return code_convert_string_norecord (str, Qutf_16le, 0);
8388 }
8389
8390 Lisp_Object
8391 from_unicode_buffer (const wchar_t* wstr)
8392 {
8393     return from_unicode (
8394         make_unibyte_string (
8395             (char*) wstr,
8396             /* we get one of the two final 0 bytes for free. */
8397             1 + sizeof (wchar_t) * wcslen (wstr)));
8398 }
8399
8400 wchar_t *
8401 to_unicode (Lisp_Object str, Lisp_Object *buf)
8402 {
8403   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8404   /* We need to make another copy (in addition to the one made by
8405      code_convert_string_norecord) to ensure that the final string is
8406      _doubly_ zero terminated --- that is, that the string is
8407      terminated by two zero bytes and one utf-16le null character.
8408      Because strings are already terminated with a single zero byte,
8409      we just add one additional zero. */
8410   str = make_uninit_string (SBYTES (*buf) + 1);
8411   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8412   SDATA (str) [SBYTES (*buf)] = '\0';
8413   *buf = str;
8414   return WCSDATA (*buf);
8415 }
8416
8417 #endif /* WINDOWSNT || CYGWIN */
8418
8419 \f
8420 #ifdef emacs
8421 /*** 8. Emacs Lisp library functions ***/
8422
8423 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8424        doc: /* Return t if OBJECT is nil or a coding-system.
8425 See the documentation of `define-coding-system' for information
8426 about coding-system objects.  */)
8427   (Lisp_Object object)
8428 {
8429   if (NILP (object)
8430       || CODING_SYSTEM_ID (object) >= 0)
8431     return Qt;
8432   if (! SYMBOLP (object)
8433       || NILP (Fget (object, Qcoding_system_define_form)))
8434     return Qnil;
8435   return Qt;
8436 }
8437
8438 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8439        Sread_non_nil_coding_system, 1, 1, 0,
8440        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8441   (Lisp_Object prompt)
8442 {
8443   Lisp_Object val;
8444   do
8445     {
8446       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8447                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8448     }
8449   while (SCHARS (val) == 0);
8450   return (Fintern (val, Qnil));
8451 }
8452
8453 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8454        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8455 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8456 Ignores case when completing coding systems (all Emacs coding systems
8457 are lower-case).  */)
8458   (Lisp_Object prompt, Lisp_Object default_coding_system)
8459 {
8460   Lisp_Object val;
8461   ptrdiff_t count = SPECPDL_INDEX ();
8462
8463   if (SYMBOLP (default_coding_system))
8464     default_coding_system = SYMBOL_NAME (default_coding_system);
8465   specbind (Qcompletion_ignore_case, Qt);
8466   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8467                           Qt, Qnil, Qcoding_system_history,
8468                           default_coding_system, Qnil);
8469   unbind_to (count, Qnil);
8470   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8471 }
8472
8473 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8474        1, 1, 0,
8475        doc: /* Check validity of CODING-SYSTEM.
8476 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8477 It is valid if it is nil or a symbol defined as a coding system by the
8478 function `define-coding-system'.  */)
8479   (Lisp_Object coding_system)
8480 {
8481   Lisp_Object define_form;
8482
8483   define_form = Fget (coding_system, Qcoding_system_define_form);
8484   if (! NILP (define_form))
8485     {
8486       Fput (coding_system, Qcoding_system_define_form, Qnil);
8487       safe_eval (define_form);
8488     }
8489   if (!NILP (Fcoding_system_p (coding_system)))
8490     return coding_system;
8491   xsignal1 (Qcoding_system_error, coding_system);
8492 }
8493
8494 \f
8495 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8496    HIGHEST, return the coding system of the highest
8497    priority among the detected coding systems.  Otherwise return a
8498    list of detected coding systems sorted by their priorities.  If
8499    MULTIBYTEP, it is assumed that the bytes are in correct
8500    multibyte form but contains only ASCII and eight-bit chars.
8501    Otherwise, the bytes are raw bytes.
8502
8503    CODING-SYSTEM controls the detection as below:
8504
8505    If it is nil, detect both text-format and eol-format.  If the
8506    text-format part of CODING-SYSTEM is already specified
8507    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8508    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8509    detect only text-format.  */
8510
8511 Lisp_Object
8512 detect_coding_system (const unsigned char *src,
8513                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8514                       bool highest, bool multibytep,
8515                       Lisp_Object coding_system)
8516 {
8517   const unsigned char *src_end = src + src_bytes;
8518   Lisp_Object attrs, eol_type;
8519   Lisp_Object val = Qnil;
8520   struct coding_system coding;
8521   ptrdiff_t id;
8522   struct coding_detection_info detect_info;
8523   enum coding_category base_category;
8524   bool null_byte_found = 0, eight_bit_found = 0;
8525
8526   if (NILP (coding_system))
8527     coding_system = Qundecided;
8528   setup_coding_system (coding_system, &coding);
8529   attrs = CODING_ID_ATTRS (coding.id);
8530   eol_type = CODING_ID_EOL_TYPE (coding.id);
8531   coding_system = CODING_ATTR_BASE_NAME (attrs);
8532
8533   coding.source = src;
8534   coding.src_chars = src_chars;
8535   coding.src_bytes = src_bytes;
8536   coding.src_multibyte = multibytep;
8537   coding.consumed = 0;
8538   coding.mode |= CODING_MODE_LAST_BLOCK;
8539   coding.head_ascii = 0;
8540
8541   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8542
8543   /* At first, detect text-format if necessary.  */
8544   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8545   if (base_category == coding_category_undecided)
8546     {
8547       enum coding_category category IF_LINT (= 0);
8548       struct coding_system *this IF_LINT (= NULL);
8549       int c, i;
8550       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8551                                        inhibit_null_byte_detection);
8552       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8553                                        inhibit_iso_escape_detection);
8554       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8555
8556       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8557       for (; src < src_end; src++)
8558         {
8559           c = *src;
8560           if (c & 0x80)
8561             {
8562               eight_bit_found = 1;
8563               if (null_byte_found)
8564                 break;
8565             }
8566           else if (c < 0x20)
8567             {
8568               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8569                   && ! inhibit_ied
8570                   && ! detect_info.checked)
8571                 {
8572                   if (detect_coding_iso_2022 (&coding, &detect_info))
8573                     {
8574                       /* We have scanned the whole data.  */
8575                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8576                         {
8577                           /* We didn't find an 8-bit code.  We may
8578                              have found a null-byte, but it's very
8579                              rare that a binary file confirm to
8580                              ISO-2022.  */
8581                           src = src_end;
8582                           coding.head_ascii = src - coding.source;
8583                         }
8584                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8585                       break;
8586                     }
8587                 }
8588               else if (! c && !inhibit_nbd)
8589                 {
8590                   null_byte_found = 1;
8591                   if (eight_bit_found)
8592                     break;
8593                 }
8594               if (! eight_bit_found)
8595                 coding.head_ascii++;
8596             }
8597           else if (! eight_bit_found)
8598             coding.head_ascii++;
8599         }
8600
8601       if (null_byte_found || eight_bit_found
8602           || coding.head_ascii < coding.src_bytes
8603           || detect_info.found)
8604         {
8605           if (coding.head_ascii == coding.src_bytes)
8606             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8607             for (i = 0; i < coding_category_raw_text; i++)
8608               {
8609                 category = coding_priorities[i];
8610                 this = coding_categories + category;
8611                 if (detect_info.found & (1 << category))
8612                   break;
8613               }
8614           else
8615             {
8616               if (null_byte_found)
8617                 {
8618                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8619                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8620                 }
8621               else if (prefer_utf_8
8622                        && detect_coding_utf_8 (&coding, &detect_info))
8623                 {
8624                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8625                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8626                 }
8627               for (i = 0; i < coding_category_raw_text; i++)
8628                 {
8629                   category = coding_priorities[i];
8630                   this = coding_categories + category;
8631
8632                   if (this->id < 0)
8633                     {
8634                       /* No coding system of this category is defined.  */
8635                       detect_info.rejected |= (1 << category);
8636                     }
8637                   else if (category >= coding_category_raw_text)
8638                     continue;
8639                   else if (detect_info.checked & (1 << category))
8640                     {
8641                       if (highest
8642                           && (detect_info.found & (1 << category)))
8643                         break;
8644                     }
8645                   else if ((*(this->detector)) (&coding, &detect_info)
8646                            && highest
8647                            && (detect_info.found & (1 << category)))
8648                     {
8649                       if (category == coding_category_utf_16_auto)
8650                         {
8651                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8652                             category = coding_category_utf_16_le;
8653                           else
8654                             category = coding_category_utf_16_be;
8655                         }
8656                       break;
8657                     }
8658                 }
8659             }
8660         }
8661
8662       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8663           || null_byte_found)
8664         {
8665           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8666           id = CODING_SYSTEM_ID (Qno_conversion);
8667           val = list1 (make_number (id));
8668         }
8669       else if (! detect_info.rejected && ! detect_info.found)
8670         {
8671           detect_info.found = CATEGORY_MASK_ANY;
8672           id = coding_categories[coding_category_undecided].id;
8673           val = list1 (make_number (id));
8674         }
8675       else if (highest)
8676         {
8677           if (detect_info.found)
8678             {
8679               detect_info.found = 1 << category;
8680               val = list1 (make_number (this->id));
8681             }
8682           else
8683             for (i = 0; i < coding_category_raw_text; i++)
8684               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8685                 {
8686                   detect_info.found = 1 << coding_priorities[i];
8687                   id = coding_categories[coding_priorities[i]].id;
8688                   val = list1 (make_number (id));
8689                   break;
8690                 }
8691         }
8692       else
8693         {
8694           int mask = detect_info.rejected | detect_info.found;
8695           int found = 0;
8696
8697           for (i = coding_category_raw_text - 1; i >= 0; i--)
8698             {
8699               category = coding_priorities[i];
8700               if (! (mask & (1 << category)))
8701                 {
8702                   found |= 1 << category;
8703                   id = coding_categories[category].id;
8704                   if (id >= 0)
8705                     val = list1 (make_number (id));
8706                 }
8707             }
8708           for (i = coding_category_raw_text - 1; i >= 0; i--)
8709             {
8710               category = coding_priorities[i];
8711               if (detect_info.found & (1 << category))
8712                 {
8713                   id = coding_categories[category].id;
8714                   val = Fcons (make_number (id), val);
8715                 }
8716             }
8717           detect_info.found |= found;
8718         }
8719     }
8720   else if (base_category == coding_category_utf_8_auto)
8721     {
8722       if (detect_coding_utf_8 (&coding, &detect_info))
8723         {
8724           struct coding_system *this;
8725
8726           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8727             this = coding_categories + coding_category_utf_8_sig;
8728           else
8729             this = coding_categories + coding_category_utf_8_nosig;
8730           val = list1 (make_number (this->id));
8731         }
8732     }
8733   else if (base_category == coding_category_utf_16_auto)
8734     {
8735       if (detect_coding_utf_16 (&coding, &detect_info))
8736         {
8737           struct coding_system *this;
8738
8739           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8740             this = coding_categories + coding_category_utf_16_le;
8741           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8742             this = coding_categories + coding_category_utf_16_be;
8743           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8744             this = coding_categories + coding_category_utf_16_be_nosig;
8745           else
8746             this = coding_categories + coding_category_utf_16_le_nosig;
8747           val = list1 (make_number (this->id));
8748         }
8749     }
8750   else
8751     {
8752       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8753       val = list1 (make_number (coding.id));
8754     }
8755
8756   /* Then, detect eol-format if necessary.  */
8757   {
8758     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8759     Lisp_Object tail;
8760
8761     if (VECTORP (eol_type))
8762       {
8763         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8764           {
8765             if (null_byte_found)
8766               normal_eol = EOL_SEEN_LF;
8767             else
8768               normal_eol = detect_eol (coding.source, src_bytes,
8769                                        coding_category_raw_text);
8770           }
8771         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8772                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8773           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8774                                       coding_category_utf_16_be);
8775         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8776                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8777           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8778                                       coding_category_utf_16_le);
8779       }
8780     else
8781       {
8782         if (EQ (eol_type, Qunix))
8783           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8784         else if (EQ (eol_type, Qdos))
8785           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8786         else
8787           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8788       }
8789
8790     for (tail = val; CONSP (tail); tail = XCDR (tail))
8791       {
8792         enum coding_category category;
8793         int this_eol;
8794
8795         id = XINT (XCAR (tail));
8796         attrs = CODING_ID_ATTRS (id);
8797         category = XINT (CODING_ATTR_CATEGORY (attrs));
8798         eol_type = CODING_ID_EOL_TYPE (id);
8799         if (VECTORP (eol_type))
8800           {
8801             if (category == coding_category_utf_16_be
8802                 || category == coding_category_utf_16_be_nosig)
8803               this_eol = utf_16_be_eol;
8804             else if (category == coding_category_utf_16_le
8805                      || category == coding_category_utf_16_le_nosig)
8806               this_eol = utf_16_le_eol;
8807             else
8808               this_eol = normal_eol;
8809
8810             if (this_eol == EOL_SEEN_LF)
8811               XSETCAR (tail, AREF (eol_type, 0));
8812             else if (this_eol == EOL_SEEN_CRLF)
8813               XSETCAR (tail, AREF (eol_type, 1));
8814             else if (this_eol == EOL_SEEN_CR)
8815               XSETCAR (tail, AREF (eol_type, 2));
8816             else
8817               XSETCAR (tail, CODING_ID_NAME (id));
8818           }
8819         else
8820           XSETCAR (tail, CODING_ID_NAME (id));
8821       }
8822   }
8823
8824   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8825 }
8826
8827
8828 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8829        2, 3, 0,
8830        doc: /* Detect coding system of the text in the region between START and END.
8831 Return a list of possible coding systems ordered by priority.
8832 The coding systems to try and their priorities follows what
8833 the function `coding-system-priority-list' (which see) returns.
8834
8835 If only ASCII characters are found (except for such ISO-2022 control
8836 characters as ESC), it returns a list of single element `undecided'
8837 or its subsidiary coding system according to a detected end-of-line
8838 format.
8839
8840 If optional argument HIGHEST is non-nil, return the coding system of
8841 highest priority.  */)
8842   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8843 {
8844   ptrdiff_t from, to;
8845   ptrdiff_t from_byte, to_byte;
8846
8847   validate_region (&start, &end);
8848   from = XINT (start), to = XINT (end);
8849   from_byte = CHAR_TO_BYTE (from);
8850   to_byte = CHAR_TO_BYTE (to);
8851
8852   if (from < GPT && to >= GPT)
8853     move_gap_both (to, to_byte);
8854
8855   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8856                                to - from, to_byte - from_byte,
8857                                !NILP (highest),
8858                                !NILP (BVAR (current_buffer
8859                                       , enable_multibyte_characters)),
8860                                Qnil);
8861 }
8862
8863 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8864        1, 2, 0,
8865        doc: /* Detect coding system of the text in STRING.
8866 Return a list of possible coding systems ordered by priority.
8867 The coding systems to try and their priorities follows what
8868 the function `coding-system-priority-list' (which see) returns.
8869
8870 If only ASCII characters are found (except for such ISO-2022 control
8871 characters as ESC), it returns a list of single element `undecided'
8872 or its subsidiary coding system according to a detected end-of-line
8873 format.
8874
8875 If optional argument HIGHEST is non-nil, return the coding system of
8876 highest priority.  */)
8877   (Lisp_Object string, Lisp_Object highest)
8878 {
8879   CHECK_STRING (string);
8880
8881   return detect_coding_system (SDATA (string),
8882                                SCHARS (string), SBYTES (string),
8883                                !NILP (highest), STRING_MULTIBYTE (string),
8884                                Qnil);
8885 }
8886
8887
8888 static bool
8889 char_encodable_p (int c, Lisp_Object attrs)
8890 {
8891   Lisp_Object tail;
8892   struct charset *charset;
8893   Lisp_Object translation_table;
8894
8895   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8896   if (! NILP (translation_table))
8897     c = translate_char (translation_table, c);
8898   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8899        CONSP (tail); tail = XCDR (tail))
8900     {
8901       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8902       if (CHAR_CHARSET_P (c, charset))
8903         break;
8904     }
8905   return (! NILP (tail));
8906 }
8907
8908
8909 /* Return a list of coding systems that safely encode the text between
8910    START and END.  If EXCLUDE is non-nil, it is a list of coding
8911    systems not to check.  The returned list doesn't contain any such
8912    coding systems.  In any case, if the text contains only ASCII or is
8913    unibyte, return t.  */
8914
8915 DEFUN ("find-coding-systems-region-internal",
8916        Ffind_coding_systems_region_internal,
8917        Sfind_coding_systems_region_internal, 2, 3, 0,
8918        doc: /* Internal use only.  */)
8919   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8920 {
8921   Lisp_Object coding_attrs_list, safe_codings;
8922   ptrdiff_t start_byte, end_byte;
8923   const unsigned char *p, *pbeg, *pend;
8924   int c;
8925   Lisp_Object tail, elt, work_table;
8926
8927   if (STRINGP (start))
8928     {
8929       if (!STRING_MULTIBYTE (start)
8930           || SCHARS (start) == SBYTES (start))
8931         return Qt;
8932       start_byte = 0;
8933       end_byte = SBYTES (start);
8934     }
8935   else
8936     {
8937       CHECK_NUMBER_COERCE_MARKER (start);
8938       CHECK_NUMBER_COERCE_MARKER (end);
8939       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8940         args_out_of_range (start, end);
8941       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8942         return Qt;
8943       start_byte = CHAR_TO_BYTE (XINT (start));
8944       end_byte = CHAR_TO_BYTE (XINT (end));
8945       if (XINT (end) - XINT (start) == end_byte - start_byte)
8946         return Qt;
8947
8948       if (XINT (start) < GPT && XINT (end) > GPT)
8949         {
8950           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8951             move_gap_both (XINT (start), start_byte);
8952           else
8953             move_gap_both (XINT (end), end_byte);
8954         }
8955     }
8956
8957   coding_attrs_list = Qnil;
8958   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8959     if (NILP (exclude)
8960         || NILP (Fmemq (XCAR (tail), exclude)))
8961       {
8962         Lisp_Object attrs;
8963
8964         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8965         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
8966           {
8967             ASET (attrs, coding_attr_trans_tbl,
8968                   get_translation_table (attrs, 1, NULL));
8969             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8970           }
8971       }
8972
8973   if (STRINGP (start))
8974     p = pbeg = SDATA (start);
8975   else
8976     p = pbeg = BYTE_POS_ADDR (start_byte);
8977   pend = p + (end_byte - start_byte);
8978
8979   while (p < pend && ASCII_BYTE_P (*p)) p++;
8980   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8981
8982   work_table = Fmake_char_table (Qnil, Qnil);
8983   while (p < pend)
8984     {
8985       if (ASCII_BYTE_P (*p))
8986         p++;
8987       else
8988         {
8989           c = STRING_CHAR_ADVANCE (p);
8990           if (!NILP (char_table_ref (work_table, c)))
8991             /* This character was already checked.  Ignore it.  */
8992             continue;
8993
8994           charset_map_loaded = 0;
8995           for (tail = coding_attrs_list; CONSP (tail);)
8996             {
8997               elt = XCAR (tail);
8998               if (NILP (elt))
8999                 tail = XCDR (tail);
9000               else if (char_encodable_p (c, elt))
9001                 tail = XCDR (tail);
9002               else if (CONSP (XCDR (tail)))
9003                 {
9004                   XSETCAR (tail, XCAR (XCDR (tail)));
9005                   XSETCDR (tail, XCDR (XCDR (tail)));
9006                 }
9007               else
9008                 {
9009                   XSETCAR (tail, Qnil);
9010                   tail = XCDR (tail);
9011                 }
9012             }
9013           if (charset_map_loaded)
9014             {
9015               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9016
9017               if (STRINGP (start))
9018                 pbeg = SDATA (start);
9019               else
9020                 pbeg = BYTE_POS_ADDR (start_byte);
9021               p = pbeg + p_offset;
9022               pend = pbeg + pend_offset;
9023             }
9024           char_table_set (work_table, c, Qt);
9025         }
9026     }
9027
9028   safe_codings = list2 (Qraw_text, Qno_conversion);
9029   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9030     if (! NILP (XCAR (tail)))
9031       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9032
9033   return safe_codings;
9034 }
9035
9036
9037 DEFUN ("unencodable-char-position", Funencodable_char_position,
9038        Sunencodable_char_position, 3, 5, 0,
9039        doc: /*
9040 Return position of first un-encodable character in a region.
9041 START and END specify the region and CODING-SYSTEM specifies the
9042 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9043
9044 If optional 4th argument COUNT is non-nil, it specifies at most how
9045 many un-encodable characters to search.  In this case, the value is a
9046 list of positions.
9047
9048 If optional 5th argument STRING is non-nil, it is a string to search
9049 for un-encodable characters.  In that case, START and END are indexes
9050 to the string.  */)
9051   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9052 {
9053   EMACS_INT n;
9054   struct coding_system coding;
9055   Lisp_Object attrs, charset_list, translation_table;
9056   Lisp_Object positions;
9057   ptrdiff_t from, to;
9058   const unsigned char *p, *stop, *pend;
9059   bool ascii_compatible;
9060
9061   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9062   attrs = CODING_ID_ATTRS (coding.id);
9063   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9064     return Qnil;
9065   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9066   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9067   translation_table = get_translation_table (attrs, 1, NULL);
9068
9069   if (NILP (string))
9070     {
9071       validate_region (&start, &end);
9072       from = XINT (start);
9073       to = XINT (end);
9074       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9075           || (ascii_compatible
9076               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9077         return Qnil;
9078       p = CHAR_POS_ADDR (from);
9079       pend = CHAR_POS_ADDR (to);
9080       if (from < GPT && to >= GPT)
9081         stop = GPT_ADDR;
9082       else
9083         stop = pend;
9084     }
9085   else
9086     {
9087       CHECK_STRING (string);
9088       CHECK_NATNUM (start);
9089       CHECK_NATNUM (end);
9090       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9091         args_out_of_range_3 (string, start, end);
9092       from = XINT (start);
9093       to = XINT (end);
9094       if (! STRING_MULTIBYTE (string))
9095         return Qnil;
9096       p = SDATA (string) + string_char_to_byte (string, from);
9097       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9098       if (ascii_compatible && (to - from) == (pend - p))
9099         return Qnil;
9100     }
9101
9102   if (NILP (count))
9103     n = 1;
9104   else
9105     {
9106       CHECK_NATNUM (count);
9107       n = XINT (count);
9108     }
9109
9110   positions = Qnil;
9111   charset_map_loaded = 0;
9112   while (1)
9113     {
9114       int c;
9115
9116       if (ascii_compatible)
9117         while (p < stop && ASCII_BYTE_P (*p))
9118           p++, from++;
9119       if (p >= stop)
9120         {
9121           if (p >= pend)
9122             break;
9123           stop = pend;
9124           p = GAP_END_ADDR;
9125         }
9126
9127       c = STRING_CHAR_ADVANCE (p);
9128       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9129           && ! char_charset (translate_char (translation_table, c),
9130                              charset_list, NULL))
9131         {
9132           positions = Fcons (make_number (from), positions);
9133           n--;
9134           if (n == 0)
9135             break;
9136         }
9137
9138       from++;
9139       if (charset_map_loaded && NILP (string))
9140         {
9141           p = CHAR_POS_ADDR (from);
9142           pend = CHAR_POS_ADDR (to);
9143           if (from < GPT && to >= GPT)
9144             stop = GPT_ADDR;
9145           else
9146             stop = pend;
9147           charset_map_loaded = 0;
9148         }
9149     }
9150
9151   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9152 }
9153
9154
9155 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9156        Scheck_coding_systems_region, 3, 3, 0,
9157        doc: /* Check if the region is encodable by coding systems.
9158
9159 START and END are buffer positions specifying the region.
9160 CODING-SYSTEM-LIST is a list of coding systems to check.
9161
9162 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9163 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9164 whole region, POS0, POS1, ... are buffer positions where non-encodable
9165 characters are found.
9166
9167 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9168 value is nil.
9169
9170 START may be a string.  In that case, check if the string is
9171 encodable, and the value contains indices to the string instead of
9172 buffer positions.  END is ignored.
9173
9174 If the current buffer (or START if it is a string) is unibyte, the value
9175 is nil.  */)
9176   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9177 {
9178   Lisp_Object list;
9179   ptrdiff_t start_byte, end_byte;
9180   ptrdiff_t pos;
9181   const unsigned char *p, *pbeg, *pend;
9182   int c;
9183   Lisp_Object tail, elt, attrs;
9184
9185   if (STRINGP (start))
9186     {
9187       if (!STRING_MULTIBYTE (start)
9188           || SCHARS (start) == SBYTES (start))
9189         return Qnil;
9190       start_byte = 0;
9191       end_byte = SBYTES (start);
9192       pos = 0;
9193     }
9194   else
9195     {
9196       CHECK_NUMBER_COERCE_MARKER (start);
9197       CHECK_NUMBER_COERCE_MARKER (end);
9198       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9199         args_out_of_range (start, end);
9200       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9201         return Qnil;
9202       start_byte = CHAR_TO_BYTE (XINT (start));
9203       end_byte = CHAR_TO_BYTE (XINT (end));
9204       if (XINT (end) - XINT (start) == end_byte - start_byte)
9205         return Qnil;
9206
9207       if (XINT (start) < GPT && XINT (end) > GPT)
9208         {
9209           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9210             move_gap_both (XINT (start), start_byte);
9211           else
9212             move_gap_both (XINT (end), end_byte);
9213         }
9214       pos = XINT (start);
9215     }
9216
9217   list = Qnil;
9218   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9219     {
9220       elt = XCAR (tail);
9221       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9222       ASET (attrs, coding_attr_trans_tbl,
9223             get_translation_table (attrs, 1, NULL));
9224       list = Fcons (list2 (elt, attrs), list);
9225     }
9226
9227   if (STRINGP (start))
9228     p = pbeg = SDATA (start);
9229   else
9230     p = pbeg = BYTE_POS_ADDR (start_byte);
9231   pend = p + (end_byte - start_byte);
9232
9233   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9234   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9235
9236   while (p < pend)
9237     {
9238       if (ASCII_BYTE_P (*p))
9239         p++;
9240       else
9241         {
9242           c = STRING_CHAR_ADVANCE (p);
9243
9244           charset_map_loaded = 0;
9245           for (tail = list; CONSP (tail); tail = XCDR (tail))
9246             {
9247               elt = XCDR (XCAR (tail));
9248               if (! char_encodable_p (c, XCAR (elt)))
9249                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9250             }
9251           if (charset_map_loaded)
9252             {
9253               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9254
9255               if (STRINGP (start))
9256                 pbeg = SDATA (start);
9257               else
9258                 pbeg = BYTE_POS_ADDR (start_byte);
9259               p = pbeg + p_offset;
9260               pend = pbeg + pend_offset;
9261             }
9262         }
9263       pos++;
9264     }
9265
9266   tail = list;
9267   list = Qnil;
9268   for (; CONSP (tail); tail = XCDR (tail))
9269     {
9270       elt = XCAR (tail);
9271       if (CONSP (XCDR (XCDR (elt))))
9272         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9273                       list);
9274     }
9275
9276   return list;
9277 }
9278
9279
9280 static Lisp_Object
9281 code_convert_region (Lisp_Object start, Lisp_Object end,
9282                      Lisp_Object coding_system, Lisp_Object dst_object,
9283                      bool encodep, bool norecord)
9284 {
9285   struct coding_system coding;
9286   ptrdiff_t from, from_byte, to, to_byte;
9287   Lisp_Object src_object;
9288
9289   if (NILP (coding_system))
9290     coding_system = Qno_conversion;
9291   else
9292     CHECK_CODING_SYSTEM (coding_system);
9293   src_object = Fcurrent_buffer ();
9294   if (NILP (dst_object))
9295     dst_object = src_object;
9296   else if (! EQ (dst_object, Qt))
9297     CHECK_BUFFER (dst_object);
9298
9299   validate_region (&start, &end);
9300   from = XFASTINT (start);
9301   from_byte = CHAR_TO_BYTE (from);
9302   to = XFASTINT (end);
9303   to_byte = CHAR_TO_BYTE (to);
9304
9305   setup_coding_system (coding_system, &coding);
9306   coding.mode |= CODING_MODE_LAST_BLOCK;
9307
9308   if (encodep)
9309     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9310                           dst_object);
9311   else
9312     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9313                           dst_object);
9314   if (! norecord)
9315     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9316
9317   return (BUFFERP (dst_object)
9318           ? make_number (coding.produced_char)
9319           : coding.dst_object);
9320 }
9321
9322
9323 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9324        3, 4, "r\nzCoding system: ",
9325        doc: /* Decode the current region from the specified coding system.
9326 When called from a program, takes four arguments:
9327         START, END, CODING-SYSTEM, and DESTINATION.
9328 START and END are buffer positions.
9329
9330 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9331 If nil, the region between START and END is replaced by the decoded text.
9332 If buffer, the decoded text is inserted in that buffer after point (point
9333 does not move).
9334 In those cases, the length of the decoded text is returned.
9335 If DESTINATION is t, the decoded text is returned.
9336
9337 This function sets `last-coding-system-used' to the precise coding system
9338 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9339 not fully specified.)  */)
9340   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9341 {
9342   return code_convert_region (start, end, coding_system, destination, 0, 0);
9343 }
9344
9345 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9346        3, 4, "r\nzCoding system: ",
9347        doc: /* Encode the current region by specified coding system.
9348 When called from a program, takes four arguments:
9349         START, END, CODING-SYSTEM and DESTINATION.
9350 START and END are buffer positions.
9351
9352 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9353 If nil, the region between START and END is replace by the encoded text.
9354 If buffer, the encoded text is inserted in that buffer after point (point
9355 does not move).
9356 In those cases, the length of the encoded text is returned.
9357 If DESTINATION is t, the encoded text is returned.
9358
9359 This function sets `last-coding-system-used' to the precise coding system
9360 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9361 not fully specified.)  */)
9362   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9363 {
9364   return code_convert_region (start, end, coding_system, destination, 1, 0);
9365 }
9366
9367 Lisp_Object
9368 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9369                      Lisp_Object dst_object, bool encodep, bool nocopy,
9370                      bool norecord)
9371 {
9372   struct coding_system coding;
9373   ptrdiff_t chars, bytes;
9374
9375   CHECK_STRING (string);
9376   if (NILP (coding_system))
9377     {
9378       if (! norecord)
9379         Vlast_coding_system_used = Qno_conversion;
9380       if (NILP (dst_object))
9381         return (nocopy ? Fcopy_sequence (string) : string);
9382     }
9383
9384   if (NILP (coding_system))
9385     coding_system = Qno_conversion;
9386   else
9387     CHECK_CODING_SYSTEM (coding_system);
9388   if (NILP (dst_object))
9389     dst_object = Qt;
9390   else if (! EQ (dst_object, Qt))
9391     CHECK_BUFFER (dst_object);
9392
9393   setup_coding_system (coding_system, &coding);
9394   coding.mode |= CODING_MODE_LAST_BLOCK;
9395   chars = SCHARS (string);
9396   bytes = SBYTES (string);
9397   if (encodep)
9398     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9399   else
9400     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9401   if (! norecord)
9402     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9403
9404   return (BUFFERP (dst_object)
9405           ? make_number (coding.produced_char)
9406           : coding.dst_object);
9407 }
9408
9409
9410 /* Encode or decode STRING according to CODING_SYSTEM.
9411    Do not set Vlast_coding_system_used.
9412
9413    This function is called only from macros DECODE_FILE and
9414    ENCODE_FILE, thus we ignore character composition.  */
9415
9416 Lisp_Object
9417 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9418                               bool encodep)
9419 {
9420   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9421 }
9422
9423
9424 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9425        2, 4, 0,
9426        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9427
9428 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9429 if the decoding operation is trivial.
9430
9431 Optional fourth arg BUFFER non-nil means that the decoded text is
9432 inserted in that buffer after point (point does not move).  In this
9433 case, the return value is the length of the decoded text.
9434
9435 This function sets `last-coding-system-used' to the precise coding system
9436 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9437 not fully specified.)  */)
9438   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9439 {
9440   return code_convert_string (string, coding_system, buffer,
9441                               0, ! NILP (nocopy), 0);
9442 }
9443
9444 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9445        2, 4, 0,
9446        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9447
9448 Optional third arg NOCOPY non-nil means it is OK to return STRING
9449 itself if the encoding operation is trivial.
9450
9451 Optional fourth arg BUFFER non-nil means that the encoded text is
9452 inserted in that buffer after point (point does not move).  In this
9453 case, the return value is the length of the encoded text.
9454
9455 This function sets `last-coding-system-used' to the precise coding system
9456 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9457 not fully specified.)  */)
9458   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9459 {
9460   return code_convert_string (string, coding_system, buffer,
9461                               1, ! NILP (nocopy), 0);
9462 }
9463
9464 \f
9465 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9466        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9467 Return the corresponding character.  */)
9468   (Lisp_Object code)
9469 {
9470   Lisp_Object spec, attrs, val;
9471   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9472   EMACS_INT ch;
9473   int c;
9474
9475   CHECK_NATNUM (code);
9476   ch = XFASTINT (code);
9477   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9478   attrs = AREF (spec, 0);
9479
9480   if (ASCII_BYTE_P (ch)
9481       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9482     return code;
9483
9484   val = CODING_ATTR_CHARSET_LIST (attrs);
9485   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9486   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9487   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9488
9489   if (ch <= 0x7F)
9490     {
9491       c = ch;
9492       charset = charset_roman;
9493     }
9494   else if (ch >= 0xA0 && ch < 0xDF)
9495     {
9496       c = ch - 0x80;
9497       charset = charset_kana;
9498     }
9499   else
9500     {
9501       EMACS_INT c1 = ch >> 8;
9502       int c2 = ch & 0xFF;
9503
9504       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9505           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9506         error ("Invalid code: %"pI"d", ch);
9507       c = ch;
9508       SJIS_TO_JIS (c);
9509       charset = charset_kanji;
9510     }
9511   c = DECODE_CHAR (charset, c);
9512   if (c < 0)
9513     error ("Invalid code: %"pI"d", ch);
9514   return make_number (c);
9515 }
9516
9517
9518 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9519        doc: /* Encode a Japanese character CH to shift_jis encoding.
9520 Return the corresponding code in SJIS.  */)
9521   (Lisp_Object ch)
9522 {
9523   Lisp_Object spec, attrs, charset_list;
9524   int c;
9525   struct charset *charset;
9526   unsigned code;
9527
9528   CHECK_CHARACTER (ch);
9529   c = XFASTINT (ch);
9530   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9531   attrs = AREF (spec, 0);
9532
9533   if (ASCII_CHAR_P (c)
9534       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9535     return ch;
9536
9537   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9538   charset = char_charset (c, charset_list, &code);
9539   if (code == CHARSET_INVALID_CODE (charset))
9540     error ("Can't encode by shift_jis encoding: %c", c);
9541   JIS_TO_SJIS (code);
9542
9543   return make_number (code);
9544 }
9545
9546 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9547        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9548 Return the corresponding character.  */)
9549   (Lisp_Object code)
9550 {
9551   Lisp_Object spec, attrs, val;
9552   struct charset *charset_roman, *charset_big5, *charset;
9553   EMACS_INT ch;
9554   int c;
9555
9556   CHECK_NATNUM (code);
9557   ch = XFASTINT (code);
9558   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9559   attrs = AREF (spec, 0);
9560
9561   if (ASCII_BYTE_P (ch)
9562       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9563     return code;
9564
9565   val = CODING_ATTR_CHARSET_LIST (attrs);
9566   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9567   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9568
9569   if (ch <= 0x7F)
9570     {
9571       c = ch;
9572       charset = charset_roman;
9573     }
9574   else
9575     {
9576       EMACS_INT b1 = ch >> 8;
9577       int b2 = ch & 0x7F;
9578       if (b1 < 0xA1 || b1 > 0xFE
9579           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9580         error ("Invalid code: %"pI"d", ch);
9581       c = ch;
9582       charset = charset_big5;
9583     }
9584   c = DECODE_CHAR (charset, c);
9585   if (c < 0)
9586     error ("Invalid code: %"pI"d", ch);
9587   return make_number (c);
9588 }
9589
9590 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9591        doc: /* Encode the Big5 character CH to BIG5 coding system.
9592 Return the corresponding character code in Big5.  */)
9593   (Lisp_Object ch)
9594 {
9595   Lisp_Object spec, attrs, charset_list;
9596   struct charset *charset;
9597   int c;
9598   unsigned code;
9599
9600   CHECK_CHARACTER (ch);
9601   c = XFASTINT (ch);
9602   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9603   attrs = AREF (spec, 0);
9604   if (ASCII_CHAR_P (c)
9605       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9606     return ch;
9607
9608   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9609   charset = char_charset (c, charset_list, &code);
9610   if (code == CHARSET_INVALID_CODE (charset))
9611     error ("Can't encode by Big5 encoding: %c", c);
9612
9613   return make_number (code);
9614 }
9615
9616 \f
9617 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9618        Sset_terminal_coding_system_internal, 1, 2, 0,
9619        doc: /* Internal use only.  */)
9620   (Lisp_Object coding_system, Lisp_Object terminal)
9621 {
9622   struct terminal *term = get_terminal (terminal, 1);
9623   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9624   CHECK_SYMBOL (coding_system);
9625   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9626   /* We had better not send unsafe characters to terminal.  */
9627   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9628   /* Character composition should be disabled.  */
9629   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9630   terminal_coding->src_multibyte = 1;
9631   terminal_coding->dst_multibyte = 0;
9632   tset_charset_list
9633     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9634             ? coding_charset_list (terminal_coding)
9635             : list1 (make_number (charset_ascii))));
9636   return Qnil;
9637 }
9638
9639 DEFUN ("set-safe-terminal-coding-system-internal",
9640        Fset_safe_terminal_coding_system_internal,
9641        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9642        doc: /* Internal use only.  */)
9643   (Lisp_Object coding_system)
9644 {
9645   CHECK_SYMBOL (coding_system);
9646   setup_coding_system (Fcheck_coding_system (coding_system),
9647                        &safe_terminal_coding);
9648   /* Character composition should be disabled.  */
9649   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9650   safe_terminal_coding.src_multibyte = 1;
9651   safe_terminal_coding.dst_multibyte = 0;
9652   return Qnil;
9653 }
9654
9655 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9656        Sterminal_coding_system, 0, 1, 0,
9657        doc: /* Return coding system specified for terminal output on the given terminal.
9658 TERMINAL may be a terminal object, a frame, or nil for the selected
9659 frame's terminal device.  */)
9660   (Lisp_Object terminal)
9661 {
9662   struct coding_system *terminal_coding
9663     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9664   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9665
9666   /* For backward compatibility, return nil if it is `undecided'.  */
9667   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9668 }
9669
9670 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9671        Sset_keyboard_coding_system_internal, 1, 2, 0,
9672        doc: /* Internal use only.  */)
9673   (Lisp_Object coding_system, Lisp_Object terminal)
9674 {
9675   struct terminal *t = get_terminal (terminal, 1);
9676   CHECK_SYMBOL (coding_system);
9677   if (NILP (coding_system))
9678     coding_system = Qno_conversion;
9679   else
9680     Fcheck_coding_system (coding_system);
9681   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9682   /* Character composition should be disabled.  */
9683   TERMINAL_KEYBOARD_CODING (t)->common_flags
9684     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9685   return Qnil;
9686 }
9687
9688 DEFUN ("keyboard-coding-system",
9689        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9690        doc: /* Return coding system specified for decoding keyboard input.  */)
9691   (Lisp_Object terminal)
9692 {
9693   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9694                          (get_terminal (terminal, 1))->id);
9695 }
9696
9697 \f
9698 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9699        Sfind_operation_coding_system,  1, MANY, 0,
9700        doc: /* Choose a coding system for an operation based on the target name.
9701 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9702 DECODING-SYSTEM is the coding system to use for decoding
9703 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9704 for encoding (in case OPERATION does encoding).
9705
9706 The first argument OPERATION specifies an I/O primitive:
9707   For file I/O, `insert-file-contents' or `write-region'.
9708   For process I/O, `call-process', `call-process-region', or `start-process'.
9709   For network I/O, `open-network-stream'.
9710
9711 The remaining arguments should be the same arguments that were passed
9712 to the primitive.  Depending on which primitive, one of those arguments
9713 is selected as the TARGET.  For example, if OPERATION does file I/O,
9714 whichever argument specifies the file name is TARGET.
9715
9716 TARGET has a meaning which depends on OPERATION:
9717   For file I/O, TARGET is a file name (except for the special case below).
9718   For process I/O, TARGET is a process name.
9719   For network I/O, TARGET is a service name or a port number.
9720
9721 This function looks up what is specified for TARGET in
9722 `file-coding-system-alist', `process-coding-system-alist',
9723 or `network-coding-system-alist' depending on OPERATION.
9724 They may specify a coding system, a cons of coding systems,
9725 or a function symbol to call.
9726 In the last case, we call the function with one argument,
9727 which is a list of all the arguments given to this function.
9728 If the function can't decide a coding system, it can return
9729 `undecided' so that the normal code-detection is performed.
9730
9731 If OPERATION is `insert-file-contents', the argument corresponding to
9732 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9733 file name to look up, and BUFFER is a buffer that contains the file's
9734 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9735 function to call for FILENAME, that function should examine the
9736 contents of BUFFER instead of reading the file.
9737
9738 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9739   (ptrdiff_t nargs, Lisp_Object *args)
9740 {
9741   Lisp_Object operation, target_idx, target, val;
9742   register Lisp_Object chain;
9743
9744   if (nargs < 2)
9745     error ("Too few arguments");
9746   operation = args[0];
9747   if (!SYMBOLP (operation)
9748       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9749     error ("Invalid first argument");
9750   if (nargs <= 1 + XFASTINT (target_idx))
9751     error ("Too few arguments for operation `%s'",
9752            SDATA (SYMBOL_NAME (operation)));
9753   target = args[XFASTINT (target_idx) + 1];
9754   if (!(STRINGP (target)
9755         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9756             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9757         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9758     error ("Invalid argument %"pI"d of operation `%s'",
9759            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9760   if (CONSP (target))
9761     target = XCAR (target);
9762
9763   chain = ((EQ (operation, Qinsert_file_contents)
9764             || EQ (operation, Qwrite_region))
9765            ? Vfile_coding_system_alist
9766            : (EQ (operation, Qopen_network_stream)
9767               ? Vnetwork_coding_system_alist
9768               : Vprocess_coding_system_alist));
9769   if (NILP (chain))
9770     return Qnil;
9771
9772   for (; CONSP (chain); chain = XCDR (chain))
9773     {
9774       Lisp_Object elt;
9775
9776       elt = XCAR (chain);
9777       if (CONSP (elt)
9778           && ((STRINGP (target)
9779                && STRINGP (XCAR (elt))
9780                && fast_string_match (XCAR (elt), target) >= 0)
9781               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9782         {
9783           val = XCDR (elt);
9784           /* Here, if VAL is both a valid coding system and a valid
9785              function symbol, we return VAL as a coding system.  */
9786           if (CONSP (val))
9787             return val;
9788           if (! SYMBOLP (val))
9789             return Qnil;
9790           if (! NILP (Fcoding_system_p (val)))
9791             return Fcons (val, val);
9792           if (! NILP (Ffboundp (val)))
9793             {
9794               /* We use call1 rather than safe_call1
9795                  so as to get bug reports about functions called here
9796                  which don't handle the current interface.  */
9797               val = call1 (val, Flist (nargs, args));
9798               if (CONSP (val))
9799                 return val;
9800               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9801                 return Fcons (val, val);
9802             }
9803           return Qnil;
9804         }
9805     }
9806   return Qnil;
9807 }
9808
9809 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9810        Sset_coding_system_priority, 0, MANY, 0,
9811        doc: /* Assign higher priority to the coding systems given as arguments.
9812 If multiple coding systems belong to the same category,
9813 all but the first one are ignored.
9814
9815 usage: (set-coding-system-priority &rest coding-systems)  */)
9816   (ptrdiff_t nargs, Lisp_Object *args)
9817 {
9818   ptrdiff_t i, j;
9819   bool changed[coding_category_max];
9820   enum coding_category priorities[coding_category_max];
9821
9822   memset (changed, 0, sizeof changed);
9823
9824   for (i = j = 0; i < nargs; i++)
9825     {
9826       enum coding_category category;
9827       Lisp_Object spec, attrs;
9828
9829       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9830       attrs = AREF (spec, 0);
9831       category = XINT (CODING_ATTR_CATEGORY (attrs));
9832       if (changed[category])
9833         /* Ignore this coding system because a coding system of the
9834            same category already had a higher priority.  */
9835         continue;
9836       changed[category] = 1;
9837       priorities[j++] = category;
9838       if (coding_categories[category].id >= 0
9839           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9840         setup_coding_system (args[i], &coding_categories[category]);
9841       Fset (AREF (Vcoding_category_table, category), args[i]);
9842     }
9843
9844   /* Now we have decided top J priorities.  Reflect the order of the
9845      original priorities to the remaining priorities.  */
9846
9847   for (i = j, j = 0; i < coding_category_max; i++, j++)
9848     {
9849       while (j < coding_category_max
9850              && changed[coding_priorities[j]])
9851         j++;
9852       if (j == coding_category_max)
9853         emacs_abort ();
9854       priorities[i] = coding_priorities[j];
9855     }
9856
9857   memcpy (coding_priorities, priorities, sizeof priorities);
9858
9859   /* Update `coding-category-list'.  */
9860   Vcoding_category_list = Qnil;
9861   for (i = coding_category_max; i-- > 0; )
9862     Vcoding_category_list
9863       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9864                Vcoding_category_list);
9865
9866   return Qnil;
9867 }
9868
9869 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9870        Scoding_system_priority_list, 0, 1, 0,
9871        doc: /* Return a list of coding systems ordered by their priorities.
9872 The list contains a subset of coding systems; i.e. coding systems
9873 assigned to each coding category (see `coding-category-list').
9874
9875 HIGHESTP non-nil means just return the highest priority one.  */)
9876   (Lisp_Object highestp)
9877 {
9878   int i;
9879   Lisp_Object val;
9880
9881   for (i = 0, val = Qnil; i < coding_category_max; i++)
9882     {
9883       enum coding_category category = coding_priorities[i];
9884       int id = coding_categories[category].id;
9885       Lisp_Object attrs;
9886
9887       if (id < 0)
9888         continue;
9889       attrs = CODING_ID_ATTRS (id);
9890       if (! NILP (highestp))
9891         return CODING_ATTR_BASE_NAME (attrs);
9892       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9893     }
9894   return Fnreverse (val);
9895 }
9896
9897 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9898
9899 static Lisp_Object
9900 make_subsidiaries (Lisp_Object base)
9901 {
9902   Lisp_Object subsidiaries;
9903   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9904   char *buf = alloca (base_name_len + 6);
9905   int i;
9906
9907   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9908   subsidiaries = make_uninit_vector (3);
9909   for (i = 0; i < 3; i++)
9910     {
9911       strcpy (buf + base_name_len, suffixes[i]);
9912       ASET (subsidiaries, i, intern (buf));
9913     }
9914   return subsidiaries;
9915 }
9916
9917
9918 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9919        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9920        doc: /* For internal use only.
9921 usage: (define-coding-system-internal ...)  */)
9922   (ptrdiff_t nargs, Lisp_Object *args)
9923 {
9924   Lisp_Object name;
9925   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9926   Lisp_Object attrs;            /* Vector of attributes.  */
9927   Lisp_Object eol_type;
9928   Lisp_Object aliases;
9929   Lisp_Object coding_type, charset_list, safe_charsets;
9930   enum coding_category category;
9931   Lisp_Object tail, val;
9932   int max_charset_id = 0;
9933   int i;
9934
9935   if (nargs < coding_arg_max)
9936     goto short_args;
9937
9938   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9939
9940   name = args[coding_arg_name];
9941   CHECK_SYMBOL (name);
9942   ASET (attrs, coding_attr_base_name, name);
9943
9944   val = args[coding_arg_mnemonic];
9945   if (! STRINGP (val))
9946     CHECK_CHARACTER (val);
9947   ASET (attrs, coding_attr_mnemonic, val);
9948
9949   coding_type = args[coding_arg_coding_type];
9950   CHECK_SYMBOL (coding_type);
9951   ASET (attrs, coding_attr_type, coding_type);
9952
9953   charset_list = args[coding_arg_charset_list];
9954   if (SYMBOLP (charset_list))
9955     {
9956       if (EQ (charset_list, Qiso_2022))
9957         {
9958           if (! EQ (coding_type, Qiso_2022))
9959             error ("Invalid charset-list");
9960           charset_list = Viso_2022_charset_list;
9961         }
9962       else if (EQ (charset_list, Qemacs_mule))
9963         {
9964           if (! EQ (coding_type, Qemacs_mule))
9965             error ("Invalid charset-list");
9966           charset_list = Vemacs_mule_charset_list;
9967         }
9968       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9969         {
9970           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9971             error ("Invalid charset-list");
9972           if (max_charset_id < XFASTINT (XCAR (tail)))
9973             max_charset_id = XFASTINT (XCAR (tail));
9974         }
9975     }
9976   else
9977     {
9978       charset_list = Fcopy_sequence (charset_list);
9979       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9980         {
9981           struct charset *charset;
9982
9983           val = XCAR (tail);
9984           CHECK_CHARSET_GET_CHARSET (val, charset);
9985           if (EQ (coding_type, Qiso_2022)
9986               ? CHARSET_ISO_FINAL (charset) < 0
9987               : EQ (coding_type, Qemacs_mule)
9988               ? CHARSET_EMACS_MULE_ID (charset) < 0
9989               : 0)
9990             error ("Can't handle charset `%s'",
9991                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9992
9993           XSETCAR (tail, make_number (charset->id));
9994           if (max_charset_id < charset->id)
9995             max_charset_id = charset->id;
9996         }
9997     }
9998   ASET (attrs, coding_attr_charset_list, charset_list);
9999
10000   safe_charsets = make_uninit_string (max_charset_id + 1);
10001   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10002   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10003     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10004   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10005
10006   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10007
10008   val = args[coding_arg_decode_translation_table];
10009   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10010     CHECK_SYMBOL (val);
10011   ASET (attrs, coding_attr_decode_tbl, val);
10012
10013   val = args[coding_arg_encode_translation_table];
10014   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10015     CHECK_SYMBOL (val);
10016   ASET (attrs, coding_attr_encode_tbl, val);
10017
10018   val = args[coding_arg_post_read_conversion];
10019   CHECK_SYMBOL (val);
10020   ASET (attrs, coding_attr_post_read, val);
10021
10022   val = args[coding_arg_pre_write_conversion];
10023   CHECK_SYMBOL (val);
10024   ASET (attrs, coding_attr_pre_write, val);
10025
10026   val = args[coding_arg_default_char];
10027   if (NILP (val))
10028     ASET (attrs, coding_attr_default_char, make_number (' '));
10029   else
10030     {
10031       CHECK_CHARACTER (val);
10032       ASET (attrs, coding_attr_default_char, val);
10033     }
10034
10035   val = args[coding_arg_for_unibyte];
10036   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10037
10038   val = args[coding_arg_plist];
10039   CHECK_LIST (val);
10040   ASET (attrs, coding_attr_plist, val);
10041
10042   if (EQ (coding_type, Qcharset))
10043     {
10044       /* Generate a lisp vector of 256 elements.  Each element is nil,
10045          integer, or a list of charset IDs.
10046
10047          If Nth element is nil, the byte code N is invalid in this
10048          coding system.
10049
10050          If Nth element is a number NUM, N is the first byte of a
10051          charset whose ID is NUM.
10052
10053          If Nth element is a list of charset IDs, N is the first byte
10054          of one of them.  The list is sorted by dimensions of the
10055          charsets.  A charset of smaller dimension comes first. */
10056       val = Fmake_vector (make_number (256), Qnil);
10057
10058       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10059         {
10060           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10061           int dim = CHARSET_DIMENSION (charset);
10062           int idx = (dim - 1) * 4;
10063
10064           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10065             ASET (attrs, coding_attr_ascii_compat, Qt);
10066
10067           for (i = charset->code_space[idx];
10068                i <= charset->code_space[idx + 1]; i++)
10069             {
10070               Lisp_Object tmp, tmp2;
10071               int dim2;
10072
10073               tmp = AREF (val, i);
10074               if (NILP (tmp))
10075                 tmp = XCAR (tail);
10076               else if (NUMBERP (tmp))
10077                 {
10078                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10079                   if (dim < dim2)
10080                     tmp = list2 (XCAR (tail), tmp);
10081                   else
10082                     tmp = list2 (tmp, XCAR (tail));
10083                 }
10084               else
10085                 {
10086                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10087                     {
10088                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10089                       if (dim < dim2)
10090                         break;
10091                     }
10092                   if (NILP (tmp2))
10093                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10094                   else
10095                     {
10096                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10097                       XSETCAR (tmp2, XCAR (tail));
10098                     }
10099                 }
10100               ASET (val, i, tmp);
10101             }
10102         }
10103       ASET (attrs, coding_attr_charset_valids, val);
10104       category = coding_category_charset;
10105     }
10106   else if (EQ (coding_type, Qccl))
10107     {
10108       Lisp_Object valids;
10109
10110       if (nargs < coding_arg_ccl_max)
10111         goto short_args;
10112
10113       val = args[coding_arg_ccl_decoder];
10114       CHECK_CCL_PROGRAM (val);
10115       if (VECTORP (val))
10116         val = Fcopy_sequence (val);
10117       ASET (attrs, coding_attr_ccl_decoder, val);
10118
10119       val = args[coding_arg_ccl_encoder];
10120       CHECK_CCL_PROGRAM (val);
10121       if (VECTORP (val))
10122         val = Fcopy_sequence (val);
10123       ASET (attrs, coding_attr_ccl_encoder, val);
10124
10125       val = args[coding_arg_ccl_valids];
10126       valids = Fmake_string (make_number (256), make_number (0));
10127       for (tail = val; CONSP (tail); tail = XCDR (tail))
10128         {
10129           int from, to;
10130
10131           val = XCAR (tail);
10132           if (INTEGERP (val))
10133             {
10134               if (! (0 <= XINT (val) && XINT (val) <= 255))
10135                 args_out_of_range_3 (val, make_number (0), make_number (255));
10136               from = to = XINT (val);
10137             }
10138           else
10139             {
10140               CHECK_CONS (val);
10141               CHECK_NATNUM_CAR (val);
10142               CHECK_NUMBER_CDR (val);
10143               if (XINT (XCAR (val)) > 255)
10144                 args_out_of_range_3 (XCAR (val),
10145                                      make_number (0), make_number (255));
10146               from = XINT (XCAR (val));
10147               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10148                 args_out_of_range_3 (XCDR (val),
10149                                      XCAR (val), make_number (255));
10150               to = XINT (XCDR (val));
10151             }
10152           for (i = from; i <= to; i++)
10153             SSET (valids, i, 1);
10154         }
10155       ASET (attrs, coding_attr_ccl_valids, valids);
10156
10157       category = coding_category_ccl;
10158     }
10159   else if (EQ (coding_type, Qutf_16))
10160     {
10161       Lisp_Object bom, endian;
10162
10163       ASET (attrs, coding_attr_ascii_compat, Qnil);
10164
10165       if (nargs < coding_arg_utf16_max)
10166         goto short_args;
10167
10168       bom = args[coding_arg_utf16_bom];
10169       if (! NILP (bom) && ! EQ (bom, Qt))
10170         {
10171           CHECK_CONS (bom);
10172           val = XCAR (bom);
10173           CHECK_CODING_SYSTEM (val);
10174           val = XCDR (bom);
10175           CHECK_CODING_SYSTEM (val);
10176         }
10177       ASET (attrs, coding_attr_utf_bom, bom);
10178
10179       endian = args[coding_arg_utf16_endian];
10180       CHECK_SYMBOL (endian);
10181       if (NILP (endian))
10182         endian = Qbig;
10183       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10184         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10185       ASET (attrs, coding_attr_utf_16_endian, endian);
10186
10187       category = (CONSP (bom)
10188                   ? coding_category_utf_16_auto
10189                   : NILP (bom)
10190                   ? (EQ (endian, Qbig)
10191                      ? coding_category_utf_16_be_nosig
10192                      : coding_category_utf_16_le_nosig)
10193                   : (EQ (endian, Qbig)
10194                      ? coding_category_utf_16_be
10195                      : coding_category_utf_16_le));
10196     }
10197   else if (EQ (coding_type, Qiso_2022))
10198     {
10199       Lisp_Object initial, reg_usage, request, flags;
10200
10201       if (nargs < coding_arg_iso2022_max)
10202         goto short_args;
10203
10204       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10205       CHECK_VECTOR (initial);
10206       for (i = 0; i < 4; i++)
10207         {
10208           val = AREF (initial, i);
10209           if (! NILP (val))
10210             {
10211               struct charset *charset;
10212
10213               CHECK_CHARSET_GET_CHARSET (val, charset);
10214               ASET (initial, i, make_number (CHARSET_ID (charset)));
10215               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10216                 ASET (attrs, coding_attr_ascii_compat, Qt);
10217             }
10218           else
10219             ASET (initial, i, make_number (-1));
10220         }
10221
10222       reg_usage = args[coding_arg_iso2022_reg_usage];
10223       CHECK_CONS (reg_usage);
10224       CHECK_NUMBER_CAR (reg_usage);
10225       CHECK_NUMBER_CDR (reg_usage);
10226
10227       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10228       for (tail = request; CONSP (tail); tail = XCDR (tail))
10229         {
10230           int id;
10231           Lisp_Object tmp1;
10232
10233           val = XCAR (tail);
10234           CHECK_CONS (val);
10235           tmp1 = XCAR (val);
10236           CHECK_CHARSET_GET_ID (tmp1, id);
10237           CHECK_NATNUM_CDR (val);
10238           if (XINT (XCDR (val)) >= 4)
10239             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10240           XSETCAR (val, make_number (id));
10241         }
10242
10243       flags = args[coding_arg_iso2022_flags];
10244       CHECK_NATNUM (flags);
10245       i = XINT (flags) & INT_MAX;
10246       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10247         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10248       flags = make_number (i);
10249
10250       ASET (attrs, coding_attr_iso_initial, initial);
10251       ASET (attrs, coding_attr_iso_usage, reg_usage);
10252       ASET (attrs, coding_attr_iso_request, request);
10253       ASET (attrs, coding_attr_iso_flags, flags);
10254       setup_iso_safe_charsets (attrs);
10255
10256       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10257         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10258                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10259                     ? coding_category_iso_7_else
10260                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10261                     ? coding_category_iso_7
10262                     : coding_category_iso_7_tight);
10263       else
10264         {
10265           int id = XINT (AREF (initial, 1));
10266
10267           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10268                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10269                        || id < 0)
10270                       ? coding_category_iso_8_else
10271                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10272                       ? coding_category_iso_8_1
10273                       : coding_category_iso_8_2);
10274         }
10275       if (category != coding_category_iso_8_1
10276           && category != coding_category_iso_8_2)
10277         ASET (attrs, coding_attr_ascii_compat, Qnil);
10278     }
10279   else if (EQ (coding_type, Qemacs_mule))
10280     {
10281       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10282         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10283       ASET (attrs, coding_attr_ascii_compat, Qt);
10284       category = coding_category_emacs_mule;
10285     }
10286   else if (EQ (coding_type, Qshift_jis))
10287     {
10288
10289       struct charset *charset;
10290
10291       if (XINT (Flength (charset_list)) != 3
10292           && XINT (Flength (charset_list)) != 4)
10293         error ("There should be three or four charsets");
10294
10295       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10296       if (CHARSET_DIMENSION (charset) != 1)
10297         error ("Dimension of charset %s is not one",
10298                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10299       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10300         ASET (attrs, coding_attr_ascii_compat, Qt);
10301
10302       charset_list = XCDR (charset_list);
10303       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10304       if (CHARSET_DIMENSION (charset) != 1)
10305         error ("Dimension of charset %s is not one",
10306                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10307
10308       charset_list = XCDR (charset_list);
10309       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10310       if (CHARSET_DIMENSION (charset) != 2)
10311         error ("Dimension of charset %s is not two",
10312                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10313
10314       charset_list = XCDR (charset_list);
10315       if (! NILP (charset_list))
10316         {
10317           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10318           if (CHARSET_DIMENSION (charset) != 2)
10319             error ("Dimension of charset %s is not two",
10320                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10321         }
10322
10323       category = coding_category_sjis;
10324       Vsjis_coding_system = name;
10325     }
10326   else if (EQ (coding_type, Qbig5))
10327     {
10328       struct charset *charset;
10329
10330       if (XINT (Flength (charset_list)) != 2)
10331         error ("There should be just two charsets");
10332
10333       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10334       if (CHARSET_DIMENSION (charset) != 1)
10335         error ("Dimension of charset %s is not one",
10336                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10337       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10338         ASET (attrs, coding_attr_ascii_compat, Qt);
10339
10340       charset_list = XCDR (charset_list);
10341       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10342       if (CHARSET_DIMENSION (charset) != 2)
10343         error ("Dimension of charset %s is not two",
10344                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10345
10346       category = coding_category_big5;
10347       Vbig5_coding_system = name;
10348     }
10349   else if (EQ (coding_type, Qraw_text))
10350     {
10351       category = coding_category_raw_text;
10352       ASET (attrs, coding_attr_ascii_compat, Qt);
10353     }
10354   else if (EQ (coding_type, Qutf_8))
10355     {
10356       Lisp_Object bom;
10357
10358       if (nargs < coding_arg_utf8_max)
10359         goto short_args;
10360
10361       bom = args[coding_arg_utf8_bom];
10362       if (! NILP (bom) && ! EQ (bom, Qt))
10363         {
10364           CHECK_CONS (bom);
10365           val = XCAR (bom);
10366           CHECK_CODING_SYSTEM (val);
10367           val = XCDR (bom);
10368           CHECK_CODING_SYSTEM (val);
10369         }
10370       ASET (attrs, coding_attr_utf_bom, bom);
10371       if (NILP (bom))
10372         ASET (attrs, coding_attr_ascii_compat, Qt);
10373
10374       category = (CONSP (bom) ? coding_category_utf_8_auto
10375                   : NILP (bom) ? coding_category_utf_8_nosig
10376                   : coding_category_utf_8_sig);
10377     }
10378   else if (EQ (coding_type, Qundecided))
10379     {
10380       if (nargs < coding_arg_undecided_max)
10381         goto short_args;
10382       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10383             args[coding_arg_undecided_inhibit_null_byte_detection]);
10384       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10385             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10386       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10387             args[coding_arg_undecided_prefer_utf_8]);
10388       category = coding_category_undecided;
10389     }
10390   else
10391     error ("Invalid coding system type: %s",
10392            SDATA (SYMBOL_NAME (coding_type)));
10393
10394   ASET (attrs, coding_attr_category, make_number (category));
10395   ASET (attrs, coding_attr_plist,
10396         Fcons (QCcategory,
10397                Fcons (AREF (Vcoding_category_table, category),
10398                       CODING_ATTR_PLIST (attrs))));
10399   ASET (attrs, coding_attr_plist,
10400         Fcons (QCascii_compatible_p,
10401                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10402                       CODING_ATTR_PLIST (attrs))));
10403
10404   eol_type = args[coding_arg_eol_type];
10405   if (! NILP (eol_type)
10406       && ! EQ (eol_type, Qunix)
10407       && ! EQ (eol_type, Qdos)
10408       && ! EQ (eol_type, Qmac))
10409     error ("Invalid eol-type");
10410
10411   aliases = list1 (name);
10412
10413   if (NILP (eol_type))
10414     {
10415       eol_type = make_subsidiaries (name);
10416       for (i = 0; i < 3; i++)
10417         {
10418           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10419
10420           this_name = AREF (eol_type, i);
10421           this_aliases = list1 (this_name);
10422           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10423           this_spec = make_uninit_vector (3);
10424           ASET (this_spec, 0, attrs);
10425           ASET (this_spec, 1, this_aliases);
10426           ASET (this_spec, 2, this_eol_type);
10427           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10428           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10429           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10430           if (NILP (val))
10431             Vcoding_system_alist
10432               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10433                        Vcoding_system_alist);
10434         }
10435     }
10436
10437   spec_vec = make_uninit_vector (3);
10438   ASET (spec_vec, 0, attrs);
10439   ASET (spec_vec, 1, aliases);
10440   ASET (spec_vec, 2, eol_type);
10441
10442   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10443   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10444   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10445   if (NILP (val))
10446     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10447                                   Vcoding_system_alist);
10448
10449   {
10450     int id = coding_categories[category].id;
10451
10452     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10453       setup_coding_system (name, &coding_categories[category]);
10454   }
10455
10456   return Qnil;
10457
10458  short_args:
10459   return Fsignal (Qwrong_number_of_arguments,
10460                   Fcons (intern ("define-coding-system-internal"),
10461                          make_number (nargs)));
10462 }
10463
10464
10465 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10466        3, 3, 0,
10467        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10468   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10469 {
10470   Lisp_Object spec, attrs;
10471
10472   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10473   attrs = AREF (spec, 0);
10474   if (EQ (prop, QCmnemonic))
10475     {
10476       if (! STRINGP (val))
10477         CHECK_CHARACTER (val);
10478       ASET (attrs, coding_attr_mnemonic, val);
10479     }
10480   else if (EQ (prop, QCdefault_char))
10481     {
10482       if (NILP (val))
10483         val = make_number (' ');
10484       else
10485         CHECK_CHARACTER (val);
10486       ASET (attrs, coding_attr_default_char, val);
10487     }
10488   else if (EQ (prop, QCdecode_translation_table))
10489     {
10490       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10491         CHECK_SYMBOL (val);
10492       ASET (attrs, coding_attr_decode_tbl, val);
10493     }
10494   else if (EQ (prop, QCencode_translation_table))
10495     {
10496       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10497         CHECK_SYMBOL (val);
10498       ASET (attrs, coding_attr_encode_tbl, val);
10499     }
10500   else if (EQ (prop, QCpost_read_conversion))
10501     {
10502       CHECK_SYMBOL (val);
10503       ASET (attrs, coding_attr_post_read, val);
10504     }
10505   else if (EQ (prop, QCpre_write_conversion))
10506     {
10507       CHECK_SYMBOL (val);
10508       ASET (attrs, coding_attr_pre_write, val);
10509     }
10510   else if (EQ (prop, QCascii_compatible_p))
10511     {
10512       ASET (attrs, coding_attr_ascii_compat, val);
10513     }
10514
10515   ASET (attrs, coding_attr_plist,
10516         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10517   return val;
10518 }
10519
10520
10521 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10522        Sdefine_coding_system_alias, 2, 2, 0,
10523        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10524   (Lisp_Object alias, Lisp_Object coding_system)
10525 {
10526   Lisp_Object spec, aliases, eol_type, val;
10527
10528   CHECK_SYMBOL (alias);
10529   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10530   aliases = AREF (spec, 1);
10531   /* ALIASES should be a list of length more than zero, and the first
10532      element is a base coding system.  Append ALIAS at the tail of the
10533      list.  */
10534   while (!NILP (XCDR (aliases)))
10535     aliases = XCDR (aliases);
10536   XSETCDR (aliases, list1 (alias));
10537
10538   eol_type = AREF (spec, 2);
10539   if (VECTORP (eol_type))
10540     {
10541       Lisp_Object subsidiaries;
10542       int i;
10543
10544       subsidiaries = make_subsidiaries (alias);
10545       for (i = 0; i < 3; i++)
10546         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10547                                      AREF (eol_type, i));
10548     }
10549
10550   Fputhash (alias, spec, Vcoding_system_hash_table);
10551   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10552   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10553   if (NILP (val))
10554     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10555                                   Vcoding_system_alist);
10556
10557   return Qnil;
10558 }
10559
10560 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10561        1, 1, 0,
10562        doc: /* Return the base of CODING-SYSTEM.
10563 Any alias or subsidiary coding system is not a base coding system.  */)
10564   (Lisp_Object coding_system)
10565 {
10566   Lisp_Object spec, attrs;
10567
10568   if (NILP (coding_system))
10569     return (Qno_conversion);
10570   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10571   attrs = AREF (spec, 0);
10572   return CODING_ATTR_BASE_NAME (attrs);
10573 }
10574
10575 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10576        1, 1, 0,
10577        doc: "Return the property list of CODING-SYSTEM.")
10578   (Lisp_Object coding_system)
10579 {
10580   Lisp_Object spec, attrs;
10581
10582   if (NILP (coding_system))
10583     coding_system = Qno_conversion;
10584   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10585   attrs = AREF (spec, 0);
10586   return CODING_ATTR_PLIST (attrs);
10587 }
10588
10589
10590 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10591        1, 1, 0,
10592        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10593   (Lisp_Object coding_system)
10594 {
10595   Lisp_Object spec;
10596
10597   if (NILP (coding_system))
10598     coding_system = Qno_conversion;
10599   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10600   return AREF (spec, 1);
10601 }
10602
10603 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10604        Scoding_system_eol_type, 1, 1, 0,
10605        doc: /* Return eol-type of CODING-SYSTEM.
10606 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10607
10608 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10609 and CR respectively.
10610
10611 A vector value indicates that a format of end-of-line should be
10612 detected automatically.  Nth element of the vector is the subsidiary
10613 coding system whose eol-type is N.  */)
10614   (Lisp_Object coding_system)
10615 {
10616   Lisp_Object spec, eol_type;
10617   int n;
10618
10619   if (NILP (coding_system))
10620     coding_system = Qno_conversion;
10621   if (! CODING_SYSTEM_P (coding_system))
10622     return Qnil;
10623   spec = CODING_SYSTEM_SPEC (coding_system);
10624   eol_type = AREF (spec, 2);
10625   if (VECTORP (eol_type))
10626     return Fcopy_sequence (eol_type);
10627   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10628   return make_number (n);
10629 }
10630
10631 #endif /* emacs */
10632
10633 \f
10634 /*** 9. Post-amble ***/
10635
10636 void
10637 init_coding_once (void)
10638 {
10639   int i;
10640
10641   for (i = 0; i < coding_category_max; i++)
10642     {
10643       coding_categories[i].id = -1;
10644       coding_priorities[i] = i;
10645     }
10646
10647   /* ISO2022 specific initialize routine.  */
10648   for (i = 0; i < 0x20; i++)
10649     iso_code_class[i] = ISO_control_0;
10650   for (i = 0x21; i < 0x7F; i++)
10651     iso_code_class[i] = ISO_graphic_plane_0;
10652   for (i = 0x80; i < 0xA0; i++)
10653     iso_code_class[i] = ISO_control_1;
10654   for (i = 0xA1; i < 0xFF; i++)
10655     iso_code_class[i] = ISO_graphic_plane_1;
10656   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10657   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10658   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10659   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10660   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10661   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10662   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10663   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10664   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10665
10666   for (i = 0; i < 256; i++)
10667     {
10668       emacs_mule_bytes[i] = 1;
10669     }
10670   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10671   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10672   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10673   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10674 }
10675
10676 #ifdef emacs
10677
10678 void
10679 syms_of_coding (void)
10680 {
10681   staticpro (&Vcoding_system_hash_table);
10682   {
10683     Lisp_Object args[2];
10684     args[0] = QCtest;
10685     args[1] = Qeq;
10686     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10687   }
10688
10689   staticpro (&Vsjis_coding_system);
10690   Vsjis_coding_system = Qnil;
10691
10692   staticpro (&Vbig5_coding_system);
10693   Vbig5_coding_system = Qnil;
10694
10695   staticpro (&Vcode_conversion_reused_workbuf);
10696   Vcode_conversion_reused_workbuf = Qnil;
10697
10698   staticpro (&Vcode_conversion_workbuf_name);
10699   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10700
10701   reused_workbuf_in_use = 0;
10702
10703   DEFSYM (Qcharset, "charset");
10704   DEFSYM (Qtarget_idx, "target-idx");
10705   DEFSYM (Qcoding_system_history, "coding-system-history");
10706   Fset (Qcoding_system_history, Qnil);
10707
10708   /* Target FILENAME is the first argument.  */
10709   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10710   /* Target FILENAME is the third argument.  */
10711   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10712
10713   DEFSYM (Qcall_process, "call-process");
10714   /* Target PROGRAM is the first argument.  */
10715   Fput (Qcall_process, Qtarget_idx, make_number (0));
10716
10717   DEFSYM (Qcall_process_region, "call-process-region");
10718   /* Target PROGRAM is the third argument.  */
10719   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10720
10721   DEFSYM (Qstart_process, "start-process");
10722   /* Target PROGRAM is the third argument.  */
10723   Fput (Qstart_process, Qtarget_idx, make_number (2));
10724
10725   DEFSYM (Qopen_network_stream, "open-network-stream");
10726   /* Target SERVICE is the fourth argument.  */
10727   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10728
10729   DEFSYM (Qcoding_system, "coding-system");
10730   DEFSYM (Qcoding_aliases, "coding-aliases");
10731
10732   DEFSYM (Qeol_type, "eol-type");
10733   DEFSYM (Qunix, "unix");
10734   DEFSYM (Qdos, "dos");
10735   DEFSYM (Qmac, "mac");
10736
10737   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10738   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10739   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10740   DEFSYM (Qdefault_char, "default-char");
10741   DEFSYM (Qundecided, "undecided");
10742   DEFSYM (Qno_conversion, "no-conversion");
10743   DEFSYM (Qraw_text, "raw-text");
10744
10745   DEFSYM (Qiso_2022, "iso-2022");
10746
10747   DEFSYM (Qutf_8, "utf-8");
10748   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10749
10750 #if defined (WINDOWSNT) || defined (CYGWIN)
10751   /* No, not utf-16-le: that one has a BOM.  */
10752   DEFSYM (Qutf_16le, "utf-16le");
10753 #endif
10754
10755   DEFSYM (Qutf_16, "utf-16");
10756   DEFSYM (Qbig, "big");
10757   DEFSYM (Qlittle, "little");
10758
10759   DEFSYM (Qshift_jis, "shift-jis");
10760   DEFSYM (Qbig5, "big5");
10761
10762   DEFSYM (Qcoding_system_p, "coding-system-p");
10763
10764   DEFSYM (Qcoding_system_error, "coding-system-error");
10765   Fput (Qcoding_system_error, Qerror_conditions,
10766         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10767   Fput (Qcoding_system_error, Qerror_message,
10768         build_pure_c_string ("Invalid coding system"));
10769
10770   /* Intern this now in case it isn't already done.
10771      Setting this variable twice is harmless.
10772      But don't staticpro it here--that is done in alloc.c.  */
10773   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10774
10775   DEFSYM (Qtranslation_table, "translation-table");
10776   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10777   DEFSYM (Qtranslation_table_id, "translation-table-id");
10778   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10779   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10780
10781   DEFSYM (Qvalid_codes, "valid-codes");
10782
10783   DEFSYM (Qemacs_mule, "emacs-mule");
10784
10785   DEFSYM (QCcategory, ":category");
10786   DEFSYM (QCmnemonic, ":mnemonic");
10787   DEFSYM (QCdefault_char, ":default-char");
10788   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10789   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10790   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10791   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10792   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10793
10794   Vcoding_category_table
10795     = Fmake_vector (make_number (coding_category_max), Qnil);
10796   staticpro (&Vcoding_category_table);
10797   /* Followings are target of code detection.  */
10798   ASET (Vcoding_category_table, coding_category_iso_7,
10799         intern_c_string ("coding-category-iso-7"));
10800   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10801         intern_c_string ("coding-category-iso-7-tight"));
10802   ASET (Vcoding_category_table, coding_category_iso_8_1,
10803         intern_c_string ("coding-category-iso-8-1"));
10804   ASET (Vcoding_category_table, coding_category_iso_8_2,
10805         intern_c_string ("coding-category-iso-8-2"));
10806   ASET (Vcoding_category_table, coding_category_iso_7_else,
10807         intern_c_string ("coding-category-iso-7-else"));
10808   ASET (Vcoding_category_table, coding_category_iso_8_else,
10809         intern_c_string ("coding-category-iso-8-else"));
10810   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10811         intern_c_string ("coding-category-utf-8-auto"));
10812   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10813         intern_c_string ("coding-category-utf-8"));
10814   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10815         intern_c_string ("coding-category-utf-8-sig"));
10816   ASET (Vcoding_category_table, coding_category_utf_16_be,
10817         intern_c_string ("coding-category-utf-16-be"));
10818   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10819         intern_c_string ("coding-category-utf-16-auto"));
10820   ASET (Vcoding_category_table, coding_category_utf_16_le,
10821         intern_c_string ("coding-category-utf-16-le"));
10822   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10823         intern_c_string ("coding-category-utf-16-be-nosig"));
10824   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10825         intern_c_string ("coding-category-utf-16-le-nosig"));
10826   ASET (Vcoding_category_table, coding_category_charset,
10827         intern_c_string ("coding-category-charset"));
10828   ASET (Vcoding_category_table, coding_category_sjis,
10829         intern_c_string ("coding-category-sjis"));
10830   ASET (Vcoding_category_table, coding_category_big5,
10831         intern_c_string ("coding-category-big5"));
10832   ASET (Vcoding_category_table, coding_category_ccl,
10833         intern_c_string ("coding-category-ccl"));
10834   ASET (Vcoding_category_table, coding_category_emacs_mule,
10835         intern_c_string ("coding-category-emacs-mule"));
10836   /* Followings are NOT target of code detection.  */
10837   ASET (Vcoding_category_table, coding_category_raw_text,
10838         intern_c_string ("coding-category-raw-text"));
10839   ASET (Vcoding_category_table, coding_category_undecided,
10840         intern_c_string ("coding-category-undecided"));
10841
10842   DEFSYM (Qinsufficient_source, "insufficient-source");
10843   DEFSYM (Qinvalid_source, "invalid-source");
10844   DEFSYM (Qinterrupted, "interrupted");
10845   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10846
10847   defsubr (&Scoding_system_p);
10848   defsubr (&Sread_coding_system);
10849   defsubr (&Sread_non_nil_coding_system);
10850   defsubr (&Scheck_coding_system);
10851   defsubr (&Sdetect_coding_region);
10852   defsubr (&Sdetect_coding_string);
10853   defsubr (&Sfind_coding_systems_region_internal);
10854   defsubr (&Sunencodable_char_position);
10855   defsubr (&Scheck_coding_systems_region);
10856   defsubr (&Sdecode_coding_region);
10857   defsubr (&Sencode_coding_region);
10858   defsubr (&Sdecode_coding_string);
10859   defsubr (&Sencode_coding_string);
10860   defsubr (&Sdecode_sjis_char);
10861   defsubr (&Sencode_sjis_char);
10862   defsubr (&Sdecode_big5_char);
10863   defsubr (&Sencode_big5_char);
10864   defsubr (&Sset_terminal_coding_system_internal);
10865   defsubr (&Sset_safe_terminal_coding_system_internal);
10866   defsubr (&Sterminal_coding_system);
10867   defsubr (&Sset_keyboard_coding_system_internal);
10868   defsubr (&Skeyboard_coding_system);
10869   defsubr (&Sfind_operation_coding_system);
10870   defsubr (&Sset_coding_system_priority);
10871   defsubr (&Sdefine_coding_system_internal);
10872   defsubr (&Sdefine_coding_system_alias);
10873   defsubr (&Scoding_system_put);
10874   defsubr (&Scoding_system_base);
10875   defsubr (&Scoding_system_plist);
10876   defsubr (&Scoding_system_aliases);
10877   defsubr (&Scoding_system_eol_type);
10878   defsubr (&Scoding_system_priority_list);
10879
10880   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10881                doc: /* List of coding systems.
10882
10883 Do not alter the value of this variable manually.  This variable should be
10884 updated by the functions `define-coding-system' and
10885 `define-coding-system-alias'.  */);
10886   Vcoding_system_list = Qnil;
10887
10888   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10889                doc: /* Alist of coding system names.
10890 Each element is one element list of coding system name.
10891 This variable is given to `completing-read' as COLLECTION argument.
10892
10893 Do not alter the value of this variable manually.  This variable should be
10894 updated by the functions `make-coding-system' and
10895 `define-coding-system-alias'.  */);
10896   Vcoding_system_alist = Qnil;
10897
10898   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10899                doc: /* List of coding-categories (symbols) ordered by priority.
10900
10901 On detecting a coding system, Emacs tries code detection algorithms
10902 associated with each coding-category one by one in this order.  When
10903 one algorithm agrees with a byte sequence of source text, the coding
10904 system bound to the corresponding coding-category is selected.
10905
10906 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10907   {
10908     int i;
10909
10910     Vcoding_category_list = Qnil;
10911     for (i = coding_category_max - 1; i >= 0; i--)
10912       Vcoding_category_list
10913         = Fcons (AREF (Vcoding_category_table, i),
10914                  Vcoding_category_list);
10915   }
10916
10917   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10918                doc: /* Specify the coding system for read operations.
10919 It is useful to bind this variable with `let', but do not set it globally.
10920 If the value is a coding system, it is used for decoding on read operation.
10921 If not, an appropriate element is used from one of the coding system alists.
10922 There are three such tables: `file-coding-system-alist',
10923 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10924   Vcoding_system_for_read = Qnil;
10925
10926   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10927                doc: /* Specify the coding system for write operations.
10928 Programs bind this variable with `let', but you should not set it globally.
10929 If the value is a coding system, it is used for encoding of output,
10930 when writing it to a file and when sending it to a file or subprocess.
10931
10932 If this does not specify a coding system, an appropriate element
10933 is used from one of the coding system alists.
10934 There are three such tables: `file-coding-system-alist',
10935 `process-coding-system-alist', and `network-coding-system-alist'.
10936 For output to files, if the above procedure does not specify a coding system,
10937 the value of `buffer-file-coding-system' is used.  */);
10938   Vcoding_system_for_write = Qnil;
10939
10940   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10941                doc: /*
10942 Coding system used in the latest file or process I/O.  */);
10943   Vlast_coding_system_used = Qnil;
10944
10945   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10946                doc: /*
10947 Error status of the last code conversion.
10948
10949 When an error was detected in the last code conversion, this variable
10950 is set to one of the following symbols.
10951   `insufficient-source'
10952   `inconsistent-eol'
10953   `invalid-source'
10954   `interrupted'
10955   `insufficient-memory'
10956 When no error was detected, the value doesn't change.  So, to check
10957 the error status of a code conversion by this variable, you must
10958 explicitly set this variable to nil before performing code
10959 conversion.  */);
10960   Vlast_code_conversion_error = Qnil;
10961
10962   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10963                doc: /*
10964 *Non-nil means always inhibit code conversion of end-of-line format.
10965 See info node `Coding Systems' and info node `Text and Binary' concerning
10966 such conversion.  */);
10967   inhibit_eol_conversion = 0;
10968
10969   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10970                doc: /*
10971 Non-nil means process buffer inherits coding system of process output.
10972 Bind it to t if the process output is to be treated as if it were a file
10973 read from some filesystem.  */);
10974   inherit_process_coding_system = 0;
10975
10976   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10977                doc: /*
10978 Alist to decide a coding system to use for a file I/O operation.
10979 The format is ((PATTERN . VAL) ...),
10980 where PATTERN is a regular expression matching a file name,
10981 VAL is a coding system, a cons of coding systems, or a function symbol.
10982 If VAL is a coding system, it is used for both decoding and encoding
10983 the file contents.
10984 If VAL is a cons of coding systems, the car part is used for decoding,
10985 and the cdr part is used for encoding.
10986 If VAL is a function symbol, the function must return a coding system
10987 or a cons of coding systems which are used as above.  The function is
10988 called with an argument that is a list of the arguments with which
10989 `find-operation-coding-system' was called.  If the function can't decide
10990 a coding system, it can return `undecided' so that the normal
10991 code-detection is performed.
10992
10993 See also the function `find-operation-coding-system'
10994 and the variable `auto-coding-alist'.  */);
10995   Vfile_coding_system_alist = Qnil;
10996
10997   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10998                doc: /*
10999 Alist to decide a coding system to use for a process I/O operation.
11000 The format is ((PATTERN . VAL) ...),
11001 where PATTERN is a regular expression matching a program name,
11002 VAL is a coding system, a cons of coding systems, or a function symbol.
11003 If VAL is a coding system, it is used for both decoding what received
11004 from the program and encoding what sent to the program.
11005 If VAL is a cons of coding systems, the car part is used for decoding,
11006 and the cdr part is used for encoding.
11007 If VAL is a function symbol, the function must return a coding system
11008 or a cons of coding systems which are used as above.
11009
11010 See also the function `find-operation-coding-system'.  */);
11011   Vprocess_coding_system_alist = Qnil;
11012
11013   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11014                doc: /*
11015 Alist to decide a coding system to use for a network I/O operation.
11016 The format is ((PATTERN . VAL) ...),
11017 where PATTERN is a regular expression matching a network service name
11018 or is a port number to connect to,
11019 VAL is a coding system, a cons of coding systems, or a function symbol.
11020 If VAL is a coding system, it is used for both decoding what received
11021 from the network stream and encoding what sent to the network stream.
11022 If VAL is a cons of coding systems, the car part is used for decoding,
11023 and the cdr part is used for encoding.
11024 If VAL is a function symbol, the function must return a coding system
11025 or a cons of coding systems which are used as above.
11026
11027 See also the function `find-operation-coding-system'.  */);
11028   Vnetwork_coding_system_alist = Qnil;
11029
11030   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11031                doc: /* Coding system to use with system messages.
11032 Also used for decoding keyboard input on X Window system.  */);
11033   Vlocale_coding_system = Qnil;
11034
11035   /* The eol mnemonics are reset in startup.el system-dependently.  */
11036   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11037                doc: /*
11038 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11039   eol_mnemonic_unix = build_pure_c_string (":");
11040
11041   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11042                doc: /*
11043 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11044   eol_mnemonic_dos = build_pure_c_string ("\\");
11045
11046   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11047                doc: /*
11048 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11049   eol_mnemonic_mac = build_pure_c_string ("/");
11050
11051   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11052                doc: /*
11053 *String displayed in mode line when end-of-line format is not yet determined.  */);
11054   eol_mnemonic_undecided = build_pure_c_string (":");
11055
11056   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11057                doc: /*
11058 *Non-nil enables character translation while encoding and decoding.  */);
11059   Venable_character_translation = Qt;
11060
11061   DEFVAR_LISP ("standard-translation-table-for-decode",
11062                Vstandard_translation_table_for_decode,
11063                doc: /* Table for translating characters while decoding.  */);
11064   Vstandard_translation_table_for_decode = Qnil;
11065
11066   DEFVAR_LISP ("standard-translation-table-for-encode",
11067                Vstandard_translation_table_for_encode,
11068                doc: /* Table for translating characters while encoding.  */);
11069   Vstandard_translation_table_for_encode = Qnil;
11070
11071   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11072                doc: /* Alist of charsets vs revision numbers.
11073 While encoding, if a charset (car part of an element) is found,
11074 designate it with the escape sequence identifying revision (cdr part
11075 of the element).  */);
11076   Vcharset_revision_table = Qnil;
11077
11078   DEFVAR_LISP ("default-process-coding-system",
11079                Vdefault_process_coding_system,
11080                doc: /* Cons of coding systems used for process I/O by default.
11081 The car part is used for decoding a process output,
11082 the cdr part is used for encoding a text to be sent to a process.  */);
11083   Vdefault_process_coding_system = Qnil;
11084
11085   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11086                doc: /*
11087 Table of extra Latin codes in the range 128..159 (inclusive).
11088 This is a vector of length 256.
11089 If Nth element is non-nil, the existence of code N in a file
11090 \(or output of subprocess) doesn't prevent it to be detected as
11091 a coding system of ISO 2022 variant which has a flag
11092 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11093 or reading output of a subprocess.
11094 Only 128th through 159th elements have a meaning.  */);
11095   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11096
11097   DEFVAR_LISP ("select-safe-coding-system-function",
11098                Vselect_safe_coding_system_function,
11099                doc: /*
11100 Function to call to select safe coding system for encoding a text.
11101
11102 If set, this function is called to force a user to select a proper
11103 coding system which can encode the text in the case that a default
11104 coding system used in each operation can't encode the text.  The
11105 function should take care that the buffer is not modified while
11106 the coding system is being selected.
11107
11108 The default value is `select-safe-coding-system' (which see).  */);
11109   Vselect_safe_coding_system_function = Qnil;
11110
11111   DEFVAR_BOOL ("coding-system-require-warning",
11112                coding_system_require_warning,
11113                doc: /* Internal use only.
11114 If non-nil, on writing a file, `select-safe-coding-system-function' is
11115 called even if `coding-system-for-write' is non-nil.  The command
11116 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11117   coding_system_require_warning = 0;
11118
11119
11120   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11121                inhibit_iso_escape_detection,
11122                doc: /*
11123 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11124
11125 When Emacs reads text, it tries to detect how the text is encoded.
11126 This code detection is sensitive to escape sequences.  If Emacs sees
11127 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11128 of the ISO2022 encodings, and decodes text by the corresponding coding
11129 system (e.g. `iso-2022-7bit').
11130
11131 However, there may be a case that you want to read escape sequences in
11132 a file as is.  In such a case, you can set this variable to non-nil.
11133 Then the code detection will ignore any escape sequences, and no text is
11134 detected as encoded in some ISO-2022 encoding.  The result is that all
11135 escape sequences become visible in a buffer.
11136
11137 The default value is nil, and it is strongly recommended not to change
11138 it.  That is because many Emacs Lisp source files that contain
11139 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11140 in Emacs's distribution, and they won't be decoded correctly on
11141 reading if you suppress escape sequence detection.
11142
11143 The other way to read escape sequences in a file without decoding is
11144 to explicitly specify some coding system that doesn't use ISO-2022
11145 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11146   inhibit_iso_escape_detection = 0;
11147
11148   DEFVAR_BOOL ("inhibit-null-byte-detection",
11149                inhibit_null_byte_detection,
11150                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11151 By default, Emacs treats it as binary data, and does not attempt to
11152 decode it.  The effect is as if you specified `no-conversion' for
11153 reading that text.
11154
11155 Set this to non-nil when a regular text happens to include null bytes.
11156 Examples are Index nodes of Info files and null-byte delimited output
11157 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11158 decode text as usual.  */);
11159   inhibit_null_byte_detection = 0;
11160
11161   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11162                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11163 Internal use only.  Removed after the experimental optimizer gets stable. */);
11164   disable_ascii_optimization = 0;
11165
11166   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11167                doc: /* Char table for translating self-inserting characters.
11168 This is applied to the result of input methods, not their input.
11169 See also `keyboard-translate-table'.
11170
11171 Use of this variable for character code unification was rendered
11172 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11173 internal character representation.  */);
11174     Vtranslation_table_for_input = Qnil;
11175
11176   {
11177     Lisp_Object args[coding_arg_undecided_max];
11178     Lisp_Object plist[16];
11179     int i;
11180
11181     for (i = 0; i < coding_arg_undecided_max; i++)
11182       args[i] = Qnil;
11183
11184     plist[0] = intern_c_string (":name");
11185     plist[1] = args[coding_arg_name] = Qno_conversion;
11186     plist[2] = intern_c_string (":mnemonic");
11187     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11188     plist[4] = intern_c_string (":coding-type");
11189     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11190     plist[6] = intern_c_string (":ascii-compatible-p");
11191     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11192     plist[8] = intern_c_string (":default-char");
11193     plist[9] = args[coding_arg_default_char] = make_number (0);
11194     plist[10] = intern_c_string (":for-unibyte");
11195     plist[11] = args[coding_arg_for_unibyte] = Qt;
11196     plist[12] = intern_c_string (":docstring");
11197     plist[13] = build_pure_c_string ("Do no conversion.\n\
11198 \n\
11199 When you visit a file with this coding, the file is read into a\n\
11200 unibyte buffer as is, thus each byte of a file is treated as a\n\
11201 character.");
11202     plist[14] = intern_c_string (":eol-type");
11203     plist[15] = args[coding_arg_eol_type] = Qunix;
11204     args[coding_arg_plist] = Flist (16, plist);
11205     Fdefine_coding_system_internal (coding_arg_max, args);
11206
11207     plist[1] = args[coding_arg_name] = Qundecided;
11208     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11209     plist[5] = args[coding_arg_coding_type] = Qundecided;
11210     /* This is already set.
11211        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11212     plist[8] = intern_c_string (":charset-list");
11213     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11214     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11215     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11216     plist[15] = args[coding_arg_eol_type] = Qnil;
11217     args[coding_arg_plist] = Flist (16, plist);
11218     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11219     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11220     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11221   }
11222
11223   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11224
11225   {
11226     int i;
11227
11228     for (i = 0; i < coding_category_max; i++)
11229       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11230   }
11231 #if defined (DOS_NT)
11232   system_eol_type = Qdos;
11233 #else
11234   system_eol_type = Qunix;
11235 #endif
11236   staticpro (&system_eol_type);
11237 }
11238
11239 char *
11240 emacs_strerror (int error_number)
11241 {
11242   char *str;
11243
11244   synchronize_system_messages_locale ();
11245   str = strerror (error_number);
11246
11247   if (! NILP (Vlocale_coding_system))
11248     {
11249       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11250                                                       Vlocale_coding_system,
11251                                                       0);
11252       str = SSDATA (dec);
11253     }
11254
11255   return str;
11256 }
11257
11258 #endif /* emacs */