src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2018 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
  134   Before using a coding system for code conversion (i.e. decoding and
  135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
  192   CODING->charbuf, the length of which should not exceed
  193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
  301
      /* Hash table of coding systems.  Presumably this is the table the
         general comments describe: looked up by a coding system symbol
         to get that coding system's vector of attributes -- TODO
         confirm.  */
  302 Lisp_Object Vcoding_system_hash_table;
  303
  304 /* Format of end-of-line decided by system.  This is Qunix on
  305    Unix and Mac, Qdos on DOS/Windows.
  306    This has an effect only for external encoding (i.e. for output to
  307    file and process), not for in-buffer or Lisp string encoding.  */
  308 static Lisp_Object system_eol_type;
  309
  310 #ifdef emacs
  311
  312 /* Coding systems are handed between Emacs Lisp programs and C
  313    internal routines by the following variable.  */
  314 /* Coding system to be used to encode text for terminal display when
  315    terminal coding system is nil.  */
  316 struct coding_system safe_terminal_coding;
  317
  318 #endif /* emacs */
  319
  320 /* Two special coding systems.  */
  321 static Lisp_Object Vsjis_coding_system;
  322 static Lisp_Object Vbig5_coding_system;
 323
  324 /* ISO2022 section */
  325
      /* Return, as a C integer (XINT), element REG of the vector stored
         in CODING's `coding_attr_iso_initial' attribute.  */
  326 #define CODING_ISO_INITIAL(coding, reg)                 \
  327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  328                      coding_attr_iso_initial),          \
  329                reg)))
  330
  331
      /* Return CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID is
         greater than CODING->max_charset_id or the stored value is 255.
         Both arguments are evaluated more than once.  */
  332 #define CODING_ISO_REQUEST(coding, charset_id)          \
  333   (((charset_id) <= (coding)->max_charset_id            \
  334     ? ((coding)->safe_charsets[charset_id] != 255       \
  335        ? (coding)->safe_charsets[charset_id]            \
  336        : -1)                                            \
  337     : -1))
  338
  339
      /* Accessors for the members of CODING->spec.iso_2022.  */
  340 #define CODING_ISO_FLAGS(coding)        \
  341   ((coding)->spec.iso_2022.flags)
  342 #define CODING_ISO_DESIGNATION(coding, reg)     \
  343   ((coding)->spec.iso_2022.current_designation[reg])
  344 #define CODING_ISO_INVOCATION(coding, plane)    \
  345   ((coding)->spec.iso_2022.current_invocation[plane])
  346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  347   ((coding)->spec.iso_2022.single_shifting)
  348 #define CODING_ISO_BOL(coding)  \
  349   ((coding)->spec.iso_2022.bol)
      /* -1 if CODING_ISO_INVOCATION for PLANE is negative; otherwise the
         CODING_ISO_DESIGNATION entry indexed by that invocation value.  */
  350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
  352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
      /* Unlike the other accessors, this one yields the address of the
         member.  */
  353 #define CODING_ISO_CMP_STATUS(coding)   \
  354   (&(coding)->spec.iso_2022.cmp_status)
  355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
  360 /* Control characters of ISO2022.  */
  361                         /* code */      /* function */
  362 #define ISO_CODE_SO     0x0E            /* shift-out */
  363 #define ISO_CODE_SI     0x0F            /* shift-in */
  364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  365 #define ISO_CODE_ESC    0x1B            /* escape */
  366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  369
  370 /* Every 1-byte code of ISO2022 is classified into one of the
  371    following classes.  */
  372 enum iso_code_class_type
  373   {
  374     ISO_control_0,              /* Control codes in the range
  375                                    0x00..0x1F and 0x7F, except for the
  376                                    following 4 codes.  */
  377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  381     ISO_control_1,              /* Control codes in the range
  382                                    0x80..0x9F, except for the
  383                                    following 3 codes.  */
  384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  391   };
 392
  393 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
  394     `iso-flags' attribute of an iso2022 coding system.  */
  395
  396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  397    instead of the correct short-form sequence (e.g. ESC $ A).  */
  398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  399
  400 /* If set, reset graphic planes and registers at end-of-line to the
  401    initial state.  */
  402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  403
  404 /* If set, reset graphic planes and registers before any control
  405    characters to the initial state.  */
  406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  407
  408 /* If set, encode by 7-bit environment.  */
  409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  410
  411 /* If set, use locking-shift function.  */
  412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  413
  414 /* If set, use single-shift function.  Overwrite
  415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  417
  418 /* If set, use designation escape sequence.  */
  419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  420
  421 /* If set, produce revision number sequence.  */
  422 #define CODING_ISO_FLAG_REVISION        0x0080
  423
  424 /* If set, produce ISO6429's direction specifying sequence.  */
  425 #define CODING_ISO_FLAG_DIRECTION       0x0100
  426
  427 /* If set, assume designation states are reset at beginning of line on
  428    output.  */
  429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  430
  431 /* If set, designation sequence should be placed at beginning of line
  432    on output.  */
  433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  434
  435 /* If set, do not encode unsafe characters on output.  */
  436 #define CODING_ISO_FLAG_SAFE            0x0800
  437
  438 /* If set, extra latin codes (128..159) are accepted as a valid code
  439    on input.  */
  440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  441
      /* The remaining flags carry no description here.  Bit 0x4000 is
         not assigned because CODING_ISO_FLAG_EUC_TW_SHIFT is commented
         out; bits 0x40000 and 0x80000 are not assigned either.  */
  442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  443
  444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
  445
  446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  447
  448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  449
  450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
  451
  452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  453
  454 /* A character to be produced on output if encoding of the original
  455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
  458 /* UTF-8 section */
      /* Accessor for CODING->spec.utf_8_bom.  */
  459 #define CODING_UTF_8_BOM(coding)        \
  460   ((coding)->spec.utf_8_bom)
  461
  462 /* UTF-16 section */
      /* Accessors for the members bom, endian and surrogate of
         CODING->spec.utf_16.  */
  463 #define CODING_UTF_16_BOM(coding)       \
  464   ((coding)->spec.utf_16.bom)
  465
  466 #define CODING_UTF_16_ENDIAN(coding)    \
  467   ((coding)->spec.utf_16.endian)
  468
  469 #define CODING_UTF_16_SURROGATE(coding) \
  470   ((coding)->spec.utf_16.surrogate)
  471
  472
  473 /* CCL section */
      /* Each of these fetches one element of CODING's attribute vector
         (CODING_ID_ATTRS of CODING->id).  CODING_CCL_VALIDS additionally
         applies SDATA to the fetched element.  */
  474 #define CODING_CCL_DECODER(coding)      \
  475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  476 #define CODING_CCL_ENCODER(coding)      \
  477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  478 #define CODING_CCL_VALIDS(coding)                                          \
  479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
  481 /* Index for each coding category in `coding_categories'.  The values
         are also used as bit positions by the CATEGORY_MASK_XXX macros,
         and coding_category_max is the number of categories.  */
  482
  483 enum coding_category
  484   {
  485     coding_category_iso_7,
  486     coding_category_iso_7_tight,
  487     coding_category_iso_8_1,
  488     coding_category_iso_8_2,
  489     coding_category_iso_7_else,
  490     coding_category_iso_8_else,
  491     coding_category_utf_8_auto,
  492     coding_category_utf_8_nosig,
  493     coding_category_utf_8_sig,
  494     coding_category_utf_16_auto,
  495     coding_category_utf_16_be,
  496     coding_category_utf_16_le,
  497     coding_category_utf_16_be_nosig,
  498     coding_category_utf_16_le_nosig,
  499     coding_category_charset,
  500     coding_category_sjis,
  501     coding_category_big5,
  502     coding_category_ccl,
  503     coding_category_emacs_mule,
  504     /* All above are targets of code detection.  */
  505     coding_category_raw_text,
  506     coding_category_undecided,
  507     coding_category_max
  508   };
 509
  510 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
         the single bit whose position is the corresponding
         enum coding_category value.  */
  511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  531
  532 /* This value is returned if detect_coding_mask () finds nothing other
  533    than ASCII characters.  It is the union of the masks of all
         categories that are targets of code detection, so it does not
         include CATEGORY_MASK_RAW_TEXT.  */
  534 #define CATEGORY_MASK_ANY               \
  535   (CATEGORY_MASK_ISO_7                  \
  536    | CATEGORY_MASK_ISO_7_TIGHT          \
  537    | CATEGORY_MASK_ISO_8_1              \
  538    | CATEGORY_MASK_ISO_8_2              \
  539    | CATEGORY_MASK_ISO_7_ELSE           \
  540    | CATEGORY_MASK_ISO_8_ELSE           \
  541    | CATEGORY_MASK_UTF_8_AUTO           \
  542    | CATEGORY_MASK_UTF_8_NOSIG          \
  543    | CATEGORY_MASK_UTF_8_SIG            \
  544    | CATEGORY_MASK_UTF_16_AUTO          \
  545    | CATEGORY_MASK_UTF_16_BE            \
  546    | CATEGORY_MASK_UTF_16_LE            \
  547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  549    | CATEGORY_MASK_CHARSET              \
  550    | CATEGORY_MASK_SJIS                 \
  551    | CATEGORY_MASK_BIG5                 \
  552    | CATEGORY_MASK_CCL                  \
  553    | CATEGORY_MASK_EMACS_MULE)
  554
  555
      /* Unions of the masks above, grouped by family.  */
  556 #define CATEGORY_MASK_ISO_7BIT \
  557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  558
  559 #define CATEGORY_MASK_ISO_8BIT \
  560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  561
  562 #define CATEGORY_MASK_ISO_ELSE \
  563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  564
  565 #define CATEGORY_MASK_ISO_ESCAPE        \
  566   (CATEGORY_MASK_ISO_7                  \
  567    | CATEGORY_MASK_ISO_7_TIGHT          \
  568    | CATEGORY_MASK_ISO_7_ELSE           \
  569    | CATEGORY_MASK_ISO_8_ELSE)
  570
  571 #define CATEGORY_MASK_ISO       \
  572   (  CATEGORY_MASK_ISO_7BIT     \
  573      | CATEGORY_MASK_ISO_8BIT   \
  574      | CATEGORY_MASK_ISO_ELSE)
  575
  576 #define CATEGORY_MASK_UTF_16            \
  577   (CATEGORY_MASK_UTF_16_AUTO            \
  578    | CATEGORY_MASK_UTF_16_BE            \
  579    | CATEGORY_MASK_UTF_16_LE            \
  580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  582
  583 #define CATEGORY_MASK_UTF_8     \
  584   (CATEGORY_MASK_UTF_8_AUTO     \
  585    | CATEGORY_MASK_UTF_8_NOSIG  \
  586    | CATEGORY_MASK_UTF_8_SIG)
 587
  588 /* Table of coding categories (Lisp symbols).  This variable is for
  589    internal use only.  */
  590 static Lisp_Object Vcoding_category_table;
  591
  592 /* Coding categories ordered by priority (coding_category_max entries).  */
  593 static enum coding_category coding_priorities[coding_category_max];
  594
  595 /* Nth element is a coding context for the coding system bound to the
  596    Nth coding category (coding_category_max entries).  */
  597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return 0 < encoded_flag + var;
 614 }
  615
      /* Set ATTRS to the attribute vector of CODING (CODING_ID_ATTRS of
         CODING->id) and CHARSET_LIST to CODING_ATTR_CHARSET_LIST of that
         vector.  ATTRS and CHARSET_LIST must be assignable lvalues.  */
  616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  617   do {                                                  \
  618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  620   } while (0)
 621
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
 645
 646
  647 /* Safely get one byte from the source text pointed by SRC which ends
  648    at SRC_END, and set C to that byte.  If there are not enough bytes
  649    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
  650    and a multibyte character is found at SRC, set C to the
  651    negative value of the character code.  The caller should declare
  652    and set these variables appropriately in advance:
  653         src, src_end, multibytep */
      /* The expansion also refers to `src_base', `consumed_chars' and
         `coding', and to the label `no_more_source', so the caller has
         to provide those as well.  Details:
         - When the source is exhausted, CODING_RESULT_INSUFFICIENT_SRC
           is recorded only if SRC_BASE < SRC.
         - If MULTIBYTEP and the byte is 0xC0 or 0xC1, it is combined
           with the following byte as ((byte & 1) << 6) | next; the
           following byte is fetched without comparing SRC to SRC_END.
         - For any other byte with the high bit set (if MULTIBYTEP), the
           whole character is read with string_char, C becomes its
           negated code, and CODING_RESULT_INVALID_SRC is recorded.
         - CONSUMED_CHARS is incremented once whenever a value is stored
           in C.  */
  654
  655 #define ONE_MORE_BYTE(c)                                \
  656   do {                                                  \
  657     if (src == src_end)                                 \
  658       {                                                 \
  659         if (src_base < src)                             \
  660           record_conversion_result                      \
  661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  662         goto no_more_source;                            \
  663       }                                                 \
  664     c = *src++;                                         \
  665     if (multibytep && (c & 0x80))                       \
  666       {                                                 \
  667         if ((c & 0xFE) == 0xC0)                         \
  668           c = ((c & 1) << 6) | *src++;                  \
  669         else                                            \
  670           {                                             \
  671             src--;                                      \
  672             c = - string_char (src, &src, NULL);        \
  673             record_conversion_result                    \
  674               (coding, CODING_RESULT_INVALID_SRC);      \
  675           }                                             \
  676       }                                                 \
  677     consumed_chars++;                                   \
  678   } while (0)
 679
  680 /* Safely get two bytes from the source text pointed by SRC which ends
  681    at SRC_END, and set C1 and C2 to those bytes while skipping the
  682    heading multibyte characters.  If there are not enough bytes in the
  683    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
  684    a multibyte character is found for C2, set C2 to -1 (the character
  685    code itself is not computed).  The caller should declare and set
  686    these variables appropriately in advance:
  687         src, src_end, multibytep
  688    It is intended that this macro is used in detect_coding_utf_16.  */
      /* For both C1 and C2, if MULTIBYTEP and the byte is 0xC0 or 0xC1,
         it is combined with the following byte as
         ((byte & 1) << 6) | next.  For C1, any other multibyte character
         is skipped using BYTES_BY_CHAR_HEAD and the fetch is retried.
         NOTE(review): in the C2 case SRC is not advanced past the
         remaining bytes of the multibyte character -- confirm that the
         caller does not depend on SRC afterwards.  Unlike ONE_MORE_BYTE,
         this macro does not touch `consumed_chars' and records no
         conversion result.  */
  689
  690 #define TWO_MORE_BYTES(c1, c2)                          \
  691   do {                                                  \
  692     do {                                                \
  693       if (src == src_end)                               \
  694         goto no_more_source;                            \
  695       c1 = *src++;                                      \
  696       if (multibytep && (c1 & 0x80))                    \
  697         {                                               \
  698           if ((c1 & 0xFE) == 0xC0)                      \
  699             c1 = ((c1 & 1) << 6) | *src++;              \
  700           else                                          \
  701             {                                           \
  702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  703               c1 = -1;                                  \
  704             }                                           \
  705         }                                               \
  706     } while (c1 < 0);                                   \
  707     if (src == src_end)                                 \
  708       goto no_more_source;                              \
  709     c2 = *src++;                                        \
  710     if (multibytep && (c2 & 0x80))                      \
  711       {                                                 \
  712         if ((c2 & 0xFE) == 0xC0)                        \
  713           c2 = ((c2 & 1) << 6) | *src++;                \
  714         else                                            \
  715           c2 = -1;                                      \
  716       }                                                 \
  717   } while (0)
 718
 719
 720 /* Store a byte C in the place pointed by DST and increment DST to the
 721    next free point, and increment PRODUCED_CHARS.  The caller should
 722    assure that C is 0..127, and declare and set the variable `dst'
 723    appropriately in advance.
 724 */
 725
 726
 727 #define EMIT_ONE_ASCII_BYTE(c)  \
 728   do {                          \
 729     produced_chars++;           \
 730     *dst++ = (c);               \
 731   } while (0)
 732
 733
 734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 735
 736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 737   do {                                  \
 738     produced_chars += 2;                \
 739     *dst++ = (c1), *dst++ = (c2);       \
 740   } while (0)
 741
 742
  743 /* Store a byte C in the place pointed by DST and increment DST to the
  744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
  745    store in an appropriate multibyte form.  The caller should
  746    declare and set the variables `dst' and `multibytep' appropriately
  747    in advance.  */
      /* `produced_chars' must be declared by the caller too.  In the
         MULTIBYTEP case a value of 0x80 or more is first converted with
         BYTE8_TO_CHAR and the result is written with
         CHAR_STRING_ADVANCE; PRODUCED_CHARS still grows by exactly 1.
         C is evaluated once.  */
  748
  749 #define EMIT_ONE_BYTE(c)                \
  750   do {                                  \
  751     produced_chars++;                   \
  752     if (multibytep)                     \
  753       {                                 \
  754         unsigned ch = (c);              \
  755         if (ch >= 0x80)                 \
  756           ch = BYTE8_TO_CHAR (ch);      \
  757         CHAR_STRING_ADVANCE (ch, dst);  \
  758       }                                 \
  759     else                                \
  760       *dst++ = (c);                     \
  761   } while (0)
 762
 763
  764 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
      /* PRODUCED_CHARS grows by 2.  C1 is stored before C2, and each
         argument is evaluated once.  */
  765
  766 #define EMIT_TWO_BYTES(c1, c2)          \
  767   do {                                  \
  768     produced_chars += 2;                \
  769     if (multibytep)                     \
  770       {                                 \
  771         unsigned ch;                    \
  772                                         \
  773         ch = (c1);                      \
  774         if (ch >= 0x80)                 \
  775           ch = BYTE8_TO_CHAR (ch);      \
  776         CHAR_STRING_ADVANCE (ch, dst);  \
  777         ch = (c2);                      \
  778         if (ch >= 0x80)                 \
  779           ch = BYTE8_TO_CHAR (ch);      \
  780         CHAR_STRING_ADVANCE (ch, dst);  \
  781       }                                 \
  782     else                                \
  783       {                                 \
  784         *dst++ = (c1);                  \
  785         *dst++ = (c2);                  \
  786       }                                 \
  787   } while (0)
 788
 789
 790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 791   do {                                  \
 792     EMIT_ONE_BYTE (c1);                 \
 793     EMIT_TWO_BYTES (c2, c3);            \
 794   } while (0)
 795
 796
 797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 798   do {                                          \
 799     EMIT_TWO_BYTES (c1, c2);                    \
 800     EMIT_TWO_BYTES (c3, c4);                    \
 801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* If there are at least BYTES length of room at dst, allocate memory
 896    for coding->destination and update dst and dst_end.  We don't have
 897    to take care of coding->source which will be relocated.  It is
 898    handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* An annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Write the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark `coding' as annotated.  A
   variable named `coding' must be in scope where this is used.

   The expansion deliberately does not end in a semicolon, so that
   the macro behaves like a single statement (e.g. between `if' and
   `else'); the user supplies the semicolon.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)

/* Write a composition annotation at BUF: the common header with
   length 5, followed by NBYTES and METHOD.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)


/* Write a charset annotation at BUF: the common header with length
   4, followed by the charset ID.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Bit flags stored in coding->eol_seen; they may be or'ed together.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
/* Classify the octet C by its high-order bits.  The bit pattern each
   predicate accepts is shown on the right.  */
#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)             /* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)    /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)    /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)    /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)    /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)    /* 111110xx */

/* The three octets of the UTF-8 byte order mark, in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
1147 /* Unlike the other detect_coding_XXX, this function counts the number
1148    of characters and checks the EOL format.  */
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   ptrdiff_t nchars = coding->head_ascii;
1160   int eol_seen = coding->eol_seen;
1161
1162   detect_info->checked |= CATEGORY_MASK_UTF_8;
1163   /* A coding system of this category is always ASCII compatible.  */
1164   src += nchars;
1165
1166   if (src == coding->source     /* BOM should be at the head.  */
1167       && src + 3 < src_end      /* BOM is 3-byte long.  */
1168       && src[0] == UTF_8_BOM_1
1169       && src[1] == UTF_8_BOM_2
1170       && src[2] == UTF_8_BOM_3)
1171     {
1172       bom_found = 1;
1173       src += 3;
1174       nchars++;
1175     }
1176
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4;
1180
1181       src_base = src;
1182       ONE_MORE_BYTE (c);
1183       if (c < 0 || UTF_8_1_OCTET_P (c))
1184         {
1185           nchars++;
1186           if (c == '\r')
1187             {
1188               if (src < src_end && *src == '\n')
1189                 {
1190                   eol_seen |= EOL_SEEN_CRLF;
1191                   src++;
1192                   nchars++;
1193                 }
1194               else
1195                 eol_seen |= EOL_SEEN_CR;
1196             }
1197           else if (c == '\n')
1198             eol_seen |= EOL_SEEN_LF;
1199           continue;
1200         }
1201       ONE_MORE_BYTE (c1);
1202       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1203         break;
1204       if (UTF_8_2_OCTET_LEADING_P (c))
1205         {
1206           nchars++;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c2);
1210       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1211         break;
1212       if (UTF_8_3_OCTET_LEADING_P (c))
1213         {
1214           nchars++;
1215           continue;
1216         }
1217       ONE_MORE_BYTE (c3);
1218       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1219         break;
1220       if (UTF_8_4_OCTET_LEADING_P (c))
1221         {
1222           nchars++;
1223           continue;
1224         }
1225       ONE_MORE_BYTE (c4);
1226       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227         break;
1228       if (UTF_8_5_OCTET_LEADING_P (c))
1229         {
1230           nchars++;
1231           continue;
1232         }
1233       break;
1234     }
1235   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1236   return 0;
1237
1238  no_more_source:
1239   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1240     {
1241       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1242       return 0;
1243     }
1244   if (bom_found)
1245     {
1246       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1247       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1248     }
1249   else
1250     {
1251       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1252       if (nchars < src_end - coding->source)
1253         /* The found characters are less than source bytes, which
1254            means that we found a valid non-ASCII characters.  */
1255         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1256     }
1257   coding->detected_utf8_bytes = src_base - coding->source;
1258   coding->detected_utf8_chars = nchars;
1259   return 1;
1260 }
1261
1262
1263 static void
1264 decode_coding_utf_8 (struct coding_system *coding)
1265 {
1266   const unsigned char *src = coding->source + coding->consumed;
1267   const unsigned char *src_end = coding->source + coding->src_bytes;
1268   const unsigned char *src_base;
1269   int *charbuf = coding->charbuf + coding->charbuf_used;
1270   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1271   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1272   bool multibytep = coding->src_multibyte;
1273   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1274   bool eol_dos
1275     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1276   int byte_after_cr = -1;
1277
1278   if (bom != utf_without_bom)
1279     {
1280       int c1, c2, c3;
1281
1282       src_base = src;
1283       ONE_MORE_BYTE (c1);
1284       if (! UTF_8_3_OCTET_LEADING_P (c1))
1285         src = src_base;
1286       else
1287         {
1288           ONE_MORE_BYTE (c2);
1289           if (! UTF_8_EXTRA_OCTET_P (c2))
1290             src = src_base;
1291           else
1292             {
1293               ONE_MORE_BYTE (c3);
1294               if (! UTF_8_EXTRA_OCTET_P (c3))
1295                 src = src_base;
1296               else
1297                 {
1298                   if ((c1 != UTF_8_BOM_1)
1299                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1300                     src = src_base;
1301                   else
1302                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1303                 }
1304             }
1305         }
1306     }
1307   CODING_UTF_8_BOM (coding) = utf_without_bom;
1308
1309   while (1)
1310     {
1311       int c, c1, c2, c3, c4, c5;
1312
1313       src_base = src;
1314       consumed_chars_base = consumed_chars;
1315
1316       if (charbuf >= charbuf_end)
1317         {
1318           if (byte_after_cr >= 0)
1319             src_base--;
1320           break;
1321         }
1322
1323       /* In the simple case, rapidly handle ordinary characters */
1324       if (multibytep && ! eol_dos
1325           && charbuf < charbuf_end - 6 && src < src_end - 6)
1326         {
1327           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1328             {
1329               c1 = *src;
1330               if (c1 & 0x80)
1331                 break;
1332               src++;
1333               consumed_chars++;
1334               *charbuf++ = c1;
1335
1336               c1 = *src;
1337               if (c1 & 0x80)
1338                 break;
1339               src++;
1340               consumed_chars++;
1341               *charbuf++ = c1;
1342
1343               c1 = *src;
1344               if (c1 & 0x80)
1345                 break;
1346               src++;
1347               consumed_chars++;
1348               *charbuf++ = c1;
1349
1350               c1 = *src;
1351               if (c1 & 0x80)
1352                 break;
1353               src++;
1354               consumed_chars++;
1355               *charbuf++ = c1;
1356             }
1357           /* If we handled at least one character, restart the main loop.  */
1358           if (src != src_base)
1359             continue;
1360         }
1361
1362       if (byte_after_cr >= 0)
1363         c1 = byte_after_cr, byte_after_cr = -1;
1364       else
1365         ONE_MORE_BYTE (c1);
1366       if (c1 < 0)
1367         {
1368           c = - c1;
1369         }
1370       else if (UTF_8_1_OCTET_P (c1))
1371         {
1372           if (eol_dos && c1 == '\r')
1373             ONE_MORE_BYTE (byte_after_cr);
1374           c = c1;
1375         }
1376       else
1377         {
1378           ONE_MORE_BYTE (c2);
1379           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1380             goto invalid_code;
1381           if (UTF_8_2_OCTET_LEADING_P (c1))
1382             {
1383               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1384               /* Reject overlong sequences here and below.  Encoders
1385                  producing them are incorrect, they can be misleading,
1386                  and they mess up read/write invariance.  */
1387               if (c < 128)
1388                 goto invalid_code;
1389             }
1390           else
1391             {
1392               ONE_MORE_BYTE (c3);
1393               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1394                 goto invalid_code;
1395               if (UTF_8_3_OCTET_LEADING_P (c1))
1396                 {
1397                   c = (((c1 & 0xF) << 12)
1398                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1399                   if (c < 0x800
1400                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1401                     goto invalid_code;
1402                 }
1403               else
1404                 {
1405                   ONE_MORE_BYTE (c4);
1406                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1407                     goto invalid_code;
1408                   if (UTF_8_4_OCTET_LEADING_P (c1))
1409                     {
1410                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1411                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1412                     if (c < 0x10000)
1413                       goto invalid_code;
1414                     }
1415                   else
1416                     {
1417                       ONE_MORE_BYTE (c5);
1418                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1419                         goto invalid_code;
1420                       if (UTF_8_5_OCTET_LEADING_P (c1))
1421                         {
1422                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1423                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1424                                | (c5 & 0x3F));
1425                           if ((c > MAX_CHAR) || (c < 0x200000))
1426                             goto invalid_code;
1427                         }
1428                       else
1429                         goto invalid_code;
1430                     }
1431                 }
1432             }
1433         }
1434
1435       *charbuf++ = c;
1436       continue;
1437
1438     invalid_code:
1439       src = src_base;
1440       consumed_chars = consumed_chars_base;
1441       ONE_MORE_BYTE (c);
1442       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1443     }
1444
1445  no_more_source:
1446   coding->consumed_char += consumed_chars_base;
1447   coding->consumed = src_base - coding->source;
1448   coding->charbuf_used = charbuf - coding->charbuf;
1449 }
1450
1451
1452 bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
1515 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1516    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1517
/* True if bits 10..15 of VAL match those of 0xD800, i.e. a 16-bit
   VAL lies in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) (((val) & 0xFC00) == 0xD800)

/* True if bits 10..15 of VAL match those of 0xDC00, i.e. a 16-bit
   VAL lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (((val) & 0xFC00) == 0xDC00)
1523
1524
1525 static bool
1526 detect_coding_utf_16 (struct coding_system *coding,
1527                       struct coding_detection_info *detect_info)
1528 {
1529   const unsigned char *src = coding->source;
1530   const unsigned char *src_end = coding->source + coding->src_bytes;
1531   bool multibytep = coding->src_multibyte;
1532   int c1, c2;
1533
1534   detect_info->checked |= CATEGORY_MASK_UTF_16;
1535   if (coding->mode & CODING_MODE_LAST_BLOCK
1536       && (coding->src_chars & 1))
1537     {
1538       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1539       return 0;
1540     }
1541
1542   TWO_MORE_BYTES (c1, c2);
1543   if ((c1 == 0xFF) && (c2 == 0xFE))
1544     {
1545       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1546                              | CATEGORY_MASK_UTF_16_AUTO);
1547       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1548                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1549                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1550     }
1551   else if ((c1 == 0xFE) && (c2 == 0xFF))
1552     {
1553       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1554                              | CATEGORY_MASK_UTF_16_AUTO);
1555       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1556                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1557                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1558     }
1559   else if (c2 < 0)
1560     {
1561       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1562       return 0;
1563     }
1564   else
1565     {
1566       /* We check the dispersion of Eth and Oth bytes where E is even and
1567          O is odd.  If both are high, we assume binary data.*/
1568       unsigned char e[256], o[256];
1569       unsigned e_num = 1, o_num = 1;
1570
1571       memset (e, 0, 256);
1572       memset (o, 0, 256);
1573       e[c1] = 1;
1574       o[c2] = 1;
1575
1576       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1577                                 |CATEGORY_MASK_UTF_16_BE
1578                                 | CATEGORY_MASK_UTF_16_LE);
1579
1580       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1581              != CATEGORY_MASK_UTF_16)
1582         {
1583           TWO_MORE_BYTES (c1, c2);
1584           if (c2 < 0)
1585             break;
1586           if (! e[c1])
1587             {
1588               e[c1] = 1;
1589               e_num++;
1590               if (e_num >= 128)
1591                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1592             }
1593           if (! o[c2])
1594             {
1595               o[c2] = 1;
1596               o_num++;
1597               if (o_num >= 128)
1598                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1599             }
1600         }
1601       return 0;
1602     }
1603
1604  no_more_source:
1605   return 1;
1606 }
1607
1608 static void
1609 decode_coding_utf_16 (struct coding_system *coding)
1610 {
1611   const unsigned char *src = coding->source + coding->consumed;
1612   const unsigned char *src_end = coding->source + coding->src_bytes;
1613   const unsigned char *src_base;
1614   int *charbuf = coding->charbuf + coding->charbuf_used;
1615   /* We may produces at most 3 chars in one loop.  */
1616   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1617   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1618   bool multibytep = coding->src_multibyte;
1619   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1620   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1621   int surrogate = CODING_UTF_16_SURROGATE (coding);
1622   bool eol_dos
1623     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1624   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1625
1626   if (bom == utf_with_bom)
1627     {
1628       int c, c1, c2;
1629
1630       src_base = src;
1631       ONE_MORE_BYTE (c1);
1632       ONE_MORE_BYTE (c2);
1633       c = (c1 << 8) | c2;
1634
1635       if (endian == utf_16_big_endian
1636           ? c != 0xFEFF : c != 0xFFFE)
1637         {
1638           /* The first two bytes are not BOM.  Treat them as bytes
1639              for a normal character.  */
1640           src = src_base;
1641         }
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644   else if (bom == utf_detect_bom)
1645     {
1646       /* We have already tried to detect BOM and failed in
1647          detect_coding.  */
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650
1651   while (1)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;
1656       consumed_chars_base = consumed_chars;
1657
1658       if (charbuf >= charbuf_end)
1659         {
1660           if (byte_after_cr1 >= 0)
1661             src_base -= 2;
1662           break;
1663         }
1664
1665       if (byte_after_cr1 >= 0)
1666         c1 = byte_after_cr1, byte_after_cr1 = -1;
1667       else
1668         ONE_MORE_BYTE (c1);
1669       if (c1 < 0)
1670         {
1671           *charbuf++ = -c1;
1672           continue;
1673         }
1674       if (byte_after_cr2 >= 0)
1675         c2 = byte_after_cr2, byte_after_cr2 = -1;
1676       else
1677         ONE_MORE_BYTE (c2);
1678       if (c2 < 0)
1679         {
1680           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1681           *charbuf++ = -c2;
1682           continue;
1683         }
1684       c = (endian == utf_16_big_endian
1685            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1686
1687       if (surrogate)
1688         {
1689           if (! UTF_16_LOW_SURROGATE_P (c))
1690             {
1691               if (endian == utf_16_big_endian)
1692                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1693               else
1694                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1695               *charbuf++ = c1;
1696               *charbuf++ = c2;
1697               if (UTF_16_HIGH_SURROGATE_P (c))
1698                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1699               else
1700                 *charbuf++ = c;
1701             }
1702           else
1703             {
1704               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1705               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1706               *charbuf++ = 0x10000 + c;
1707             }
1708         }
1709       else
1710         {
1711           if (UTF_16_HIGH_SURROGATE_P (c))
1712             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1713           else
1714             {
1715               if (eol_dos && c == '\r')
1716                 {
1717                   ONE_MORE_BYTE (byte_after_cr1);
1718                   ONE_MORE_BYTE (byte_after_cr2);
1719                 }
1720               *charbuf++ = c;
1721             }
1722         }
1723     }
1724
1725  no_more_source:
1726   coding->consumed_char += consumed_chars_base;
1727   coding->consumed = src_base - coding->source;
1728   coding->charbuf_used = charbuf - coding->charbuf;
1729 }
1730
1731 static bool
1732 encode_coding_utf_16 (struct coding_system *coding)
1733 {
1734   bool multibytep = coding->dst_multibyte;
1735   int *charbuf = coding->charbuf;
1736   int *charbuf_end = charbuf + coding->charbuf_used;
1737   unsigned char *dst = coding->destination + coding->produced;
1738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1739   int safe_room = 8;
1740   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1741   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1742   ptrdiff_t produced_chars = 0;
1743   int c;
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xFF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in a data sent to a
1825    process).  But, it is possible to have a text externally in this
1826    format (i.e. by encoding by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of composition method (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded by two-byte of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form but in these special format:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
/* Table indexed by a byte value.  The code in this section reads the
   entry as the total number of bytes of the emacs-mule sequence that
   begins with that byte (see detect_coding_emacs_mule and
   emacs_mule_char).  */
char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Return true if a text is encoded in 'emacs-mule'.  */
1867
1868 static bool
1869 detect_coding_emacs_mule (struct coding_system *coding,
1870                           struct coding_detection_info *detect_info)
1871 {
1872   const unsigned char *src = coding->source, *src_base;
1873   const unsigned char *src_end = coding->source + coding->src_bytes;
1874   bool multibytep = coding->src_multibyte;
1875   ptrdiff_t consumed_chars = 0;
1876   int c;
1877   int found = 0;
1878
1879   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1880   /* A coding system of this category is always ASCII compatible.  */
1881   src += coding->head_ascii;
1882
1883   while (1)
1884     {
1885       src_base = src;
1886       ONE_MORE_BYTE (c);
1887       if (c < 0)
1888         continue;
1889       if (c == 0x80)
1890         {
1891           /* Perhaps the start of composite character.  We simply skip
1892              it because analyzing it is too heavy for detecting.  But,
1893              at least, we check that the composite character
1894              constitutes of more than 4 bytes.  */
1895           const unsigned char *src_start;
1896
1897         repeat:
1898           src_start = src;
1899           do
1900             {
1901               ONE_MORE_BYTE (c);
1902             }
1903           while (c >= 0xA0);
1904
1905           if (src - src_start <= 4)
1906             break;
1907           found = CATEGORY_MASK_EMACS_MULE;
1908           if (c == 0x80)
1909             goto repeat;
1910         }
1911
1912       if (c < 0x80)
1913         {
1914           if (c < 0x20
1915               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1916             break;
1917         }
1918       else
1919         {
1920           int more_bytes = emacs_mule_bytes[c] - 1;
1921
1922           while (more_bytes > 0)
1923             {
1924               ONE_MORE_BYTE (c);
1925               if (c < 0xA0)
1926                 {
1927                   src--;        /* Unread the last byte.  */
1928                   break;
1929                 }
1930               more_bytes--;
1931             }
1932           if (more_bytes != 0)
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935         }
1936     }
1937   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938   return 0;
1939
1940  no_more_source:
1941   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1942     {
1943       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1944       return 0;
1945     }
1946   detect_info->found |= found;
1947   return 1;
1948 }
1949
1950
1951 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1952    character.  If CMP_STATUS indicates that we must expect MSEQ or
1953    RULE described above, decode it and return the negative value of
1954    the decoded character or rule.  If an invalid byte is found, return
1955    -1.  If SRC is too short, return -2.  */
1956
1957 static int
1958 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1959                  int *nbytes, int *nchars, int *id,
1960                  struct composition_status *cmp_status)
1961 {
1962   const unsigned char *src_end = coding->source + coding->src_bytes;
1963   const unsigned char *src_base = src;
1964   bool multibytep = coding->src_multibyte;
1965   int charset_ID;
1966   unsigned code;
1967   int c;
1968   ptrdiff_t consumed_chars = 0;
1969   bool mseq_found = 0;
1970
1971   ONE_MORE_BYTE (c);
1972   if (c < 0)
1973     {
1974       c = -c;
1975       charset_ID = emacs_mule_charset[0];
1976     }
1977   else
1978     {
1979       if (c >= 0xA0)
1980         {
1981           if (cmp_status->state != COMPOSING_NO
1982               && cmp_status->old_form)
1983             {
1984               if (cmp_status->state == COMPOSING_CHAR)
1985                 {
1986                   if (c == 0xA0)
1987                     {
1988                       ONE_MORE_BYTE (c);
1989                       c -= 0x80;
1990                       if (c < 0)
1991                         goto invalid_code;
1992                     }
1993                   else
1994                     c -= 0x20;
1995                   mseq_found = 1;
1996                 }
1997               else
1998                 {
1999                   *nbytes = src - src_base;
2000                   *nchars = consumed_chars;
2001                   return -c;
2002                 }
2003             }
2004           else
2005             goto invalid_code;
2006         }
2007
2008       switch (emacs_mule_bytes[c])
2009         {
2010         case 2:
2011           if ((charset_ID = emacs_mule_charset[c]) < 0)
2012             goto invalid_code;
2013           ONE_MORE_BYTE (c);
2014           if (c < 0xA0)
2015             goto invalid_code;
2016           code = c & 0x7F;
2017           break;
2018
2019         case 3:
2020           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2021               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2022             {
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2025                 goto invalid_code;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code = c & 0x7F;
2030             }
2031           else
2032             {
2033               if ((charset_ID = emacs_mule_charset[c]) < 0)
2034                 goto invalid_code;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code = (c & 0x7F) << 8;
2039               ONE_MORE_BYTE (c);
2040               if (c < 0xA0)
2041                 goto invalid_code;
2042               code |= c & 0x7F;
2043             }
2044           break;
2045
2046         case 4:
2047           ONE_MORE_BYTE (c);
2048           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2049             goto invalid_code;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code = (c & 0x7F) << 8;
2054           ONE_MORE_BYTE (c);
2055           if (c < 0xA0)
2056             goto invalid_code;
2057           code |= c & 0x7F;
2058           break;
2059
2060         case 1:
2061           code = c;
2062           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2063           break;
2064
2065         default:
2066           emacs_abort ();
2067         }
2068       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2069                           CHARSET_FROM_ID (charset_ID), code, c);
2070       if (c < 0)
2071         goto invalid_code;
2072     }
2073   *nbytes = src - src_base;
2074   *nchars = consumed_chars;
2075   if (id)
2076     *id = charset_ID;
2077   return (mseq_found ? -c : c);
2078
2079  no_more_source:
2080   return -2;
2081
2082  invalid_code:
2083   return -1;
2084 }
2085
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2088
2089 /* Handle these composition sequence ('|': the end of header elements,
2090    BYTES and CHARS >= 0xA0):
2091
2092    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2093    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2094    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2095
2096    and these old forms:
2097
2098    (4) relative composition: 0x80 | MSEQ ... MSEQ
2099    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2100
2101    When the starter 0x80 and the following header elements are found,
2102    this annotation header is produced.
2103
2104         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2105
2106    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108
2109    Then, upon reading the following elements, these codes are produced
2110    until the composition end is found:
2111
2112    (1) CHAR ... CHAR
2113    (2) ALT ... ALT CHAR ... CHAR
2114    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2115    (4) CHAR ... CHAR
2116    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2117
2118    When the composition end is found, LENGTH and NCHARS in the
2119    annotation header are updated as below:
2120
2121    (1) LENGTH: unchanged, NCHARS: unchanged
2122    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2125    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2126
2127    If an error is found while composing, the annotation header is
2128    changed to the original composition header (plus filler -1s) as
2129    below:
2130
2131    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2132    (5)          [ 0x80 0xFF -1 -1 -1 ]
2133
2134    and the sequence [ -2 DECODED-RULE ] is changed to the original
2135    byte sequence as below:
2136         o the original byte sequence is B: [ B -1 ]
2137         o the original byte sequence is B1 B2: [ B1 B2 ]
2138
2139    Most of the routines are implemented by macros because many
2140    variables and labels in the caller decode_coding_emacs_mule must be
2141    accessible, and they are usually called just once (thus doesn't
2142    increase the size of compiled object).  */
2143
/* Decode the Emacs 20 style composition rule byte held in C and set
   RULE to the result.  C is reduced by 0xA0 in place and must then
   lie in 0..80, otherwise control jumps to `invalid_code'.  The
   quotient and the remainder of C by 9 give the two reference
   points; a value of 4 is replaced by 10 in either of them.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (0 <= c && c < 81))                           \
      goto invalid_code;                                \
    gref = c / 9;                                       \
    nref = c % 9;                                       \
    gref = (gref == 4 ? 10 : gref);                     \
    nref = (nref == 4 ? 10 : nref);                     \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2160
2161
/* Decode an Emacs 21 style composition rule and set RULE to the
   result.  The rule is two bytes long: the one already held in C and
   the next one, which is fetched with ONE_MORE_BYTE into C.  Each
   byte minus 0x20 must lie in 0..80, otherwise control jumps to
   `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (! (0 <= gref && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (0 <= nref && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2179
2180
/* Start of Emacs 21 style format.  On entry C holds the METHOD byte
   (0xF2 plus the composition method).  The next two bytes are BYTES
   and CHARS, each 0xA0 plus a value: the byte length of this
   composition information and the number of characters composed by
   it.  Jump to `invalid_code' if either value is out of range;
   otherwise initialize *CMP_STATUS and produce the annotation
   header.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (0 < nchars && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2212
2213
/* Start of Emacs 20 style format for relative composition.  Set up
   *CMP_STATUS for an old-form composition with no characters read
   yet and produce an annotation header whose NCHARS and NBYTES are
   both 0.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2225
2226
/* Start of Emacs 20 style format for rule-base composition.  Set up
   *CMP_STATUS for an old-form composition with rules and no
   characters read yet, and produce an annotation header whose NCHARS
   and NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2238
2239
/* Called right after the composition starter 0x80 has been read.
   Read the next byte into C and begin the composition it selects:
   an Emacs 21 style one when C is 0xF2 plus a composition method,
   an Emacs 20 style rule-base one when C is 0xFF, and an Emacs 20
   style relative one when C is in 0xA0..0xBF.  Any other byte makes
   control jump to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (COMPOSITION_RELATIVE <= c - 0xF2                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else if (0xA0 <= c && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* C is already the first component; make     \
           the caller read it again.  */                \
        src = current_src;                              \
      }                                                 \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2263
/* Finish the composition described by *CMP_STATUS normally.  The
   annotation header starts CMP_STATUS->length elements before
   CHARBUF.  For an old-form composition store the number of
   characters in its third element; for a new-form composition whose
   method is greater than COMPOSITION_RELATIVE store the third element
   minus the length in its first element.  Then leave the composing
   state.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int idx = - cmp_status->length;                             \
    int *annotation = charbuf + idx;                            \
                                                                \
    if (cmp_status->old_form)                                   \
      annotation[2] = cmp_status->nchars;                       \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      annotation[0] = annotation[2] - cmp_status->length;       \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2274
2275
2276 static int
2277 emacs_mule_finish_composition (int *charbuf,
2278                                struct composition_status *cmp_status)
2279 {
2280   int idx = - cmp_status->length;
2281   int new_chars;
2282
2283   if (cmp_status->old_form && cmp_status->nchars > 0)
2284     {
2285       charbuf[idx + 2] = cmp_status->nchars;
2286       new_chars = 0;
2287       if (cmp_status->method == COMPOSITION_WITH_RULE
2288           && cmp_status->state == COMPOSING_CHAR)
2289         {
2290           /* The last rule was invalid.  */
2291           int rule = charbuf[-1] + 0xA0;
2292
2293           charbuf[-2] = BYTE8_TO_CHAR (rule);
2294           charbuf[-1] = -1;
2295           new_chars = 1;
2296         }
2297     }
2298   else
2299     {
2300       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2301
2302       if (cmp_status->method == COMPOSITION_WITH_RULE)
2303         {
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2305           charbuf[idx++] = -3;
2306           charbuf[idx++] = 0;
2307           new_chars = 1;
2308         }
2309       else
2310         {
2311           int nchars = charbuf[idx + 1] + 0xA0;
2312           int nbytes = charbuf[idx + 2] + 0xA0;
2313
2314           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2317           charbuf[idx++] = -1;
2318           new_chars = 4;
2319         }
2320     }
2321   cmp_status->state = COMPOSING_NO;
2322   return new_chars;
2323 }
2324
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and advance char_offset by the
   number that function returns.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state == COMPOSING_NO)                                \
      break;                                                              \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status);   \
  } while (0)
2330
2331
2332 static void
2333 decode_coding_emacs_mule (struct coding_system *coding)
2334 {
2335   const unsigned char *src = coding->source + coding->consumed;
2336   const unsigned char *src_end = coding->source + coding->src_bytes;
2337   const unsigned char *src_base;
2338   int *charbuf = coding->charbuf + coding->charbuf_used;
2339   /* We may produce two annotations (charset and composition) in one
2340      loop and one more charset annotation at the end.  */
2341   int *charbuf_end
2342     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2343       /* We can produce up to 2 characters in a loop.  */
2344       - 1;
2345   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2346   bool multibytep = coding->src_multibyte;
2347   ptrdiff_t char_offset = coding->produced_char;
2348   ptrdiff_t last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   bool eol_dos
2351     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   if (cmp_status->state != COMPOSING_NO)
2356     {
2357       int i;
2358
2359       if (charbuf_end - charbuf < cmp_status->length)
2360         emacs_abort ();
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c;
2369       int id UNINIT;
2370
2371       src_base = src;
2372       consumed_chars_base = consumed_chars;
2373
2374       if (charbuf >= charbuf_end)
2375         {
2376           if (byte_after_cr >= 0)
2377             src_base--;
2378           break;
2379         }
2380
2381       if (byte_after_cr >= 0)
2382         c = byte_after_cr, byte_after_cr = -1;
2383       else
2384         ONE_MORE_BYTE (c);
2385
2386       if (c < 0 || c == 0x80)
2387         {
2388           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2389           if (c < 0)
2390             {
2391               *charbuf++ = -c;
2392               char_offset++;
2393             }
2394           else
2395             DECODE_EMACS_MULE_COMPOSITION_START ();
2396           continue;
2397         }
2398
2399       if (c < 0x80)
2400         {
2401           if (eol_dos && c == '\r')
2402             ONE_MORE_BYTE (byte_after_cr);
2403           id = charset_ascii;
2404           if (cmp_status->state != COMPOSING_NO)
2405             {
2406               if (cmp_status->old_form)
2407                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2409                 cmp_status->ncomps--;
2410             }
2411         }
2412       else
2413         {
2414           int nchars UNINIT, nbytes UNINIT;
2415           /* emacs_mule_char can load a charset map from a file, which
2416              allocates a large structure and might cause buffer text
2417              to be relocated as result.  Thus, we need to remember the
2418              original pointer to buffer text, and fix up all related
2419              pointers after the call.  */
2420           const unsigned char *orig = coding->source;
2421           ptrdiff_t offset;
2422
2423           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2424                                cmp_status);
2425           offset = coding->source - orig;
2426           if (offset)
2427             {
2428               src += offset;
2429               src_base += offset;
2430               src_end += offset;
2431             }
2432           if (c < 0)
2433             {
2434               if (c == -1)
2435                 goto invalid_code;
2436               if (c == -2)
2437                 break;
2438             }
2439           src = src_base + nbytes;
2440           consumed_chars = consumed_chars_base + nchars;
2441           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2442             cmp_status->ncomps -= nchars;
2443         }
2444
2445       /* Now if C >= 0, we found a normally encoded character, if C <
2446          0, we found an old-style composition component character or
2447          rule.  */
2448
2449       if (cmp_status->state == COMPOSING_NO)
2450         {
2451           if (last_id != id)
2452             {
2453               if (last_id != charset_ascii)
2454                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2455                                   last_id);
2456               last_id = id;
2457               last_offset = char_offset;
2458             }
2459           *charbuf++ = c;
2460           char_offset++;
2461         }
2462       else if (cmp_status->state == COMPOSING_CHAR)
2463         {
2464           if (cmp_status->old_form)
2465             {
2466               if (c >= 0)
2467                 {
2468                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2469                   *charbuf++ = c;
2470                   char_offset++;
2471                 }
2472               else
2473                 {
2474                   *charbuf++ = -c;
2475                   cmp_status->nchars++;
2476                   cmp_status->length++;
2477                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2478                     EMACS_MULE_COMPOSITION_END ();
2479                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2480                     cmp_status->state = COMPOSING_RULE;
2481                 }
2482             }
2483           else
2484             {
2485               *charbuf++ = c;
2486               cmp_status->length++;
2487               cmp_status->nchars--;
2488               if (cmp_status->nchars == 0)
2489                 EMACS_MULE_COMPOSITION_END ();
2490             }
2491         }
2492       else if (cmp_status->state == COMPOSING_RULE)
2493         {
2494           int rule;
2495
2496           if (c >= 0)
2497             {
2498               EMACS_MULE_COMPOSITION_END ();
2499               *charbuf++ = c;
2500               char_offset++;
2501             }
2502           else
2503             {
2504               c = -c;
2505               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2506               if (rule < 0)
2507                 goto invalid_code;
2508               *charbuf++ = -2;
2509               *charbuf++ = rule;
2510               cmp_status->length += 2;
2511               cmp_status->state = COMPOSING_CHAR;
2512             }
2513         }
2514       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2515         {
2516           *charbuf++ = c;
2517           cmp_status->length++;
2518           if (cmp_status->ncomps == 0)
2519             cmp_status->state = COMPOSING_CHAR;
2520           else if (cmp_status->ncomps > 0)
2521             {
2522               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2523                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2524             }
2525           else
2526             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2527         }
2528       else                      /* COMPOSING_COMPONENT_RULE */
2529         {
2530           int rule;
2531
2532           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2533           if (rule < 0)
2534             goto invalid_code;
2535           *charbuf++ = -2;
2536           *charbuf++ = rule;
2537           cmp_status->length += 2;
2538           cmp_status->ncomps--;
2539           if (cmp_status->ncomps > 0)
2540             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2541           else
2542             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543         }
2544       continue;
2545
2546     invalid_code:
2547       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2548       src = src_base;
2549       consumed_chars = consumed_chars_base;
2550       ONE_MORE_BYTE (c);
2551       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2552       char_offset++;
2553     }
2554
2555  no_more_source:
2556   if (cmp_status->state != COMPOSING_NO)
2557     {
2558       if (coding->mode & CODING_MODE_LAST_BLOCK)
2559         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560       else
2561         {
2562           int i;
2563
2564           charbuf -= cmp_status->length;
2565           for (i = 0; i < cmp_status->length; i++)
2566             cmp_status->carryover[i] = charbuf[i];
2567         }
2568     }
2569   if (last_id != charset_ascii)
2570     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2571   coding->consumed_char += consumed_chars_base;
2572   coding->consumed = src_base - coding->source;
2573   coding->charbuf_used = charbuf - coding->charbuf;
2574 }
2575
2576
/* Store in CODES (an array of at least two bytes) the leading code(s)
   for the charset whose emacs-mule ID is ID.

   An ID below 0xA0 is itself the only leading code; CODES[1] is then
   set to 0.  A larger ID is preceded by one of the prefix codes
   0x9A, 0x9B, 0x9C or 0x9D (chosen by the range ID falls in), which
   goes into CODES[0], with ID itself in CODES[1].

   ID and CODES may be evaluated more than once, so they must not have
   side effects.  The expansion has no trailing semicolon: the caller
   supplies it, so that `EMACS_MULE_LEADING_CODES (id, codes);' is
   exactly one statement and is safe as the body of an unbraced `if'
   that is followed by `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2590
2591
/* Encode the characters in CODING->charbuf in the emacs-mule format,
   storing the bytes in CODING->destination after the CODING->produced
   bytes that are already there.  Negative elements of the charbuf
   introduce annotations; they are consumed here and produce no output.

   Before returning, record CODING_RESULT_SUCCESS, add the number of
   characters produced to CODING->produced_char and set
   CODING->produced to the total number of destination bytes.
   The return value is always 0.  */
2592 static bool
2593 encode_coding_emacs_mule (struct coding_system *coding)
2594 {
       /* NOTE(review): MULTIBYTEP is never named again in this function,
          and PRODUCED_CHARS is only initialized here and read at the
          end; presumably the EMIT_ONE_BYTE / EMIT_ONE_ASCII_BYTE macros
          use both by name -- confirm before renaming any local.  */
2595   bool multibytep = coding->dst_multibyte;
2596   int *charbuf = coding->charbuf;
2597   int *charbuf_end = charbuf + coding->charbuf_used;
2598   unsigned char *dst = coding->destination + coding->produced;
2599   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each charbuf
          element is handled.  */
2600   int safe_room = 8;
2601   ptrdiff_t produced_chars = 0;
2602   Lisp_Object attrs, charset_list;
2603   int c;
       /* Charset ID taken from the latest charset annotation, or -1 if
          there is none that is usable.  */
2604   int preferred_charset_id = -1;
2605
2606   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against Vemacs_mule_charset_list, updating the
          coding system's attributes if they name a different list.  */
2607   if (! EQ (charset_list, Vemacs_mule_charset_list))
2608     {
2609       charset_list = Vemacs_mule_charset_list;
2610       ASET (attrs, coding_attr_charset_list, charset_list);
2611     }
2612
2613   while (charbuf < charbuf_end)
2614     {
2615       ASSURE_DESTINATION (safe_room);
2616       c = *charbuf++;
2617
2618       if (c < 0)
2619         {
2620           /* Handle an annotation.  */
               /* C is negative here; CHARBUF now points at the element
                  holding the annotation kind, and -C - 1 elements are
                  skipped once the annotation has been handled.  */
2621           switch (*charbuf)
2622             {
2623             case CODING_ANNOTATE_COMPOSITION_MASK:
2624               /* Not yet implemented.  */
2625               break;
2626             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): index 3 is relative to the element
                      holding the annotation kind; confirm it matches the
                      layout written by the code that adds charset
                      annotations.  */
2627               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not in
                      CHARSET_LIST.  */
2628               if (preferred_charset_id >= 0
2629                   && NILP (Fmemq (make_number (preferred_charset_id),
2630                                   charset_list)))
2631                 preferred_charset_id = -1;
2632               break;
2633             default:
2634               emacs_abort ();
2635             }
2636           charbuf += -c - 1;
2637           continue;
2638         }
2639
2640       if (ASCII_CHAR_P (c))
2641         EMIT_ONE_ASCII_BYTE (c);
2642       else if (CHAR_BYTE8_P (c))
2643         {
               /* Convert with CHAR_TO_BYTE8 and emit that single byte.  */
2644           c = CHAR_TO_BYTE8 (c);
2645           EMIT_ONE_BYTE (c);
2646         }
2647       else
2648         {
2649           struct charset *charset;
2650           unsigned code;
2651           int dimension;
2652           int emacs_mule_id;
2653           unsigned char leading_codes[2];
2654
               /* Use the preferred charset when CODING_CHAR_CHARSET_P
                  reports success in RESULT; otherwise look C up in
                  CHARSET_LIST.  */
2655           if (preferred_charset_id >= 0)
2656             {
2657               bool result;
2658
2659               charset = CHARSET_FROM_ID (preferred_charset_id);
2660               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2661               if (result)
2662                 code = ENCODE_CHAR (charset, c);
2663               else
2664                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2665                                      &code, charset);
2666             }
2667           else
2668             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669                                  &code, charset);
               /* No charset was found for C: substitute the coding
                  system's default character.  */
2670           if (! charset)
2671             {
2672               c = coding->default_char;
2673               if (ASCII_CHAR_P (c))
2674                 {
2675                   EMIT_ONE_ASCII_BYTE (c);
2676                   continue;
2677                 }
2678               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2679                                    &code, charset);
                   /* NOTE(review): CHARSET is used below without another
                      null check; presumably the default character is
                      always encodable in CHARSET_LIST -- confirm.  */
2680             }
2681           dimension = CHARSET_DIMENSION (charset);
2682           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2683           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2684           EMIT_ONE_BYTE (leading_codes[0]);
               /* leading_codes[1] is 0 when the charset has only one
                  leading code.  */
2685           if (leading_codes[1])
2686             EMIT_ONE_BYTE (leading_codes[1]);
               /* The code point follows with the high bit of each byte
                  set: one byte for dimension 1, two bytes otherwise.  */
2687           if (dimension == 1)
2688             EMIT_ONE_BYTE (code | 0x80);
2689           else
2690             {
2691               code |= 0x8080;
2692               EMIT_ONE_BYTE (code >> 8);
2693               EMIT_ONE_BYTE (code & 0xFF);
2694             }
2695         }
2696     }
2697   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2698   coding->produced_char += produced_chars;
2699   coding->produced = dst - coding->destination;
2700   return 0;
2701 }
2702
2703 \f
2704 /*** 7. ISO2022 handlers ***/
2705
2706 /* The following note describes the coding system ISO2022 briefly.
2707    Since the intention of this note is to help understand the
2708    functions in this file, some parts are NOT ACCURATE or are OVERLY
2709    SIMPLIFIED.  For thorough understanding, please refer to the
2710    original document of ISO2022.  This is equivalent to the standard
2711    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2712
2713    ISO2022 provides many mechanisms to encode several character sets
2714    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2715    is encoded using bytes less than 128.  This may make the encoded
2716    text a little bit longer, but the text passes more easily through
2717    several types of gateway, some of which strip off the MSB (Most
2718    Significant Bit).
2719
2720    There are two kinds of character sets: control character sets and
2721    graphic character sets.  The former contain control characters such
2722    as `newline' and `escape' to provide control functions (control
2723    functions are also provided by escape sequences).  The latter
2724    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2725    two control character sets and many graphic character sets.
2726
2727    Graphic character sets are classified into one of the following
2728    four classes, according to the number of bytes (DIMENSION) and
2729    number of characters in one dimension (CHARS) of the set:
2730    - DIMENSION1_CHARS94
2731    - DIMENSION1_CHARS96
2732    - DIMENSION2_CHARS94
2733    - DIMENSION2_CHARS96
2734
2735    In addition, each character set is assigned an identification tag,
2736    unique for each set, called the "final character" (denoted as <F>
2737    hereafter).  The <F> of each character set is decided by ECMA(*)
2738    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2739    (0x30..0x3F are for private use only).
2740
2741    Note (*): ECMA = European Computer Manufacturers Association
2742
2743    Here are examples of graphic character sets [NAME(<F>)]:
2744         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2745         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2746         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2747         o DIMENSION2_CHARS96 -- none for the moment
2748
2749    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2750         C0 [0x00..0x1F] -- control character plane 0
2751         GL [0x20..0x7F] -- graphic character plane 0
2752         C1 [0x80..0x9F] -- control character plane 1
2753         GR [0xA0..0xFF] -- graphic character plane 1
2754
2755    A control character set is directly designated and invoked to C0 or
2756    C1 by an escape sequence.  The most common case is that:
2757    - ISO646's  control character set is designated/invoked to C0, and
2758    - ISO6429's control character set is designated/invoked to C1,
2759    and usually these designations/invocations are omitted in encoded
2760    text.  In a 7-bit environment, only C0 can be used, and a control
2761    character for C1 is encoded by an appropriate escape sequence to
2762    fit into the environment.  All control characters for C1 are
2763    defined to have corresponding escape sequences.
2764
2765    A graphic character set is at first designated to one of four
2766    graphic registers (G0 through G3), then these graphic registers are
2767    invoked to GL or GR.  These designations and invocations can be
2768    done independently.  The most common case is that G0 is invoked to
2769    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2770    these invocations and designations are omitted in encoded text.
2771    In a 7-bit environment, only GL can be used.
2772
2773    When a graphic character set of CHARS94 is invoked to GL, codes
2774    0x20 and 0x7F of the GL area work as control characters SPACE and
2775    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2776    be used.
2777
2778    There are two ways of invocation: locking-shift and single-shift.
2779    With locking-shift, the invocation lasts until the next different
2780    invocation, whereas with single-shift, the invocation affects the
2781    following character only and doesn't affect the locking-shift
2782    state.  Invocations are done by the following control characters or
2783    escape sequences:
2784
2785    ----------------------------------------------------------------------
2786    abbrev  function                  cntrl escape seq   description
2787    ----------------------------------------------------------------------
2788    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2789    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2790    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2791    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2792    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2793    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2794    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2795    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2796    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2797    ----------------------------------------------------------------------
2798    (*) These are not used by any known coding system.
2799
2800    Control characters for these functions are defined by macros
2801    ISO_CODE_XXX in `coding.h'.
2802
2803    Designations are done by the following escape sequences:
2804    ----------------------------------------------------------------------
2805    escape sequence      description
2806    ----------------------------------------------------------------------
2807    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2808    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2809    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2810    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2811    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2812    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2813    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2814    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2815    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2816    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2817    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2818    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2819    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2820    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2821    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2822    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2823    ----------------------------------------------------------------------
2824
2825    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2826    of dimension 1, chars 94, and final character <F>, etc...
2827
2828    Note (*): Although these designations are not allowed in ISO2022,
2829    Emacs accepts them on decoding, and produces them on encoding
2830    CHARS96 character sets in a coding system which is characterized as
2831    7-bit environment, non-locking-shift, and non-single-shift.
2832
2833    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2834    '(' must be omitted.  We refer to this as "short-form" hereafter.
2835
2836    Now you may notice that there are a lot of ways of encoding the
2837    same multilingual text in ISO2022.  Actually, there exist many
2838    coding systems such as Compound Text (used in X11's inter client
2839    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2840    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2841    localized platforms), and all of these are variants of ISO2022.
2842
2843    In addition to the above, Emacs handles two more kinds of escape
2844    sequences: ISO6429's direction specification and Emacs' private
2845    sequence for specifying character composition.
2846
2847    ISO6429's direction specification takes the following form:
2848         o CSI ']'      -- end of the current direction
2849         o CSI '0' ']'  -- end of the current direction
2850         o CSI '1' ']'  -- start of left-to-right text
2851         o CSI '2' ']'  -- start of right-to-left text
2852    The control character CSI (0x9B: control sequence introducer) is
2853    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2854
2855    Character composition specification takes the following form:
2856         o ESC '0' -- start relative composition
2857         o ESC '1' -- end composition
2858         o ESC '2' -- start rule-base composition (*)
2859         o ESC '3' -- start relative composition with alternate chars  (**)
2860         o ESC '4' -- start rule-base composition with alternate chars  (**)
2861   Since these are not standard escape sequences of any ISO standard,
2862   the use of them with these meanings is restricted to Emacs only.
2863
2864   (*) This form is used only in Emacs 20.7 and older versions,
2865   but newer versions can safely decode it.
2866   (**) This form is used only in Emacs 21.1 and newer versions,
2867   and older versions can't decode it.
2868
2869   Here's a list of example usages of these composition escape
2870   sequences (categorized by `enum composition_method').
2871
2872   COMPOSITION_RELATIVE:
2873         ESC 0 CHAR [ CHAR ] ESC 1
2874   COMPOSITION_WITH_RULE:
2875         ESC 2 CHAR [ RULE CHAR ] ESC 1
2876   COMPOSITION_WITH_ALTCHARS:
2877         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2878   COMPOSITION_WITH_RULE_ALTCHARS:
2879         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2880
2881 static enum iso_code_class_type iso_code_class[256];
2882
2883 #define SAFE_CHARSET_P(coding, id)      \
2884   ((id) <= (coding)->max_charset_id     \
2885    && (coding)->safe_charsets[id] != 255)
2886
2887 static void
2888 setup_iso_safe_charsets (Lisp_Object attrs)
2889 {
2890   Lisp_Object charset_list, safe_charsets;
2891   Lisp_Object request;
2892   Lisp_Object reg_usage;
2893   Lisp_Object tail;
2894   EMACS_INT reg94, reg96;
2895   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2896   int max_charset_id;
2897
2898   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2899   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2900       && ! EQ (charset_list, Viso_2022_charset_list))
2901     {
2902       charset_list = Viso_2022_charset_list;
2903       ASET (attrs, coding_attr_charset_list, charset_list);
2904       ASET (attrs, coding_attr_safe_charsets, Qnil);
2905     }
2906
2907   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2908     return;
2909
2910   max_charset_id = 0;
2911   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2912     {
2913       int id = XINT (XCAR (tail));
2914       if (max_charset_id < id)
2915         max_charset_id = id;
2916     }
2917
2918   safe_charsets = make_uninit_string (max_charset_id + 1);
2919   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2920   request = AREF (attrs, coding_attr_iso_request);
2921   reg_usage = AREF (attrs, coding_attr_iso_usage);
2922   reg94 = XINT (XCAR (reg_usage));
2923   reg96 = XINT (XCDR (reg_usage));
2924
2925   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2926     {
2927       Lisp_Object id;
2928       Lisp_Object reg;
2929       struct charset *charset;
2930
2931       id = XCAR (tail);
2932       charset = CHARSET_FROM_ID (XINT (id));
2933       reg = Fcdr (Fassq (id, request));
2934       if (! NILP (reg))
2935         SSET (safe_charsets, XINT (id), XINT (reg));
2936       else if (charset->iso_chars_96)
2937         {
2938           if (reg96 < 4)
2939             SSET (safe_charsets, XINT (id), reg96);
2940         }
2941       else
2942         {
2943           if (reg94 < 4)
2944             SSET (safe_charsets, XINT (id), reg94);
2945         }
2946     }
2947   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2948 }
2949
2950
2951 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2952    Return true if a text is encoded in one of ISO-2022 based coding
2953    systems.

        CATEGORY_MASK_ISO is always added to DETECT_INFO->checked.  When
        the scan leaves through the `no_more_source' label, the categories
        ruled out are added to DETECT_INFO->rejected, those seen and not
        ruled out are added to DETECT_INFO->found, and 1 is returned.  When
        every ISO category has been ruled out before that, the whole of
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  */
2954
2955 static bool
2956 detect_coding_iso_2022 (struct coding_system *coding,
2957                         struct coding_detection_info *detect_info)
2958 {
2959   const unsigned char *src = coding->source, *src_base = src;
2960   const unsigned char *src_end = coding->source + coding->src_bytes;
2961   bool multibytep = coding->src_multibyte;
2962   bool single_shifting = 0;
2963   int id;
2964   int c, c1;
2965   ptrdiff_t consumed_chars = 0;
2966   int i;
       /* Masks of the ISO categories ruled out and seen so far.  */
2967   int rejected = 0;
2968   int found = 0;
       /* -1 outside a composition sequence; otherwise a count of the
          characters seen since the composition was started.  */
2969   int composition_count = -1;
2970
2971   detect_info->checked |= CATEGORY_MASK_ISO;
2972
       /* For every ISO category that has a coding system assigned,
          refresh max_charset_id and safe_charsets so that SAFE_CHARSET_P
          can be applied to it below.  The safe-charsets table is set up
          first for a full-support system whose charset list is not
          Viso_2022_charset_list.  */
2973   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2974     {
2975       struct coding_system *this = &(coding_categories[i]);
2976       Lisp_Object attrs, val;
2977
2978       if (this->id < 0)
2979         continue;
2980       attrs = CODING_ID_ATTRS (this->id);
2981       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2982           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2983         setup_iso_safe_charsets (attrs);
2984       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2985       this->max_charset_id = SCHARS (val) - 1;
2986       this->safe_charsets = SDATA (val);
2987     }
2988
2989   /* A coding system of this category is always ASCII compatible.  */
2990   src += coding->head_ascii;
2991
       /* Scan until every ISO category has been rejected.  Running out
          of source leaves the loop through `no_more_source' instead
          (presumably jumped to by ONE_MORE_BYTE -- confirm).  */
2992   while (rejected != CATEGORY_MASK_ISO)
2993     {
2994       src_base = src;
2995       ONE_MORE_BYTE (c);
2996       switch (c)
2997         {
2998         case ISO_CODE_ESC:
2999           if (inhibit_iso_escape_detection)
3000             break;
3001           single_shifting = 0;
3002           ONE_MORE_BYTE (c);
3003           if (c == 'N' || c == 'O')
3004             {
3005               /* ESC <Fe> for SS2 or SS3.  */
3006               single_shifting = 1;
3007               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3008             }
3009           else if (c == '1')
3010             {
3011               /* End of composition.  */
3012               if (composition_count < 0
3013                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3014                 /* Invalid */
3015                 break;
3016               composition_count = -1;
3017               found |= CATEGORY_MASK_ISO;
3018             }
3019           else if (c >= '0' && c <= '4')
3020             {
3021               /* ESC <Fp> for start/end composition.  */
                   /* '1' was handled above, so this starts a composition
                      and begins counting its characters.  */
3022               composition_count = 0;
3023             }
3024           else
3025             {
3026               if (c >= '(' && c <= '/')
3027                 {
3028                   /* Designation sequence for a charset of dimension 1.  */
                       /* The second table index is 0 for '('..'+' and 1
                          for ','..'/' (CHARS94 and CHARS96 designations in
                          the table of escape sequences above).  */
3029                   ONE_MORE_BYTE (c1);
3030                   if (c1 < ' ' || c1 >= 0x80
3031                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3032                     {
3033                       /* Invalid designation sequence.  Just ignore.  */
3034                       if (c1 >= 0x80)
3035                         rejected |= (CATEGORY_MASK_ISO_7BIT
3036                                      | CATEGORY_MASK_ISO_7_ELSE);
3037                       break;
3038                     }
3039                 }
3040               else if (c == '$')
3041                 {
3042                   /* Designation sequence for a charset of dimension 2.  */
3043                   ONE_MORE_BYTE (c);
3044                   if (c >= '@' && c <= 'B')
3045                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3046                     id = iso_charset_table[1][0][c];
3047                   else if (c >= '(' && c <= '/')
3048                     {
3049                       ONE_MORE_BYTE (c1);
3050                       if (c1 < ' ' || c1 >= 0x80
3051                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3052                         {
3053                           /* Invalid designation sequence.  Just ignore.  */
3054                           if (c1 >= 0x80)
3055                             rejected |= (CATEGORY_MASK_ISO_7BIT
3056                                          | CATEGORY_MASK_ISO_7_ELSE);
3057                           break;
3058                         }
3059                     }
3060                   else
3061                     {
3062                       /* Invalid designation sequence.  Just ignore it.  */
3063                       if (c >= 0x80)
3064                         rejected |= (CATEGORY_MASK_ISO_7BIT
3065                                      | CATEGORY_MASK_ISO_7_ELSE);
3066                       break;
3067                     }
3068                 }
3069               else
3070                 {
3071                   /* Invalid escape sequence.  Just ignore it.  */
3072                   if (c >= 0x80)
3073                     rejected |= (CATEGORY_MASK_ISO_7BIT
3074                                  | CATEGORY_MASK_ISO_7_ELSE);
3075                   break;
3076                 }
3077
3078               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found if the charset ID is safe for its coding
                      system, and rejected otherwise.  */
3079               rejected |= CATEGORY_MASK_ISO_8BIT;
3080               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3081                                   id))
3082                 found |= CATEGORY_MASK_ISO_7;
3083               else
3084                 rejected |= CATEGORY_MASK_ISO_7;
3085               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3086                                   id))
3087                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3088               else
3089                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3090               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3091                                   id))
3092                 found |= CATEGORY_MASK_ISO_7_ELSE;
3093               else
3094                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3095               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3096                                   id))
3097                 found |= CATEGORY_MASK_ISO_8_ELSE;
3098               else
3099                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3100             }
3101           break;
3102
3103         case ISO_CODE_SO:
3104         case ISO_CODE_SI:
3105           /* Locking shift out/in.  */
3106           if (inhibit_iso_escape_detection)
3107             break;
3108           single_shifting = 0;
3109           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3110           break;
3111
3112         case ISO_CODE_CSI:
3113           /* Control sequence introducer.  */
3114           single_shifting = 0;
3115           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3116           found |= CATEGORY_MASK_ISO_8_ELSE;
3117           goto check_extra_latin;
3118
3119         case ISO_CODE_SS2:
3120         case ISO_CODE_SS3:
3121           /* Single shift.   */
3122           if (inhibit_iso_escape_detection)
3123             break;
3124           single_shifting = 0;
3125           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3126           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3127               & CODING_ISO_FLAG_SINGLE_SHIFT)
3128             {
3129               found |= CATEGORY_MASK_ISO_8_1;
3130               single_shifting = 1;
3131             }
3132           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3133               & CODING_ISO_FLAG_SINGLE_SHIFT)
3134             {
3135               found |= CATEGORY_MASK_ISO_8_2;
3136               single_shifting = 1;
3137             }
               /* If neither 8-bit category uses single shifts, treat the
                  byte as a possible Latin extra code instead.  */
3138           if (single_shifting)
3139             break;
3140           goto check_extra_latin;
3141
3142         default:
               /* NOTE(review): a negative C is skipped without affecting
                  any category; what ONE_MORE_BYTE means by it is not
                  visible here -- confirm.  */
3143           if (c < 0)
3144             continue;
3145           if (c < 0x80)
3146             {
3147               if (composition_count >= 0)
3148                 composition_count++;
3149               single_shifting = 0;
3150               break;
3151             }
3152           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3153           if (c >= 0xA0)
3154             {
3155               found |= CATEGORY_MASK_ISO_8_1;
3156               /* Check the length of succeeding codes of the range
3157                  0xA0..0xFF.  If the byte length is even, we include
3158                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3159                  only when we are not single shifting.  */
3160               if (! single_shifting
3161                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3162                 {
3163                   ptrdiff_t len = 1;
3164                   while (src < src_end)
3165                     {
3166                       src_base = src;
3167                       ONE_MORE_BYTE (c);
3168                       if (c < 0xA0)
3169                         {
                               /* Put back the byte that ended the run.  */
3170                           src = src_base;
3171                           break;
3172                         }
3173                       len++;
3174                     }
3175
                       /* An odd-length run rejects the two-byte category
                          only if it ended before the end of the source;
                          a run that reaches the end of the source is
                          counted as found.  */
3176                   if (len & 1 && src < src_end)
3177                     {
3178                       rejected |= CATEGORY_MASK_ISO_8_2;
3179                       if (composition_count >= 0)
3180                         composition_count += len;
3181                     }
3182                   else
3183                     {
3184                       found |= CATEGORY_MASK_ISO_8_2;
3185                       if (composition_count >= 0)
3186                         composition_count += len / 2;
3187                     }
3188                 }
3189               break;
3190             }
               /* Reached by falling through for bytes 0x80..0x9F, and by
                  `goto' from the CSI and SS2/SS3 cases.  A byte with no
                  entry in Vlatin_extra_code_table rejects every ISO
                  category.  */
3191         check_extra_latin:
3192           if (! VECTORP (Vlatin_extra_code_table)
3193               || NILP (AREF (Vlatin_extra_code_table, c)))
3194             {
3195               rejected = CATEGORY_MASK_ISO;
3196               break;
3197             }
3198           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199               & CODING_ISO_FLAG_LATIN_EXTRA)
3200             found |= CATEGORY_MASK_ISO_8_1;
3201           else
3202             rejected |= CATEGORY_MASK_ISO_8_1;
3203           rejected |= CATEGORY_MASK_ISO_8_2;
3204           break;
3205         }
3206     }
       /* The loop ended because every ISO category was rejected.  */
3207   detect_info->rejected |= CATEGORY_MASK_ISO;
3208   return 0;
3209
3210  no_more_source:
       /* Report only those found categories that were not also
          rejected.  */
3211   detect_info->rejected |= rejected;
3212   detect_info->found |= (found & ~rejected);
3213   return 1;
3214 }
3215
3216
3217 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3218    escape sequence should be kept.

        The charset ID is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is outside '0'..127, if the table yields a
        negative ID, or if the charset fails SAFE_CHARSET_P for CODING,
        the designation of REG is set to -2, CHARS_96 is set to -1, and
        the rest of the macro is skipped.

        Otherwise the ID is stored as the designation of REG, after
        replacing charset_jisx0201_roman with charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 with
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        This macro refers to the variable `coding' of the expansion
        site.  CHARS_96 must be an assignable variable; FINAL and
        CHARS_96 are evaluated more than once.  */
3219 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3220   do {                                                                  \
3221     int id, prev;                                                       \
3222                                                                         \
3223     if (final < '0' || final >= 128                                     \
3224         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3225         || !SAFE_CHARSET_P (coding, id))                                \
3226       {                                                                 \
3227         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3228         chars_96 = -1;                                                  \
3229         break;                                                          \
3230       }                                                                 \
3231     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3232     if (id == charset_jisx0201_roman)                                   \
3233       {                                                                 \
3234         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3235           id = charset_ascii;                                           \
3236       }                                                                 \
3237     else if (id == charset_jisx0208_1978)                               \
3238       {                                                                 \
3239         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3240           id = charset_jisx0208;                                        \
3241       }                                                                 \
3242     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3243     /* If there was an invalid designation to REG previously, and this  \
3244        designation is ASCII to REG, we should keep this designation     \
3245        sequence.  */                                                    \
3246     if (prev == -2 && id == charset_ascii)                              \
3247       chars_96 = -1;                                                    \
3248   } while (0)
3249
3250
3251 /* Handle these composition sequences (ALT: alternate char):
3252
3253    (1) relative composition: ESC 0 CHAR ... ESC 1
3254    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3257
3258    When the start sequence (ESC 0/2/3/4) is found, this annotation
3259    header is produced.
3260
3261         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3262
3263    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264    produced until the end sequence (ESC 1) is found:
3265
3266    (1) CHAR ... CHAR
3267    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3270
3271    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3272    annotation header are updated as below:
3273
3274    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3276    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3277    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3278
3279    If an error is found while composing, the annotation header is
3280    changed to:
3281
3282         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3283
3284    and the sequence [ -2 DECODED-RULE ] is changed to the original
3285    byte sequence as below:
3286         o the original byte sequence is B: [ B -1 ]
3287         o the original byte sequence is B1 B2: [ B1 B2 ]
3288    and the sequence [ -1 -1 ] is changed to the original byte
3289    sequence:
3290         [ ESC '0' ]
3291 */
3292
3293 /* Decode a composition rule C1 and maybe one more byte from the
3294    source, and set RULE to the encoded composition rule.  If the rule
3295    is invalid, goto invalid_code.

        C1 is not a macro argument: the variable `c1' of the expansion
        site is read, and a label `invalid_code' must be in scope there.

        A value of C1 - 32 below 81 is the one-byte old format: it is
        split into GREF = value / 9 and NREF = value % 9, a reference of
        4 is remapped to 10, and RULE becomes
        COMPOSITION_ENCODE_RULE (GREF, NREF).  Otherwise it is the
        two-byte new format: GREF is C1 - 32 - 81, NREF is the next byte
        (fetched with ONE_MORE_BYTE) minus 32, the pair is checked with
        COMPOSITION_ENCODE_RULE_VALID, and 0x100 is added to the encoded
        rule so that ENCODE_COMPOSITION_RULE can tell the two formats
        apart.  */
3296
3297 #define DECODE_COMPOSITION_RULE(rule)                                   \
3298   do {                                                                  \
3299     rule = c1 - 32;                                                     \
3300     if (rule < 0)                                                       \
3301       goto invalid_code;                                                \
3302     if (rule < 81)              /* old format (before ver.21) */        \
3303       {                                                                 \
3304         int gref = (rule) / 9;                                          \
3305         int nref = (rule) % 9;                                          \
3306         if (gref == 4) gref = 10;                                       \
3307         if (nref == 4) nref = 10;                                       \
3308         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3309       }                                                                 \
3310     else                        /* new format (after ver.21) */         \
3311       {                                                                 \
3312         int b;                                                          \
3313                                                                         \
3314         ONE_MORE_BYTE (b);                                              \
3315         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3316           goto invalid_code;                                            \
3317         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3318         rule += 0x100;   /* Distinguish it from the old format.  */     \
3319       }                                                                 \
3320   } while (0)
3321
     /* Turn a decoded composition rule RULE back into its original
        byte form, storing the result in charbuf[idx] and
        charbuf[idx + 1].

        GREF and NREF are recovered as (RULE % 0x100) / 12 and
        (RULE % 0x100) % 12.  If RULE is below 0x100 (old format), a
        reference of 10 is mapped back to 4, the single byte
        32 + GREF * 9 + NREF is stored followed by -1, and new_chars is
        incremented.  Otherwise (new format) the two bytes 32 + 81 + GREF
        and 32 + NREF are stored and new_chars grows by two.

        This macro refers to the variables `charbuf', `idx' and
        `new_chars' of the expansion site.  RULE is evaluated several
        times, but only before the stores, so it may be
        charbuf[idx + 1] itself.  */
3322 #define ENCODE_COMPOSITION_RULE(rule)                           \
3323   do {                                                          \
3324     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3325                                                                 \
3326     if (rule < 0x100)           /* old format */                \
3327       {                                                         \
3328         if (gref == 10) gref = 4;                               \
3329         if (nref == 10) nref = 4;                               \
3330         charbuf[idx] = 32 + gref * 9 + nref;                    \
3331         charbuf[idx + 1] = -1;                                  \
3332         new_chars++;                                            \
3333       }                                                         \
3334     else                                /* new format */        \
3335       {                                                         \
3336         charbuf[idx] = 32 + 81 + gref;                          \
3337         charbuf[idx + 1] = 32 + nref;                           \
3338         new_chars += 2;                                         \
3339       }                                                         \
3340   } while (0)
3341
3342 /* Finish the current composition as invalid.

        CHARBUF must point just past the codes stored so far for the
        composition, so that its annotation header starts at
        CHARBUF[-CMP_STATUS->length].  The header is overwritten with
        the original start sequence (ESC followed by '0', '2', '3' or
        '4' according to CMP_STATUS->method) and the codes -2, 0, -1.
        For methods that compare >= COMPOSITION_WITH_RULE the stored
        codes are then scanned: each [ -2 DECODED-RULE ] pair is turned
        back into its original bytes by ENCODE_COMPOSITION_RULE, and
        each -1 -1 marker into ESC '0'.

        Set CMP_STATUS->state to COMPOSING_NO.  Return
        CMP_STATUS->nchars plus one for each old-format rule, two for
        each new-format rule and two for each restored ESC '0'.

        NOTE(review): the ESC and method codes restored in the header
        and the ALT components are not counted in the return value --
        confirm that this is intended.  */
3343
3344 static int
3345 finish_composition (int *charbuf, struct composition_status *cmp_status)
3346 {
3347   int idx = - cmp_status->length;
3348   int new_chars;
3349
3350   /* Recover the original ESC sequence over the annotation header.  */
3351   charbuf[idx++] = ISO_CODE_ESC;
3352   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3353                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3354                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3355                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3356                     : '4');
       /* Fill the remaining header slots (presumably skipped as
          annotation data by the consumer of charbuf -- TODO confirm).  */
3357   charbuf[idx++] = -2;
3358   charbuf[idx++] = 0;
3359   charbuf[idx++] = -1;
3360   new_chars = cmp_status->nchars;
       /* ENCODE_COMPOSITION_RULE refers to `charbuf', `idx' and
          `new_chars' by name, so these locals must keep their names.  */
3361   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3362     for (; idx < 0; idx++)
3363       {
3364         int elt = charbuf[idx];
3365
3366         if (elt == -2)
3367           {
                 /* A decoded rule follows; rewrite the pair in place.  */
3368             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3369             idx++;
3370           }
3371         else if (elt == -1)
3372           {
                 /* First half of a -1 -1 marker; restore ESC '0'.  */
3373             charbuf[idx++] = ISO_CODE_ESC;
3374             charbuf[idx] = '0';
3375             new_chars += 2;
3376           }
3377       }
3378   cmp_status->state = COMPOSING_NO;
3379   return new_chars;
3380 }
3381
3382 /* If characters are under composition, finish the composition as
        invalid by calling finish_composition, and add the count it
        returns to char_offset.  Do nothing when cmp_status->state is
        COMPOSING_NO.  This macro refers to the variables `cmp_status',
        `charbuf' and `char_offset' of the expansion site.  */
3383 #define MAYBE_FINISH_COMPOSITION()                              \
3384   do {                                                          \
3385     if (cmp_status->state != COMPOSING_NO)                      \
3386       char_offset += finish_composition (charbuf, cmp_status);  \
3387   } while (0)
3388
3389 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3390
3391    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3392    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3393    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3394    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3395
3396    When a new composition starts, produce this annotation header now:
3397
3398    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        C1 is the byte that followed ESC; it is evaluated several times.

        If C1 is '0' while the components of an ESC 3 or ESC 4
        composition are being read (state COMPOSING_COMPONENT_CHAR with
        method COMPOSITION_WITH_ALTCHARS, or state
        COMPOSING_COMPONENT_RULE with method
        COMPOSITION_WITH_RULE_ALTCHARS), the sequence only ends the
        components: the marker -1 -1 is stored, cmp_status->length grows
        by two and the state becomes COMPOSING_CHAR.

        Otherwise any composition still in progress is finished as
        invalid, the method is set from C1, the state becomes
        COMPOSING_CHAR for ESC 0 and ESC 2 and COMPOSING_COMPONENT_CHAR
        for ESC 3 and ESC 4, the header is stored with
        ADD_COMPOSITION_DATA, cmp_status->length is reset to
        MAX_ANNOTATION_LENGTH, the nchars and ncomps counters are
        cleared, and coding->annotated is set.

        This macro refers to the variables `cmp_status', `charbuf',
        `char_offset' and `coding' of the expansion site.
3399 */
3400
3401 #define DECODE_COMPOSITION_START(c1)                                       \
3402   do {                                                                     \
3403     if (c1 == '0'                                                          \
3404         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3405              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3406             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3407                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3408       {                                                                    \
3409         *charbuf++ = -1;                                                   \
3410         *charbuf++= -1;                                                    \
3411         cmp_status->state = COMPOSING_CHAR;                                \
3412         cmp_status->length += 2;                                           \
3413       }                                                                    \
3414     else                                                                   \
3415       {                                                                    \
3416         MAYBE_FINISH_COMPOSITION ();                                       \
3417         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3418                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3419                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3420                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3421         cmp_status->state                                                  \
3422           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3423         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3424         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3425         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3426         coding->annotated = 1;                                             \
3427       }                                                                    \
3428   } while (0)
3429
3430
3431 /* Handle composition end sequence ESC 1.

        The sequence is invalid if no composed character has been stored
        (cmp_status->nchars == 0), if a COMPOSITION_WITH_RULE
        composition is in state COMPOSING_CHAR, or if a composition of
        any other method is not in state COMPOSING_CHAR.  In that case
        the composition is finished as invalid and control goes to the
        label `invalid_code' of the expansion site.

        Otherwise the annotation header at charbuf[-cmp_status->length]
        is completed: for COMPOSITION_WITH_ALTCHARS the first (negated
        LENGTH) slot is decreased by ncomps + 2, for
        COMPOSITION_WITH_RULE_ALTCHARS by ncomps * 3; the slot at offset
        2 (NCHARS) is set to cmp_status->nchars, char_offset is advanced
        by the same amount, and the state becomes COMPOSING_NO.

        This macro refers to the variables `cmp_status', `charbuf' and
        `char_offset' of the expansion site.  */
3432
3433 #define DECODE_COMPOSITION_END()                                        \
3434   do {                                                                  \
3435     if (cmp_status->nchars == 0                                         \
3436         || ((cmp_status->state == COMPOSING_CHAR)                       \
3437             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3438       {                                                                 \
3439         MAYBE_FINISH_COMPOSITION ();                                    \
3440         goto invalid_code;                                              \
3441       }                                                                 \
3442     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3443       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3444     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3445       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3446     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3447     char_offset += cmp_status->nchars;                                  \
3448     cmp_status->state = COMPOSING_NO;                                   \
3449   } while (0)
3450
3451 /* Store a composition rule RULE in charbuf as the pair [ -2 RULE ],
        and update cmp_status: its length grows by two and its state is
        decremented (presumably stepping from a RULE state back to the
        CHAR state that precedes it in the enum, which is defined
        elsewhere -- TODO confirm).  This macro refers to the variables
        `charbuf' and `cmp_status' of the expansion site.  */
3452
3453 #define STORE_COMPOSITION_RULE(rule)    \
3454   do {                                  \
3455     *charbuf++ = -2;                    \
3456     *charbuf++ = rule;                  \
3457     cmp_status->length += 2;            \
3458     cmp_status->state--;                \
3459   } while (0)
3460
3461 /* Store a composed char or a component char C in charbuf, and update
3462    cmp_status.

        The length is incremented; nchars is incremented when the state
        is COMPOSING_CHAR, ncomps otherwise.  For COMPOSITION_WITH_RULE,
        and for COMPOSITION_WITH_RULE_ALTCHARS while in state
        COMPOSING_COMPONENT_CHAR, the state is then incremented
        (presumably to the matching RULE state, so that a rule is
        expected next -- the enum is defined elsewhere).  This macro
        refers to the variables `charbuf' and `cmp_status' of the
        expansion site.  */
3463
3464 #define STORE_COMPOSITION_CHAR(c)                                       \
3465   do {                                                                  \
3466     *charbuf++ = (c);                                                   \
3467     cmp_status->length++;                                               \
3468     if (cmp_status->state == COMPOSING_CHAR)                            \
3469       cmp_status->nchars++;                                             \
3470     else                                                                \
3471       cmp_status->ncomps++;                                             \
3472     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3473         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3474             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3475       cmp_status->state++;                                              \
3476   } while (0)
3477
3478
3479 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3480
3481 static void
3482 decode_coding_iso_2022 (struct coding_system *coding)
3483 {
3484   const unsigned char *src = coding->source + coding->consumed;
3485   const unsigned char *src_end = coding->source + coding->src_bytes;
3486   const unsigned char *src_base;
3487   int *charbuf = coding->charbuf + coding->charbuf_used;
3488   /* We may produce two annotations (charset and composition) in one
3489      loop and one more charset annotation at the end.  */
3490   int *charbuf_end
3491     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3492   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3493   bool multibytep = coding->src_multibyte;
3494   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3495   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3496   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3497   int charset_id_2, charset_id_3;
3498   struct charset *charset;
3499   int c;
3500   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3501   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3502   ptrdiff_t char_offset = coding->produced_char;
3503   ptrdiff_t last_offset = char_offset;
3504   int last_id = charset_ascii;
3505   bool eol_dos
3506     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3507   int byte_after_cr = -1;
3508   int i;
3509
3510   setup_iso_safe_charsets (attrs);
3511   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3512
3513   if (cmp_status->state != COMPOSING_NO)
3514     {
3515       if (charbuf_end - charbuf < cmp_status->length)
3516         emacs_abort ();
3517       for (i = 0; i < cmp_status->length; i++)
3518         *charbuf++ = cmp_status->carryover[i];
3519       coding->annotated = 1;
3520     }
3521
3522   while (1)
3523     {
3524       int c1, c2, c3;
3525
3526       src_base = src;
3527       consumed_chars_base = consumed_chars;
3528
3529       if (charbuf >= charbuf_end)
3530         {
3531           if (byte_after_cr >= 0)
3532             src_base--;
3533           break;
3534         }
3535
3536       if (byte_after_cr >= 0)
3537         c1 = byte_after_cr, byte_after_cr = -1;
3538       else
3539         ONE_MORE_BYTE (c1);
3540       if (c1 < 0)
3541         goto invalid_code;
3542
3543       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3544         {
3545           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546           char_offset++;
3547           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3548           continue;
3549         }
3550
3551       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3552         {
3553           if (c1 == ISO_CODE_ESC)
3554             {
3555               if (src + 1 >= src_end)
3556                 goto no_more_source;
3557               *charbuf++ = ISO_CODE_ESC;
3558               char_offset++;
3559               if (src[0] == '%' && src[1] == '@')
3560                 {
3561                   src += 2;
3562                   consumed_chars += 2;
3563                   char_offset += 2;
3564                   /* We are sure charbuf can contain two more chars. */
3565                   *charbuf++ = '%';
3566                   *charbuf++ = '@';
3567                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3568                 }
3569             }
3570           else
3571             {
3572               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3573               char_offset++;
3574             }
3575           continue;
3576         }
3577
3578       if ((cmp_status->state == COMPOSING_RULE
3579            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3580           && c1 != ISO_CODE_ESC)
3581         {
3582           int rule;
3583
3584           DECODE_COMPOSITION_RULE (rule);
3585           STORE_COMPOSITION_RULE (rule);
3586           continue;
3587         }
3588
3589       /* We produce at most one character.  */
3590       switch (iso_code_class [c1])
3591         {
3592         case ISO_0x20_or_0x7F:
3593           if (charset_id_0 < 0
3594               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3595             /* This is SPACE or DEL.  */
3596             charset = CHARSET_FROM_ID (charset_ascii);
3597           else
3598             charset = CHARSET_FROM_ID (charset_id_0);
3599           break;
3600
3601         case ISO_graphic_plane_0:
3602           if (charset_id_0 < 0)
3603             charset = CHARSET_FROM_ID (charset_ascii);
3604           else
3605             charset = CHARSET_FROM_ID (charset_id_0);
3606           break;
3607
3608         case ISO_0xA0_or_0xFF:
3609           if (charset_id_1 < 0
3610               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3611               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3612             goto invalid_code;
3613           /* This is a graphic character, we fall down ... */
3614           FALLTHROUGH;
3615         case ISO_graphic_plane_1:
3616           if (charset_id_1 < 0)
3617             goto invalid_code;
3618           charset = CHARSET_FROM_ID (charset_id_1);
3619           break;
3620
3621         case ISO_control_0:
3622           if (eol_dos && c1 == '\r')
3623             ONE_MORE_BYTE (byte_after_cr);
3624           MAYBE_FINISH_COMPOSITION ();
3625           charset = CHARSET_FROM_ID (charset_ascii);
3626           break;
3627
3628         case ISO_control_1:
3629           goto invalid_code;
3630
3631         case ISO_shift_out:
3632           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3633               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3634             goto invalid_code;
3635           CODING_ISO_INVOCATION (coding, 0) = 1;
3636           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3637           continue;
3638
3639         case ISO_shift_in:
3640           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3641             goto invalid_code;
3642           CODING_ISO_INVOCATION (coding, 0) = 0;
3643           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644           continue;
3645
3646         case ISO_single_shift_2_7:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3648             goto invalid_code;
3649           FALLTHROUGH;
3650         case ISO_single_shift_2:
3651           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3652             goto invalid_code;
3653           /* SS2 is handled as an escape sequence of ESC 'N' */
3654           c1 = 'N';
3655           goto label_escape_sequence;
3656
3657         case ISO_single_shift_3:
3658           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3659             goto invalid_code;
3660               /* SS3 is handled as an escape sequence of ESC 'O' */
3661           c1 = 'O';
3662           goto label_escape_sequence;
3663
3664         case ISO_control_sequence_introducer:
3665           /* CSI is handled as an escape sequence of ESC '[' ...  */
3666           c1 = '[';
3667           goto label_escape_sequence;
3668
3669         case ISO_escape:
3670           ONE_MORE_BYTE (c1);
3671         label_escape_sequence:
3672           /* Escape sequences handled here are invocation,
3673              designation, direction specification, and character
3674              composition specification.  */
3675           switch (c1)
3676             {
3677             case '&':           /* revision of following character set */
3678               ONE_MORE_BYTE (c1);
3679               if (!(c1 >= '@' && c1 <= '~'))
3680                 goto invalid_code;
3681               ONE_MORE_BYTE (c1);
3682               if (c1 != ISO_CODE_ESC)
3683                 goto invalid_code;
3684               ONE_MORE_BYTE (c1);
3685               goto label_escape_sequence;
3686
3687             case '$':           /* designation of 2-byte character set */
3688               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3689                 goto invalid_code;
3690               {
3691                 int reg, chars96;
3692
3693                 ONE_MORE_BYTE (c1);
3694                 if (c1 >= '@' && c1 <= 'B')
3695                   {     /* designation of JISX0208.1978, GB2312.1980,
3696                            or JISX0208.1980 */
3697                     reg = 0, chars96 = 0;
3698                   }
3699                 else if (c1 >= 0x28 && c1 <= 0x2B)
3700                   { /* designation of DIMENSION2_CHARS94 character set */
3701                     reg = c1 - 0x28, chars96 = 0;
3702                     ONE_MORE_BYTE (c1);
3703                   }
3704                 else if (c1 >= 0x2C && c1 <= 0x2F)
3705                   { /* designation of DIMENSION2_CHARS96 character set */
3706                     reg = c1 - 0x2C, chars96 = 1;
3707                     ONE_MORE_BYTE (c1);
3708                   }
3709                 else
3710                   goto invalid_code;
3711                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3712                 /* We must update these variables now.  */
3713                 if (reg == 0)
3714                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3715                 else if (reg == 1)
3716                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3717                 if (chars96 < 0)
3718                   goto invalid_code;
3719               }
3720               continue;
3721
3722             case 'n':           /* invocation of locking-shift-2 */
3723               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3724                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3725                 goto invalid_code;
3726               CODING_ISO_INVOCATION (coding, 0) = 2;
3727               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3728               continue;
3729
3730             case 'o':           /* invocation of locking-shift-3 */
3731               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3732                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3733                 goto invalid_code;
3734               CODING_ISO_INVOCATION (coding, 0) = 3;
3735               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3736               continue;
3737
3738             case 'N':           /* invocation of single-shift-2 */
3739               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3740                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3741                 goto invalid_code;
3742               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3743               if (charset_id_2 < 0)
3744                 charset = CHARSET_FROM_ID (charset_ascii);
3745               else
3746                 charset = CHARSET_FROM_ID (charset_id_2);
3747               ONE_MORE_BYTE (c1);
3748               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3749                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3750                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3751                           ? c1 >= 0x80 : c1 < 0x80)))
3752                 goto invalid_code;
3753               break;
3754
3755             case 'O':           /* invocation of single-shift-3 */
3756               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3757                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3758                 goto invalid_code;
3759               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3760               if (charset_id_3 < 0)
3761                 charset = CHARSET_FROM_ID (charset_ascii);
3762               else
3763                 charset = CHARSET_FROM_ID (charset_id_3);
3764               ONE_MORE_BYTE (c1);
3765               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3766                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3767                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3768                           ? c1 >= 0x80 : c1 < 0x80)))
3769                 goto invalid_code;
3770               break;
3771
3772             case '0': case '2': case '3': case '4': /* start composition */
3773               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3774                 goto invalid_code;
3775               if (last_id != charset_ascii)
3776                 {
3777                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3778                   last_id = charset_ascii;
3779                   last_offset = char_offset;
3780                 }
3781               DECODE_COMPOSITION_START (c1);
3782               continue;
3783
3784             case '1':           /* end composition */
3785               if (cmp_status->state == COMPOSING_NO)
3786                 goto invalid_code;
3787               DECODE_COMPOSITION_END ();
3788               continue;
3789
3790             case '[':           /* specification of direction */
3791               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3792                 goto invalid_code;
3793               /* For the moment, nested direction is not supported.
3794                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3795                  left-to-right, and nonzero means right-to-left.  */
3796               ONE_MORE_BYTE (c1);
3797               switch (c1)
3798                 {
3799                 case ']':       /* end of the current direction */
3800                   coding->mode &= ~CODING_MODE_DIRECTION;
3801                   break;
3802
3803                 case '0':       /* end of the current direction */
3804                 case '1':       /* start of left-to-right direction */
3805                   ONE_MORE_BYTE (c1);
3806                   if (c1 == ']')
3807                     coding->mode &= ~CODING_MODE_DIRECTION;
3808                   else
3809                     goto invalid_code;
3810                   break;
3811
3812                 case '2':       /* start of right-to-left direction */
3813                   ONE_MORE_BYTE (c1);
3814                   if (c1 == ']')
3815                     coding->mode |= CODING_MODE_DIRECTION;
3816                   else
3817                     goto invalid_code;
3818                   break;
3819
3820                 default:
3821                   goto invalid_code;
3822                 }
3823               continue;
3824
3825             case '%':
3826               ONE_MORE_BYTE (c1);
3827               if (c1 == '/')
3828                 {
3829                   /* CTEXT extended segment:
3830                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3831                      We keep these bytes as is for the moment.
3832                      They may be decoded by post-read-conversion.  */
3833                   int dim, M, L;
3834                   int size;
3835
3836                   ONE_MORE_BYTE (dim);
3837                   if (dim < '0' || dim > '4')
3838                     goto invalid_code;
3839                   ONE_MORE_BYTE (M);
3840                   if (M < 128)
3841                     goto invalid_code;
3842                   ONE_MORE_BYTE (L);
3843                   if (L < 128)
3844                     goto invalid_code;
3845                   size = ((M - 128) * 128) + (L - 128);
3846                   if (charbuf + 6 > charbuf_end)
3847                     goto break_loop;
3848                   *charbuf++ = ISO_CODE_ESC;
3849                   *charbuf++ = '%';
3850                   *charbuf++ = '/';
3851                   *charbuf++ = dim;
3852                   *charbuf++ = BYTE8_TO_CHAR (M);
3853                   *charbuf++ = BYTE8_TO_CHAR (L);
3854                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3855                 }
3856               else if (c1 == 'G')
3857                 {
3858                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3859                      ESC % G --UTF-8-BYTES-- ESC % @
3860                      We keep these bytes as is for the moment.
3861                      They may be decoded by post-read-conversion.  */
3862                   if (charbuf + 3 > charbuf_end)
3863                     goto break_loop;
3864                   *charbuf++ = ISO_CODE_ESC;
3865                   *charbuf++ = '%';
3866                   *charbuf++ = 'G';
3867                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3868                 }
3869               else
3870                 goto invalid_code;
3871               continue;
3872               break;
3873
3874             default:
3875               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3876                 goto invalid_code;
3877               {
3878                 int reg, chars96;
3879
3880                 if (c1 >= 0x28 && c1 <= 0x2B)
3881                   { /* designation of DIMENSION1_CHARS94 character set */
3882                     reg = c1 - 0x28, chars96 = 0;
3883                     ONE_MORE_BYTE (c1);
3884                   }
3885                 else if (c1 >= 0x2C && c1 <= 0x2F)
3886                   { /* designation of DIMENSION1_CHARS96 character set */
3887                     reg = c1 - 0x2C, chars96 = 1;
3888                     ONE_MORE_BYTE (c1);
3889                   }
3890                 else
3891                   goto invalid_code;
3892                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3893                 /* We must update these variables now.  */
3894                 if (reg == 0)
3895                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3896                 else if (reg == 1)
3897                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3898                 if (chars96 < 0)
3899                   goto invalid_code;
3900               }
3901               continue;
3902             }
3903           break;
3904
3905         default:
3906           emacs_abort ();
3907         }
3908
3909       if (cmp_status->state == COMPOSING_NO
3910           && charset->id != charset_ascii
3911           && last_id != charset->id)
3912         {
3913           if (last_id != charset_ascii)
3914             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3915           last_id = charset->id;
3916           last_offset = char_offset;
3917         }
3918
3919       /* Now we know CHARSET and 1st position code C1 of a character.
3920          Produce a decoded character while getting 2nd and 3rd
3921          position codes C2, C3 if necessary.  */
3922       if (CHARSET_DIMENSION (charset) > 1)
3923         {
3924           ONE_MORE_BYTE (c2);
3925           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3926               || ((c1 & 0x80) != (c2 & 0x80)))
3927             /* C2 is not in a valid range.  */
3928             goto invalid_code;
3929           if (CHARSET_DIMENSION (charset) == 2)
3930             c1 = (c1 << 8) | c2;
3931           else
3932             {
3933               ONE_MORE_BYTE (c3);
3934               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3935                   || ((c1 & 0x80) != (c3 & 0x80)))
3936                 /* C3 is not in a valid range.  */
3937                 goto invalid_code;
3938               c1 = (c1 << 16) | (c2 << 8) | c2;
3939             }
3940         }
3941       c1 &= 0x7F7F7F;
3942       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3943       if (c < 0)
3944         {
3945           MAYBE_FINISH_COMPOSITION ();
3946           for (; src_base < src; src_base++, char_offset++)
3947             {
3948               if (ASCII_CHAR_P (*src_base))
3949                 *charbuf++ = *src_base;
3950               else
3951                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3952             }
3953         }
3954       else if (cmp_status->state == COMPOSING_NO)
3955         {
3956           *charbuf++ = c;
3957           char_offset++;
3958         }
3959       else if ((cmp_status->state == COMPOSING_CHAR
3960                 ? cmp_status->nchars
3961                 : cmp_status->ncomps)
3962                >= MAX_COMPOSITION_COMPONENTS)
3963         {
3964           /* Too long composition.  */
3965           MAYBE_FINISH_COMPOSITION ();
3966           *charbuf++ = c;
3967           char_offset++;
3968         }
3969       else
3970         STORE_COMPOSITION_CHAR (c);
3971       continue;
3972
3973     invalid_code:
3974       MAYBE_FINISH_COMPOSITION ();
3975       src = src_base;
3976       consumed_chars = consumed_chars_base;
3977       ONE_MORE_BYTE (c);
3978       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3979       char_offset++;
3980       /* Reset the invocation and designation status to the safest
3981          one; i.e. designate ASCII to the graphic register 0, and
3982          invoke that register to the graphic plane 0.  This typically
3983          helps the case that a designation sequence for ASCII "ESC (
3984          B" is somehow broken (e.g. broken by a newline).  */
3985       CODING_ISO_INVOCATION (coding, 0) = 0;
3986       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3987       charset_id_0 = charset_ascii;
3988       continue;
3989
3990     break_loop:
3991       break;
3992     }
3993
3994  no_more_source:
3995   if (cmp_status->state != COMPOSING_NO)
3996     {
3997       if (coding->mode & CODING_MODE_LAST_BLOCK)
3998         MAYBE_FINISH_COMPOSITION ();
3999       else
4000         {
4001           charbuf -= cmp_status->length;
4002           for (i = 0; i < cmp_status->length; i++)
4003             cmp_status->carryover[i] = charbuf[i];
4004         }
4005     }
4006   else if (last_id != charset_ascii)
4007     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4008   coding->consumed_char += consumed_chars_base;
4009   coding->consumed = src_base - coding->source;
4010   coding->charbuf_used = charbuf - coding->charbuf;
4011 }
4012
4013
4014 /* ISO2022 encoding stuff.  */
4015
4016 /*
   Saying just "ISO2022" is not enough when encoding; we have to
   specify more details.  In Emacs, each coding system of an ISO2022
   variant has the following specifications:
4020         1. Initial designation to G0 thru G3.
4021         2. Allows short-form designation?
4022         3. ASCII should be designated to G0 before control characters?
4023         4. ASCII should be designated to G0 at end of line?
4024         5. 7-bit environment or 8-bit environment?
4025         6. Use locking-shift?
4026         7. Use Single-shift?
4027    And the following two are only for Japanese:
4028         8. Use ASCII in place of JIS0201-1976-Roman?
4029         9. Use JISX0208-1983 in place of JISX0208-1978?
4030    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4031    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4032    details.
4033 */
4034
/* Emit, at DST, the escape sequence that designates CHARSET to
   graphic register REG, advance DST, and record the designation in
   CODING.

   The sequence has the shape
     [ESC & <'@' + revision>] ESC [$] [<intermediate>] <final-char>
   The revision prefix is produced only when CODING has
   CODING_ISO_FLAG_REVISION set and CHARSET has a non-negative
   revision.  `$' is produced for charsets whose dimension is not 1.
   <intermediate> is taken from "()*+" (94-char sets) or ",-./"
   (96-char sets), indexed by REG.  For a multi-dimensional 94-char
   set whose <final-char> is '@', 'A', or 'B' and which goes to
   register 0, the intermediate is omitted (short form) unless CODING
   has CODING_ISO_FLAG_LONG_FORM set.

   The EMIT_* macros are defined elsewhere; the users of this macro
   in this file declare `dst', `produced_chars' and `multibytep' for
   them.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    /* Intermediate characters for registers 0..3.  */                  \
    const char *intermediates                                           \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int iso_revision                                                    \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (iso_revision >= 0)                                              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_revision);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Everything except the short-form case described above        \
           needs the intermediate character.  */                        \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || iso_final < '@' || iso_final > 'B')                      \
          EMIT_ONE_ASCII_BYTE (intermediates[reg]);                     \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4082
4083
4084 /* The following two macros produce codes (control character or escape
4085    sequence) for ISO2022 single-shift functions (single-shift-2 and
4086    single-shift-3).  */
4087
/* Single-shift-2: emit ISO_CODE_SS2 as one byte, or ESC N when
   CODING is flagged CODING_ISO_FLAG_SEVEN_BITS, and mark a single
   shift as pending in CODING.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4096
4097
/* Single-shift-3: emit ISO_CODE_SS3 as one byte, or ESC O when
   CODING is flagged CODING_ISO_FLAG_SEVEN_BITS, and mark a single
   shift as pending in CODING.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4106
4107
4108 /* The following four macros produce codes (control character or
4109    escape sequence) for ISO2022 locking-shift functions (shift-in,
4110    shift-out, locking-shift-2, and locking-shift-3).  */
4111
/* Shift-in: record graphic register 0 as invoked to graphic plane 0
   and emit ISO_CODE_SI.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4117
4118
/* Shift-out: record graphic register 1 as invoked to graphic plane 0
   and emit ISO_CODE_SO.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4124
4125
/* Locking-shift-2: record graphic register 2 as invoked to graphic
   plane 0 and emit ESC n.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4131
4132
/* Locking-shift-3 (LS3): emit the escape sequence that invokes
   graphic register 3 to graphic plane 0 and record that invocation
   in CODING.  ISO/IEC 2022 (ECMA-35) codes LS3 as ESC o; ESC n is
   LS2 and would make a decoder invoke register 2 instead.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4138
4139
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset whose ID is charset_jisx0201_roman.  C1 may be any
   expression; it is parenthesized wherever it is used.

   The loop repeats until one byte has been emitted.  A pending
   single shift is honored first (and then cleared); otherwise a
   charset invoked to graphic plane 0 is emitted as C1 & 0x7F and one
   invoked to graphic plane 1 as C1 | 0x80.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4182
4183
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, producing designation and invocation
   codes beforehand when CHARSET is not yet usable.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is charset_jisx0208, it
   is replaced by the charset whose ID is charset_jisx0208_1978.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int this_id = CHARSET_ID (charset);                                 \
                                                                        \
    if (this_id == charset_jisx0208                                     \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        this_id = charset_jisx0208_1978;                                \
        charset = CHARSET_FROM_ID (this_id);                            \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding)                           \
        && this_id != CODING_ISO_INVOKED_CHARSET (coding, 0)            \
        && this_id != CODING_ISO_INVOKED_CHARSET (coding, 1))           \
      {                                                                 \
        /* CHARSET is not invoked to either graphic plane yet:          \
           designate and/or invoke it, then go around again to          \
           produce the character itself.  */                            \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (this_id == CODING_ISO_INVOKED_CHARSET (coding, 0))         \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    break;                                                              \
  } while (1)
4226
4227
/* Encode character C of CHARSET: obtain its code with
   CODING_ENCODE_CHAR, then hand it to the DIMENSION1 emitter, or
   split it into a high part (code >> 8) and a low byte (code & 0xFF)
   for the DIMENSION2 emitter.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned iso_code;                                                     \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), iso_code);   \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), iso_code >> 8,           \
                                       iso_code & 0xFF);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);               \
  } while (0)
4238
4239
4240 /* Produce designation and invocation codes at a place pointed by DST
4241    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4242    Return new DST.  */
4243
4244 static unsigned char *
4245 encode_invocation_designation (struct charset *charset,
4246                                struct coding_system *coding,
4247                                unsigned char *dst, ptrdiff_t *p_nchars)
4248 {
4249   bool multibytep = coding->dst_multibyte;
4250   ptrdiff_t produced_chars = *p_nchars;
4251   int reg;                      /* graphic register number */
4252   int id = CHARSET_ID (charset);
4253
4254   /* At first, check designations.  */
4255   for (reg = 0; reg < 4; reg++)
4256     if (id == CODING_ISO_DESIGNATION (coding, reg))
4257       break;
4258
4259   if (reg >= 4)
4260     {
4261       /* CHARSET is not yet designated to any graphic registers.  */
4262       /* At first check the requested designation.  */
4263       reg = CODING_ISO_REQUEST (coding, id);
4264       if (reg < 0)
4265         /* Since CHARSET requests no special designation, designate it
4266            to graphic register 0.  */
4267         reg = 0;
4268
4269       ENCODE_DESIGNATION (charset, reg, coding);
4270     }
4271
4272   if (CODING_ISO_INVOCATION (coding, 0) != reg
4273       && CODING_ISO_INVOCATION (coding, 1) != reg)
4274     {
4275       /* Since the graphic register REG is not invoked to any graphic
4276          planes, invoke it to graphic plane 0.  */
4277       switch (reg)
4278         {
4279         case 0:                 /* graphic register 0 */
4280           ENCODE_SHIFT_IN;
4281           break;
4282
4283         case 1:                 /* graphic register 1 */
4284           ENCODE_SHIFT_OUT;
4285           break;
4286
4287         case 2:                 /* graphic register 2 */
4288           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4289             ENCODE_SINGLE_SHIFT_2;
4290           else
4291             ENCODE_LOCKING_SHIFT_2;
4292           break;
4293
4294         case 3:                 /* graphic register 3 */
4295           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4296             ENCODE_SINGLE_SHIFT_3;
4297           else
4298             ENCODE_LOCKING_SHIFT_3;
4299           break;
4300
4301         default:
4302           break;
4303         }
4304     }
4305
4306   *p_nchars = produced_chars;
4307   return dst;
4308 }
4309
4310
/* Bring the graphic planes and registers back to their initial
   state: shift in if graphic plane 0 does not have register 0
   invoked, then re-designate every register whose current
   designation differs from its (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int greg;                                                           \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (greg = 0; greg < 4; greg++)                                    \
      {                                                                 \
        struct charset *initial_charset;                                \
                                                                        \
        /* Skip registers with no initial charset and registers         \
           that already hold their initial charset.  */                 \
        if (CODING_ISO_INITIAL (coding, greg) < 0                       \
            || (CODING_ISO_DESIGNATION (coding, greg)                   \
                == CODING_ISO_INITIAL (coding, greg)))                  \
          continue;                                                     \
        initial_charset                                                 \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, greg));        \
        ENCODE_DESIGNATION (initial_charset, greg, coding);             \
      }                                                                 \
  } while (0)
4329
4330
4331 /* Produce designation sequences of charsets in the line started from
4332    CHARBUF to a place pointed by DST, and return the number of
4333    produced bytes.  DST should not directly point a buffer text area
4334    which may be relocated by char_charset call.
4335
4336    If the current block ends before any end-of-line, we may fail to
4337    find all the necessary designations.  */
4338
4339 static ptrdiff_t
4340 encode_designation_at_bol (struct coding_system *coding,
4341                            int *charbuf, int *charbuf_end,
4342                            unsigned char *dst)
4343 {
4344   unsigned char *orig = dst;
4345   struct charset *charset;
4346   /* Table of charsets to be designated to each graphic register.  */
4347   int r[4];
4348   int c, found = 0, reg;
4349   ptrdiff_t produced_chars = 0;
4350   bool multibytep = coding->dst_multibyte;
4351   Lisp_Object attrs;
4352   Lisp_Object charset_list;
4353
4354   attrs = CODING_ID_ATTRS (coding->id);
4355   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4356   if (EQ (charset_list, Qiso_2022))
4357     charset_list = Viso_2022_charset_list;
4358
4359   for (reg = 0; reg < 4; reg++)
4360     r[reg] = -1;
4361
4362   while (charbuf < charbuf_end && found < 4)
4363     {
4364       int id;
4365
4366       c = *charbuf++;
4367       if (c == '\n')
4368         break;
4369       charset = char_charset (c, charset_list, NULL);
4370       id = CHARSET_ID (charset);
4371       reg = CODING_ISO_REQUEST (coding, id);
4372       if (reg >= 0 && r[reg] < 0)
4373         {
4374           found++;
4375           r[reg] = id;
4376         }
4377     }
4378
4379   if (found)
4380     {
4381       for (reg = 0; reg < 4; reg++)
4382         if (r[reg] >= 0
4383             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4384           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4385     }
4386
4387   return dst - orig;
4388 }
4389
4390 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4391
4392 static bool
4393 encode_coding_iso_2022 (struct coding_system *coding)
4394 {
4395   bool multibytep = coding->dst_multibyte;
4396   int *charbuf = coding->charbuf;
4397   int *charbuf_end = charbuf + coding->charbuf_used;
4398   unsigned char *dst = coding->destination + coding->produced;
4399   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4400   int safe_room = 16;
4401   bool bol_designation
4402     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4403        && CODING_ISO_BOL (coding));
4404   ptrdiff_t produced_chars = 0;
4405   Lisp_Object attrs, eol_type, charset_list;
4406   bool ascii_compatible;
4407   int c;
4408   int preferred_charset_id = -1;
4409
4410   CODING_GET_INFO (coding, attrs, charset_list);
4411   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4412   if (VECTORP (eol_type))
4413     eol_type = Qunix;
4414
4415   setup_iso_safe_charsets (attrs);
4416   /* Charset list may have been changed.  */
4417   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4418   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4419
4420   ascii_compatible
4421     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4422        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4423                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4424
4425   while (charbuf < charbuf_end)
4426     {
4427       ASSURE_DESTINATION (safe_room);
4428
4429       if (bol_designation)
4430         {
4431           /* We have to produce designation sequences if any now.  */
4432           unsigned char desig_buf[16];
4433           ptrdiff_t nbytes;
4434           ptrdiff_t offset;
4435
4436           charset_map_loaded = 0;
4437           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4438                                               desig_buf);
4439           if (charset_map_loaded
4440               && (offset = coding_change_destination (coding)))
4441             {
4442               dst += offset;
4443               dst_end += offset;
4444             }
4445           memcpy (dst, desig_buf, nbytes);
4446           dst += nbytes;
4447           /* We are sure that designation sequences are all ASCII bytes.  */
4448           produced_chars += nbytes;
4449           bol_designation = 0;
4450           ASSURE_DESTINATION (safe_room);
4451         }
4452
4453       c = *charbuf++;
4454
4455       if (c < 0)
4456         {
4457           /* Handle an annotation.  */
4458           switch (*charbuf)
4459             {
4460             case CODING_ANNOTATE_COMPOSITION_MASK:
4461               /* Not yet implemented.  */
4462               break;
4463             case CODING_ANNOTATE_CHARSET_MASK:
4464               preferred_charset_id = charbuf[2];
4465               if (preferred_charset_id >= 0
4466                   && NILP (Fmemq (make_number (preferred_charset_id),
4467                                   charset_list)))
4468                 preferred_charset_id = -1;
4469               break;
4470             default:
4471               emacs_abort ();
4472             }
4473           charbuf += -c - 1;
4474           continue;
4475         }
4476
4477       /* Now encode the character C.  */
4478       if (c < 0x20 || c == 0x7F)
4479         {
4480           if (c == '\n'
4481               || (c == '\r' && EQ (eol_type, Qmac)))
4482             {
4483               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4484                 ENCODE_RESET_PLANE_AND_REGISTER ();
4485               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4486                 {
4487                   int i;
4488
4489                   for (i = 0; i < 4; i++)
4490                     CODING_ISO_DESIGNATION (coding, i)
4491                       = CODING_ISO_INITIAL (coding, i);
4492                 }
4493               bol_designation = ((CODING_ISO_FLAGS (coding)
4494                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4495                                  != 0);
4496             }
4497           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4498             ENCODE_RESET_PLANE_AND_REGISTER ();
4499           EMIT_ONE_ASCII_BYTE (c);
4500         }
4501       else if (ASCII_CHAR_P (c))
4502         {
4503           if (ascii_compatible)
4504             EMIT_ONE_ASCII_BYTE (c);
4505           else
4506             {
4507               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4508               ENCODE_ISO_CHARACTER (charset, c);
4509             }
4510         }
4511       else if (CHAR_BYTE8_P (c))
4512         {
4513           c = CHAR_TO_BYTE8 (c);
4514           EMIT_ONE_BYTE (c);
4515         }
4516       else
4517         {
4518           struct charset *charset;
4519
4520           if (preferred_charset_id >= 0)
4521             {
4522               bool result;
4523
4524               charset = CHARSET_FROM_ID (preferred_charset_id);
4525               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4526               if (! result)
4527                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4528                                      NULL, charset);
4529             }
4530           else
4531             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4532                                  NULL, charset);
4533           if (!charset)
4534             {
4535               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4536                 {
4537                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4538                   charset = CHARSET_FROM_ID (charset_ascii);
4539                 }
4540               else
4541                 {
4542                   c = coding->default_char;
4543                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4544                                        charset_list, NULL, charset);
4545                 }
4546             }
4547           ENCODE_ISO_CHARACTER (charset, c);
4548         }
4549     }
4550
4551   if (coding->mode & CODING_MODE_LAST_BLOCK
4552       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4553     {
4554       ASSURE_DESTINATION (safe_room);
4555       ENCODE_RESET_PLANE_AND_REGISTER ();
4556     }
4557   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4558   CODING_ISO_BOL (coding) = bol_designation;
4559   coding->produced_char += produced_chars;
4560   coding->produced = dst - coding->destination;
4561   return 0;
4562 }
4563
4564 \f
/*** 8. SJIS and BIG5 handlers ***/
4566
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
4570
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   by "position-code + 0x80".  A character of charset
   japanese-jisx0208 is encoded in two bytes, but the two
   position-codes are divided and shifted so that they fit in the
   ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA0 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
4587
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   Big5 (1st byte)      0xA1 .. 0xFE
        (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

  */
4600
4601 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4602    Return true if a text is encoded in SJIS.  */
4603
4604 static bool
4605 detect_coding_sjis (struct coding_system *coding,
4606                     struct coding_detection_info *detect_info)
4607 {
4608   const unsigned char *src = coding->source, *src_base;
4609   const unsigned char *src_end = coding->source + coding->src_bytes;
4610   bool multibytep = coding->src_multibyte;
4611   ptrdiff_t consumed_chars = 0;
4612   int found = 0;
4613   int c;
4614   Lisp_Object attrs, charset_list;
4615   int max_first_byte_of_2_byte_code;
4616
4617   CODING_GET_INFO (coding, attrs, charset_list);
4618   max_first_byte_of_2_byte_code
4619     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4620
4621   detect_info->checked |= CATEGORY_MASK_SJIS;
4622   /* A coding system of this category is always ASCII compatible.  */
4623   src += coding->head_ascii;
4624
4625   while (1)
4626     {
4627       src_base = src;
4628       ONE_MORE_BYTE (c);
4629       if (c < 0x80)
4630         continue;
4631       if ((c >= 0x81 && c <= 0x9F)
4632           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4633         {
4634           ONE_MORE_BYTE (c);
4635           if (c < 0x40 || c == 0x7F || c > 0xFC)
4636             break;
4637           found = CATEGORY_MASK_SJIS;
4638         }
4639       else if (c >= 0xA0 && c < 0xE0)
4640         found = CATEGORY_MASK_SJIS;
4641       else
4642         break;
4643     }
4644   detect_info->rejected |= CATEGORY_MASK_SJIS;
4645   return 0;
4646
4647  no_more_source:
4648   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4649     {
4650       detect_info->rejected |= CATEGORY_MASK_SJIS;
4651       return 0;
4652     }
4653   detect_info->found |= found;
4654   return 1;
4655 }
4656
4657 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4658    Return true if a text is encoded in BIG5.  */
4659
4660 static bool
4661 detect_coding_big5 (struct coding_system *coding,
4662                     struct coding_detection_info *detect_info)
4663 {
4664   const unsigned char *src = coding->source, *src_base;
4665   const unsigned char *src_end = coding->source + coding->src_bytes;
4666   bool multibytep = coding->src_multibyte;
4667   ptrdiff_t consumed_chars = 0;
4668   int found = 0;
4669   int c;
4670
4671   detect_info->checked |= CATEGORY_MASK_BIG5;
4672   /* A coding system of this category is always ASCII compatible.  */
4673   src += coding->head_ascii;
4674
4675   while (1)
4676     {
4677       src_base = src;
4678       ONE_MORE_BYTE (c);
4679       if (c < 0x80)
4680         continue;
4681       if (c >= 0xA1)
4682         {
4683           ONE_MORE_BYTE (c);
4684           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4685             return 0;
4686           found = CATEGORY_MASK_BIG5;
4687         }
4688       else
4689         break;
4690     }
4691   detect_info->rejected |= CATEGORY_MASK_BIG5;
4692   return 0;
4693
4694  no_more_source:
4695   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4696     {
4697       detect_info->rejected |= CATEGORY_MASK_BIG5;
4698       return 0;
4699     }
4700   detect_info->found |= found;
4701   return 1;
4702 }
4703
4704 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4705
4706 static void
4707 decode_coding_sjis (struct coding_system *coding)
4708 {
4709   const unsigned char *src = coding->source + coding->consumed;
4710   const unsigned char *src_end = coding->source + coding->src_bytes;
4711   const unsigned char *src_base;
4712   int *charbuf = coding->charbuf + coding->charbuf_used;
4713   /* We may produce one charset annotation in one loop and one more at
4714      the end.  */
4715   int *charbuf_end
4716     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4717   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4718   bool multibytep = coding->src_multibyte;
4719   struct charset *charset_roman, *charset_kanji, *charset_kana;
4720   struct charset *charset_kanji2;
4721   Lisp_Object attrs, charset_list, val;
4722   ptrdiff_t char_offset = coding->produced_char;
4723   ptrdiff_t last_offset = char_offset;
4724   int last_id = charset_ascii;
4725   bool eol_dos
4726     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4727   int byte_after_cr = -1;
4728
4729   CODING_GET_INFO (coding, attrs, charset_list);
4730
4731   val = charset_list;
4732   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4733   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4734   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4735   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4736
4737   while (1)
4738     {
4739       int c, c1;
4740       struct charset *charset;
4741
4742       src_base = src;
4743       consumed_chars_base = consumed_chars;
4744
4745       if (charbuf >= charbuf_end)
4746         {
4747           if (byte_after_cr >= 0)
4748             src_base--;
4749           break;
4750         }
4751
4752       if (byte_after_cr >= 0)
4753         c = byte_after_cr, byte_after_cr = -1;
4754       else
4755         ONE_MORE_BYTE (c);
4756       if (c < 0)
4757         goto invalid_code;
4758       if (c < 0x80)
4759         {
4760           if (eol_dos && c == '\r')
4761             ONE_MORE_BYTE (byte_after_cr);
4762           charset = charset_roman;
4763         }
4764       else if (c == 0x80 || c == 0xA0)
4765         goto invalid_code;
4766       else if (c >= 0xA1 && c <= 0xDF)
4767         {
4768           /* SJIS -> JISX0201-Kana */
4769           c &= 0x7F;
4770           charset = charset_kana;
4771         }
4772       else if (c <= 0xEF)
4773         {
4774           /* SJIS -> JISX0208 */
4775           ONE_MORE_BYTE (c1);
4776           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4777             goto invalid_code;
4778           c = (c << 8) | c1;
4779           SJIS_TO_JIS (c);
4780           charset = charset_kanji;
4781         }
4782       else if (c <= 0xFC && charset_kanji2)
4783         {
4784           /* SJIS -> JISX0213-2 */
4785           ONE_MORE_BYTE (c1);
4786           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4787             goto invalid_code;
4788           c = (c << 8) | c1;
4789           SJIS_TO_JIS2 (c);
4790           charset = charset_kanji2;
4791         }
4792       else
4793         goto invalid_code;
4794       if (charset->id != charset_ascii
4795           && last_id != charset->id)
4796         {
4797           if (last_id != charset_ascii)
4798             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4799           last_id = charset->id;
4800           last_offset = char_offset;
4801         }
4802       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4803       *charbuf++ = c;
4804       char_offset++;
4805       continue;
4806
4807     invalid_code:
4808       src = src_base;
4809       consumed_chars = consumed_chars_base;
4810       ONE_MORE_BYTE (c);
4811       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4812       char_offset++;
4813     }
4814
4815  no_more_source:
4816   if (last_id != charset_ascii)
4817     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4818   coding->consumed_char += consumed_chars_base;
4819   coding->consumed = src_base - coding->source;
4820   coding->charbuf_used = charbuf - coding->charbuf;
4821 }
4822
4823 static void
4824 decode_coding_big5 (struct coding_system *coding)
4825 {
4826   const unsigned char *src = coding->source + coding->consumed;
4827   const unsigned char *src_end = coding->source + coding->src_bytes;
4828   const unsigned char *src_base;
4829   int *charbuf = coding->charbuf + coding->charbuf_used;
4830   /* We may produce one charset annotation in one loop and one more at
4831      the end.  */
4832   int *charbuf_end
4833     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4834   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4835   bool multibytep = coding->src_multibyte;
4836   struct charset *charset_roman, *charset_big5;
4837   Lisp_Object attrs, charset_list, val;
4838   ptrdiff_t char_offset = coding->produced_char;
4839   ptrdiff_t last_offset = char_offset;
4840   int last_id = charset_ascii;
4841   bool eol_dos
4842     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4843   int byte_after_cr = -1;
4844
4845   CODING_GET_INFO (coding, attrs, charset_list);
4846   val = charset_list;
4847   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4848   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4849
4850   while (1)
4851     {
4852       int c, c1;
4853       struct charset *charset;
4854
4855       src_base = src;
4856       consumed_chars_base = consumed_chars;
4857
4858       if (charbuf >= charbuf_end)
4859         {
4860           if (byte_after_cr >= 0)
4861             src_base--;
4862           break;
4863         }
4864
4865       if (byte_after_cr >= 0)
4866         c = byte_after_cr, byte_after_cr = -1;
4867       else
4868         ONE_MORE_BYTE (c);
4869
4870       if (c < 0)
4871         goto invalid_code;
4872       if (c < 0x80)
4873         {
4874           if (eol_dos && c == '\r')
4875             ONE_MORE_BYTE (byte_after_cr);
4876           charset = charset_roman;
4877         }
4878       else
4879         {
4880           /* BIG5 -> Big5 */
4881           if (c < 0xA1 || c > 0xFE)
4882             goto invalid_code;
4883           ONE_MORE_BYTE (c1);
4884           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4885             goto invalid_code;
4886           c = c << 8 | c1;
4887           charset = charset_big5;
4888         }
4889       if (charset->id != charset_ascii
4890           && last_id != charset->id)
4891         {
4892           if (last_id != charset_ascii)
4893             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4894           last_id = charset->id;
4895           last_offset = char_offset;
4896         }
4897       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4898       *charbuf++ = c;
4899       char_offset++;
4900       continue;
4901
4902     invalid_code:
4903       src = src_base;
4904       consumed_chars = consumed_chars_base;
4905       ONE_MORE_BYTE (c);
4906       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4907       char_offset++;
4908     }
4909
4910  no_more_source:
4911   if (last_id != charset_ascii)
4912     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4913   coding->consumed_char += consumed_chars_base;
4914   coding->consumed = src_base - coding->source;
4915   coding->charbuf_used = charbuf - coding->charbuf;
4916 }
4917
4918 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4919    This function can encode charsets `ascii', `katakana-jisx0201',
4920    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4921    are sure that all these charsets are registered as official charset
4922    (i.e. do not have extended leading-codes).  Characters of other
4923    charsets are produced without any encoding.  */
4924
4925 static bool
4926 encode_coding_sjis (struct coding_system *coding)
4927 {
4928   bool multibytep = coding->dst_multibyte;
4929   int *charbuf = coding->charbuf;
4930   int *charbuf_end = charbuf + coding->charbuf_used;
4931   unsigned char *dst = coding->destination + coding->produced;
4932   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4933   int safe_room = 4;
4934   ptrdiff_t produced_chars = 0;
4935   Lisp_Object attrs, charset_list, val;
4936   bool ascii_compatible;
4937   struct charset *charset_kanji, *charset_kana;
4938   struct charset *charset_kanji2;
4939   int c;
4940
4941   CODING_GET_INFO (coding, attrs, charset_list);
4942   val = XCDR (charset_list);
4943   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4944   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4945   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4946
4947   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4948
4949   while (charbuf < charbuf_end)
4950     {
4951       ASSURE_DESTINATION (safe_room);
4952       c = *charbuf++;
4953       /* Now encode the character C.  */
4954       if (ASCII_CHAR_P (c) && ascii_compatible)
4955         EMIT_ONE_ASCII_BYTE (c);
4956       else if (CHAR_BYTE8_P (c))
4957         {
4958           c = CHAR_TO_BYTE8 (c);
4959           EMIT_ONE_BYTE (c);
4960         }
4961       else
4962         {
4963           unsigned code;
4964           struct charset *charset;
4965           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4966                                &code, charset);
4967
4968           if (!charset)
4969             {
4970               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4971                 {
4972                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4973                   charset = CHARSET_FROM_ID (charset_ascii);
4974                 }
4975               else
4976                 {
4977                   c = coding->default_char;
4978                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4979                                        charset_list, &code, charset);
4980                 }
4981             }
4982           if (code == CHARSET_INVALID_CODE (charset))
4983             emacs_abort ();
4984           if (charset == charset_kanji)
4985             {
4986               int c1, c2;
4987               JIS_TO_SJIS (code);
4988               c1 = code >> 8, c2 = code & 0xFF;
4989               EMIT_TWO_BYTES (c1, c2);
4990             }
4991           else if (charset == charset_kana)
4992             EMIT_ONE_BYTE (code | 0x80);
4993           else if (charset_kanji2 && charset == charset_kanji2)
4994             {
4995               int c1, c2;
4996
4997               c1 = code >> 8;
4998               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4999                   || c1 == 0x28
5000                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5001                 {
5002                   JIS_TO_SJIS2 (code);
5003                   c1 = code >> 8, c2 = code & 0xFF;
5004                   EMIT_TWO_BYTES (c1, c2);
5005                 }
5006               else
5007                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5008             }
5009           else
5010             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5011         }
5012     }
5013   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5014   coding->produced_char += produced_chars;
5015   coding->produced = dst - coding->destination;
5016   return 0;
5017 }
5018
5019 static bool
5020 encode_coding_big5 (struct coding_system *coding)
5021 {
5022   bool multibytep = coding->dst_multibyte;
5023   int *charbuf = coding->charbuf;
5024   int *charbuf_end = charbuf + coding->charbuf_used;
5025   unsigned char *dst = coding->destination + coding->produced;
5026   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5027   int safe_room = 4;
5028   ptrdiff_t produced_chars = 0;
5029   Lisp_Object attrs, charset_list, val;
5030   bool ascii_compatible;
5031   struct charset *charset_big5;
5032   int c;
5033
5034   CODING_GET_INFO (coding, attrs, charset_list);
5035   val = XCDR (charset_list);
5036   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5037   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5038
5039   while (charbuf < charbuf_end)
5040     {
5041       ASSURE_DESTINATION (safe_room);
5042       c = *charbuf++;
5043       /* Now encode the character C.  */
5044       if (ASCII_CHAR_P (c) && ascii_compatible)
5045         EMIT_ONE_ASCII_BYTE (c);
5046       else if (CHAR_BYTE8_P (c))
5047         {
5048           c = CHAR_TO_BYTE8 (c);
5049           EMIT_ONE_BYTE (c);
5050         }
5051       else
5052         {
5053           unsigned code;
5054           struct charset *charset;
5055           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5056                                &code, charset);
5057
5058           if (! charset)
5059             {
5060               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5061                 {
5062                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5063                   charset = CHARSET_FROM_ID (charset_ascii);
5064                 }
5065               else
5066                 {
5067                   c = coding->default_char;
5068                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5069                                        charset_list, &code, charset);
5070                 }
5071             }
5072           if (code == CHARSET_INVALID_CODE (charset))
5073             emacs_abort ();
5074           if (charset == charset_big5)
5075             {
5076               int c1, c2;
5077
5078               c1 = code >> 8, c2 = code & 0xFF;
5079               EMIT_TWO_BYTES (c1, c2);
5080             }
5081           else
5082             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5083         }
5084     }
5085   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5086   coding->produced_char += produced_chars;
5087   coding->produced = dst - coding->destination;
5088   return 0;
5089 }
5090
5091 \f
/*** 9. CCL handlers ***/
5093
5094 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5095    Return true if a text is encoded in a coding system of which
5096    encoder/decoder are written in CCL program.  */
5097
5098 static bool
5099 detect_coding_ccl (struct coding_system *coding,
5100                    struct coding_detection_info *detect_info)
5101 {
5102   const unsigned char *src = coding->source, *src_base;
5103   const unsigned char *src_end = coding->source + coding->src_bytes;
5104   bool multibytep = coding->src_multibyte;
5105   ptrdiff_t consumed_chars = 0;
5106   int found = 0;
5107   unsigned char *valids;
5108   ptrdiff_t head_ascii = coding->head_ascii;
5109   Lisp_Object attrs;
5110
5111   detect_info->checked |= CATEGORY_MASK_CCL;
5112
5113   coding = &coding_categories[coding_category_ccl];
5114   valids = CODING_CCL_VALIDS (coding);
5115   attrs = CODING_ID_ATTRS (coding->id);
5116   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5117     src += head_ascii;
5118
5119   while (1)
5120     {
5121       int c;
5122
5123       src_base = src;
5124       ONE_MORE_BYTE (c);
5125       if (c < 0 || ! valids[c])
5126         break;
5127       if ((valids[c] > 1))
5128         found = CATEGORY_MASK_CCL;
5129     }
5130   detect_info->rejected |= CATEGORY_MASK_CCL;
5131   return 0;
5132
5133  no_more_source:
5134   detect_info->found |= found;
5135   return 1;
5136 }
5137
5138 static void
5139 decode_coding_ccl (struct coding_system *coding)
5140 {
5141   const unsigned char *src = coding->source + coding->consumed;
5142   const unsigned char *src_end = coding->source + coding->src_bytes;
5143   int *charbuf = coding->charbuf + coding->charbuf_used;
5144   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5145   ptrdiff_t consumed_chars = 0;
5146   bool multibytep = coding->src_multibyte;
5147   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5148   int source_charbuf[1024];
5149   int source_byteidx[1025];
5150   Lisp_Object attrs, charset_list;
5151
5152   CODING_GET_INFO (coding, attrs, charset_list);
5153
5154   while (1)
5155     {
5156       const unsigned char *p = src;
5157       ptrdiff_t offset;
5158       int i = 0;
5159
5160       if (multibytep)
5161         {
5162           while (i < 1024 && p < src_end)
5163             {
5164               source_byteidx[i] = p - src;
5165               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5166             }
5167           source_byteidx[i] = p - src;
5168         }
5169       else
5170         while (i < 1024 && p < src_end)
5171           source_charbuf[i++] = *p++;
5172
5173       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5174         ccl->last_block = true;
5175       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5176       charset_map_loaded = 0;
5177       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5178                   charset_list);
5179       if (charset_map_loaded
5180           && (offset = coding_change_source (coding)))
5181         {
5182           p += offset;
5183           src += offset;
5184           src_end += offset;
5185         }
5186       charbuf += ccl->produced;
5187       if (multibytep)
5188         src += source_byteidx[ccl->consumed];
5189       else
5190         src += ccl->consumed;
5191       consumed_chars += ccl->consumed;
5192       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5193         break;
5194     }
5195
5196   switch (ccl->status)
5197     {
5198     case CCL_STAT_SUSPEND_BY_SRC:
5199       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5200       break;
5201     case CCL_STAT_SUSPEND_BY_DST:
5202       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5203       break;
5204     case CCL_STAT_QUIT:
5205     case CCL_STAT_INVALID_CMD:
5206       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5207       break;
5208     default:
5209       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5210       break;
5211     }
5212   coding->consumed_char += consumed_chars;
5213   coding->consumed = src - coding->source;
5214   coding->charbuf_used = charbuf - coding->charbuf;
5215 }
5216
5217 static bool
5218 encode_coding_ccl (struct coding_system *coding)
5219 {
5220   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5221   bool multibytep = coding->dst_multibyte;
5222   int *charbuf = coding->charbuf;
5223   int *charbuf_end = charbuf + coding->charbuf_used;
5224   unsigned char *dst = coding->destination + coding->produced;
5225   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5226   int destination_charbuf[1024];
5227   ptrdiff_t produced_chars = 0;
5228   int i;
5229   Lisp_Object attrs, charset_list;
5230
5231   CODING_GET_INFO (coding, attrs, charset_list);
5232   if (coding->consumed_char == coding->src_chars
5233       && coding->mode & CODING_MODE_LAST_BLOCK)
5234     ccl->last_block = true;
5235
5236   do
5237     {
5238       ptrdiff_t offset;
5239
5240       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5241       charset_map_loaded = 0;
5242       ccl_driver (ccl, charbuf, destination_charbuf,
5243                   charbuf_end - charbuf, 1024, charset_list);
5244       if (charset_map_loaded
5245           && (offset = coding_change_destination (coding)))
5246         dst += offset;
5247       if (multibytep)
5248         {
5249           ASSURE_DESTINATION (ccl->produced * 2);
5250           for (i = 0; i < ccl->produced; i++)
5251             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5252         }
5253       else
5254         {
5255           ASSURE_DESTINATION (ccl->produced);
5256           for (i = 0; i < ccl->produced; i++)
5257             *dst++ = destination_charbuf[i] & 0xFF;
5258           produced_chars += ccl->produced;
5259         }
5260       charbuf += ccl->consumed;
5261       if (ccl->status == CCL_STAT_QUIT
5262           || ccl->status == CCL_STAT_INVALID_CMD)
5263         break;
5264     }
5265   while (charbuf < charbuf_end);
5266
5267   switch (ccl->status)
5268     {
5269     case CCL_STAT_SUSPEND_BY_SRC:
5270       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5271       break;
5272     case CCL_STAT_SUSPEND_BY_DST:
5273       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5274       break;
5275     case CCL_STAT_QUIT:
5276     case CCL_STAT_INVALID_CMD:
5277       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5278       break;
5279     default:
5280       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5281       break;
5282     }
5283
5284   coding->produced_char += produced_chars;
5285   coding->produced = dst - coding->destination;
5286   return 0;
5287 }
5288
5289 \f
5290 /*** 10, 11. no-conversion handlers ***/
5291
5292 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5293
5294 static void
5295 decode_coding_raw_text (struct coding_system *coding)
5296 {
5297   bool eol_dos
5298     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5299
5300   coding->chars_at_source = 1;
5301   coding->consumed_char = coding->src_chars;
5302   coding->consumed = coding->src_bytes;
5303   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5304     {
5305       coding->consumed_char--;
5306       coding->consumed--;
5307       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5308     }
5309   else
5310     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5311 }
5312
5313 static bool
5314 encode_coding_raw_text (struct coding_system *coding)
5315 {
5316   bool multibytep = coding->dst_multibyte;
5317   int *charbuf = coding->charbuf;
5318   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5319   unsigned char *dst = coding->destination + coding->produced;
5320   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5321   ptrdiff_t produced_chars = 0;
5322   int c;
5323
5324   if (multibytep)
5325     {
5326       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5327
5328       if (coding->src_multibyte)
5329         while (charbuf < charbuf_end)
5330           {
5331             ASSURE_DESTINATION (safe_room);
5332             c = *charbuf++;
5333             if (ASCII_CHAR_P (c))
5334               EMIT_ONE_ASCII_BYTE (c);
5335             else if (CHAR_BYTE8_P (c))
5336               {
5337                 c = CHAR_TO_BYTE8 (c);
5338                 EMIT_ONE_BYTE (c);
5339               }
5340             else
5341               {
5342                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5343
5344                 CHAR_STRING_ADVANCE (c, p1);
5345                 do
5346                   {
5347                     EMIT_ONE_BYTE (*p0);
5348                     p0++;
5349                   }
5350                 while (p0 < p1);
5351               }
5352           }
5353       else
5354         while (charbuf < charbuf_end)
5355           {
5356             ASSURE_DESTINATION (safe_room);
5357             c = *charbuf++;
5358             EMIT_ONE_BYTE (c);
5359           }
5360     }
5361   else
5362     {
5363       if (coding->src_multibyte)
5364         {
5365           int safe_room = MAX_MULTIBYTE_LENGTH;
5366
5367           while (charbuf < charbuf_end)
5368             {
5369               ASSURE_DESTINATION (safe_room);
5370               c = *charbuf++;
5371               if (ASCII_CHAR_P (c))
5372                 *dst++ = c;
5373               else if (CHAR_BYTE8_P (c))
5374                 *dst++ = CHAR_TO_BYTE8 (c);
5375               else
5376                 CHAR_STRING_ADVANCE (c, dst);
5377             }
5378         }
5379       else
5380         {
5381           ASSURE_DESTINATION (charbuf_end - charbuf);
5382           while (charbuf < charbuf_end && dst < dst_end)
5383             *dst++ = *charbuf++;
5384         }
5385       produced_chars = dst - (coding->destination + coding->produced);
5386     }
5387   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5388   coding->produced_char += produced_chars;
5389   coding->produced = dst - coding->destination;
5390   return 0;
5391 }
5392
5393 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5394    Return true if a text is encoded in a charset-based coding system.  */
5395
5396 static bool
5397 detect_coding_charset (struct coding_system *coding,
5398                        struct coding_detection_info *detect_info)
5399 {
5400   const unsigned char *src = coding->source, *src_base;
5401   const unsigned char *src_end = coding->source + coding->src_bytes;
5402   bool multibytep = coding->src_multibyte;
5403   ptrdiff_t consumed_chars = 0;
5404   Lisp_Object attrs, valids, name;
5405   int found = 0;
5406   ptrdiff_t head_ascii = coding->head_ascii;
5407   bool check_latin_extra = 0;
5408
5409   detect_info->checked |= CATEGORY_MASK_CHARSET;
5410
5411   coding = &coding_categories[coding_category_charset];
5412   attrs = CODING_ID_ATTRS (coding->id);
5413   valids = AREF (attrs, coding_attr_charset_valids);
5414   name = CODING_ID_NAME (coding->id);
5415   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5416                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5417       || strncmp (SSDATA (SYMBOL_NAME (name)),
5418                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5419     check_latin_extra = 1;
5420
5421   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5422     src += head_ascii;
5423
5424   while (1)
5425     {
5426       int c;
5427       Lisp_Object val;
5428       struct charset *charset;
5429       int dim, idx;
5430
5431       src_base = src;
5432       ONE_MORE_BYTE (c);
5433       if (c < 0)
5434         continue;
5435       val = AREF (valids, c);
5436       if (NILP (val))
5437         break;
5438       if (c >= 0x80)
5439         {
5440           if (c < 0xA0
5441               && check_latin_extra
5442               && (!VECTORP (Vlatin_extra_code_table)
5443                   || NILP (AREF (Vlatin_extra_code_table, c))))
5444             break;
5445           found = CATEGORY_MASK_CHARSET;
5446         }
5447       if (INTEGERP (val))
5448         {
5449           charset = CHARSET_FROM_ID (XFASTINT (val));
5450           dim = CHARSET_DIMENSION (charset);
5451           for (idx = 1; idx < dim; idx++)
5452             {
5453               if (src == src_end)
5454                 goto too_short;
5455               ONE_MORE_BYTE (c);
5456               if (c < charset->code_space[(dim - 1 - idx) * 4]
5457                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5458                 break;
5459             }
5460           if (idx < dim)
5461             break;
5462         }
5463       else
5464         {
5465           idx = 1;
5466           for (; CONSP (val); val = XCDR (val))
5467             {
5468               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5469               dim = CHARSET_DIMENSION (charset);
5470               while (idx < dim)
5471                 {
5472                   if (src == src_end)
5473                     goto too_short;
5474                   ONE_MORE_BYTE (c);
5475                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5476                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5477                     break;
5478                   idx++;
5479                 }
5480               if (idx == dim)
5481                 {
5482                   val = Qnil;
5483                   break;
5484                 }
5485             }
5486           if (CONSP (val))
5487             break;
5488         }
5489     }
5490  too_short:
5491   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5492   return 0;
5493
5494  no_more_source:
5495   detect_info->found |= found;
5496   return 1;
5497 }
5498
5499 static void
5500 decode_coding_charset (struct coding_system *coding)
5501 {
5502   const unsigned char *src = coding->source + coding->consumed;
5503   const unsigned char *src_end = coding->source + coding->src_bytes;
5504   const unsigned char *src_base;
5505   int *charbuf = coding->charbuf + coding->charbuf_used;
5506   /* We may produce one charset annotation in one loop and one more at
5507      the end.  */
5508   int *charbuf_end
5509     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5510   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5511   bool multibytep = coding->src_multibyte;
5512   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5513   Lisp_Object valids;
5514   ptrdiff_t char_offset = coding->produced_char;
5515   ptrdiff_t last_offset = char_offset;
5516   int last_id = charset_ascii;
5517   bool eol_dos
5518     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5519   int byte_after_cr = -1;
5520
5521   valids = AREF (attrs, coding_attr_charset_valids);
5522
5523   while (1)
5524     {
5525       int c;
5526       Lisp_Object val;
5527       struct charset *charset;
5528       int dim;
5529       int len = 1;
5530       unsigned code;
5531
5532       src_base = src;
5533       consumed_chars_base = consumed_chars;
5534
5535       if (charbuf >= charbuf_end)
5536         {
5537           if (byte_after_cr >= 0)
5538             src_base--;
5539           break;
5540         }
5541
5542       if (byte_after_cr >= 0)
5543         {
5544           c = byte_after_cr;
5545           byte_after_cr = -1;
5546         }
5547       else
5548         {
5549           ONE_MORE_BYTE (c);
5550           if (eol_dos && c == '\r')
5551             ONE_MORE_BYTE (byte_after_cr);
5552         }
5553       if (c < 0)
5554         goto invalid_code;
5555       code = c;
5556
5557       val = AREF (valids, c);
5558       if (! INTEGERP (val) && ! CONSP (val))
5559         goto invalid_code;
5560       if (INTEGERP (val))
5561         {
5562           charset = CHARSET_FROM_ID (XFASTINT (val));
5563           dim = CHARSET_DIMENSION (charset);
5564           while (len < dim)
5565             {
5566               ONE_MORE_BYTE (c);
5567               code = (code << 8) | c;
5568               len++;
5569             }
5570           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5571                               charset, code, c);
5572         }
5573       else
5574         {
5575           /* VAL is a list of charset IDs.  It is assured that the
5576              list is sorted by charset dimensions (smaller one
5577              comes first).  */
5578           while (CONSP (val))
5579             {
5580               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5581               dim = CHARSET_DIMENSION (charset);
5582               while (len < dim)
5583                 {
5584                   ONE_MORE_BYTE (c);
5585                   code = (code << 8) | c;
5586                   len++;
5587                 }
5588               CODING_DECODE_CHAR (coding, src, src_base,
5589                                   src_end, charset, code, c);
5590               if (c >= 0)
5591                 break;
5592               val = XCDR (val);
5593             }
5594         }
5595       if (c < 0)
5596         goto invalid_code;
5597       if (charset->id != charset_ascii
5598           && last_id != charset->id)
5599         {
5600           if (last_id != charset_ascii)
5601             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5602           last_id = charset->id;
5603           last_offset = char_offset;
5604         }
5605
5606       *charbuf++ = c;
5607       char_offset++;
5608       continue;
5609
5610     invalid_code:
5611       src = src_base;
5612       consumed_chars = consumed_chars_base;
5613       ONE_MORE_BYTE (c);
5614       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5615       char_offset++;
5616     }
5617
5618  no_more_source:
5619   if (last_id != charset_ascii)
5620     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5621   coding->consumed_char += consumed_chars_base;
5622   coding->consumed = src_base - coding->source;
5623   coding->charbuf_used = charbuf - coding->charbuf;
5624 }
5625
5626 static bool
5627 encode_coding_charset (struct coding_system *coding)
5628 {
5629   bool multibytep = coding->dst_multibyte;
5630   int *charbuf = coding->charbuf;
5631   int *charbuf_end = charbuf + coding->charbuf_used;
5632   unsigned char *dst = coding->destination + coding->produced;
5633   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5634   int safe_room = MAX_MULTIBYTE_LENGTH;
5635   ptrdiff_t produced_chars = 0;
5636   Lisp_Object attrs, charset_list;
5637   bool ascii_compatible;
5638   int c;
5639
5640   CODING_GET_INFO (coding, attrs, charset_list);
5641   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5642
5643   while (charbuf < charbuf_end)
5644     {
5645       struct charset *charset;
5646       unsigned code;
5647
5648       ASSURE_DESTINATION (safe_room);
5649       c = *charbuf++;
5650       if (ascii_compatible && ASCII_CHAR_P (c))
5651         EMIT_ONE_ASCII_BYTE (c);
5652       else if (CHAR_BYTE8_P (c))
5653         {
5654           c = CHAR_TO_BYTE8 (c);
5655           EMIT_ONE_BYTE (c);
5656         }
5657       else
5658         {
5659           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5660                                &code, charset);
5661
5662           if (charset)
5663             {
5664               if (CHARSET_DIMENSION (charset) == 1)
5665                 EMIT_ONE_BYTE (code);
5666               else if (CHARSET_DIMENSION (charset) == 2)
5667                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5668               else if (CHARSET_DIMENSION (charset) == 3)
5669                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5670               else
5671                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5672                                  (code >> 8) & 0xFF, code & 0xFF);
5673             }
5674           else
5675             {
5676               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5677                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5678               else
5679                 c = coding->default_char;
5680               EMIT_ONE_BYTE (c);
5681             }
5682         }
5683     }
5684
5685   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5686   coding->produced_char += produced_chars;
5687   coding->produced = dst - coding->destination;
5688   return 0;
5689 }
5690
5691 \f
5692 /*** 10. C library functions ***/
5693
5694 /* Setup coding context CODING from information about CODING_SYSTEM.
5695    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5696    CODING_SYSTEM is invalid, signal an error.  */
5697
5698 void
5699 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5700 {
5701   Lisp_Object attrs;
5702   Lisp_Object eol_type;
5703   Lisp_Object coding_type;
5704   Lisp_Object val;
5705
5706   if (NILP (coding_system))
5707     coding_system = Qundecided;
5708
5709   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5710
5711   attrs = CODING_ID_ATTRS (coding->id);
5712   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5713
5714   coding->mode = 0;
5715   if (VECTORP (eol_type))
5716     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5717                             | CODING_REQUIRE_DETECTION_MASK);
5718   else if (! EQ (eol_type, Qunix))
5719     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5720                             | CODING_REQUIRE_ENCODING_MASK);
5721   else
5722     coding->common_flags = 0;
5723   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5724     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5725   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5726     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5727   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5728     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5729
5730   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5731   coding->max_charset_id = SCHARS (val) - 1;
5732   coding->safe_charsets = SDATA (val);
5733   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5734   coding->carryover_bytes = 0;
5735   coding->raw_destination = 0;
5736
5737   coding_type = CODING_ATTR_TYPE (attrs);
5738   if (EQ (coding_type, Qundecided))
5739     {
5740       coding->detector = NULL;
5741       coding->decoder = decode_coding_raw_text;
5742       coding->encoder = encode_coding_raw_text;
5743       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5744       coding->spec.undecided.inhibit_nbd
5745         = (encode_inhibit_flag
5746            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5747       coding->spec.undecided.inhibit_ied
5748         = (encode_inhibit_flag
5749            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5750       coding->spec.undecided.prefer_utf_8
5751         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5752     }
5753   else if (EQ (coding_type, Qiso_2022))
5754     {
5755       int i;
5756       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5757
5758       /* Invoke graphic register 0 to plane 0.  */
5759       CODING_ISO_INVOCATION (coding, 0) = 0;
5760       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5761       CODING_ISO_INVOCATION (coding, 1)
5762         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5763       /* Setup the initial status of designation.  */
5764       for (i = 0; i < 4; i++)
5765         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5766       /* Not single shifting initially.  */
5767       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5768       /* Beginning of buffer should also be regarded as bol. */
5769       CODING_ISO_BOL (coding) = 1;
5770       coding->detector = detect_coding_iso_2022;
5771       coding->decoder = decode_coding_iso_2022;
5772       coding->encoder = encode_coding_iso_2022;
5773       if (flags & CODING_ISO_FLAG_SAFE)
5774         coding->mode |= CODING_MODE_SAFE_ENCODING;
5775       coding->common_flags
5776         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5777             | CODING_REQUIRE_FLUSHING_MASK);
5778       if (flags & CODING_ISO_FLAG_COMPOSITION)
5779         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5780       if (flags & CODING_ISO_FLAG_DESIGNATION)
5781         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5782       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5783         {
5784           setup_iso_safe_charsets (attrs);
5785           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5786           coding->max_charset_id = SCHARS (val) - 1;
5787           coding->safe_charsets = SDATA (val);
5788         }
5789       CODING_ISO_FLAGS (coding) = flags;
5790       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5791       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5792       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5793       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5794     }
5795   else if (EQ (coding_type, Qcharset))
5796     {
5797       coding->detector = detect_coding_charset;
5798       coding->decoder = decode_coding_charset;
5799       coding->encoder = encode_coding_charset;
5800       coding->common_flags
5801         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5802     }
5803   else if (EQ (coding_type, Qutf_8))
5804     {
5805       val = AREF (attrs, coding_attr_utf_bom);
5806       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5807                                    : EQ (val, Qt) ? utf_with_bom
5808                                    : utf_without_bom);
5809       coding->detector = detect_coding_utf_8;
5810       coding->decoder = decode_coding_utf_8;
5811       coding->encoder = encode_coding_utf_8;
5812       coding->common_flags
5813         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5814       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5815         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5816     }
5817   else if (EQ (coding_type, Qutf_16))
5818     {
5819       val = AREF (attrs, coding_attr_utf_bom);
5820       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5821                                     : EQ (val, Qt) ? utf_with_bom
5822                                     : utf_without_bom);
5823       val = AREF (attrs, coding_attr_utf_16_endian);
5824       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5825                                        : utf_16_little_endian);
5826       CODING_UTF_16_SURROGATE (coding) = 0;
5827       coding->detector = detect_coding_utf_16;
5828       coding->decoder = decode_coding_utf_16;
5829       coding->encoder = encode_coding_utf_16;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5833         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5834     }
5835   else if (EQ (coding_type, Qccl))
5836     {
5837       coding->detector = detect_coding_ccl;
5838       coding->decoder = decode_coding_ccl;
5839       coding->encoder = encode_coding_ccl;
5840       coding->common_flags
5841         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5842             | CODING_REQUIRE_FLUSHING_MASK);
5843     }
5844   else if (EQ (coding_type, Qemacs_mule))
5845     {
5846       coding->detector = detect_coding_emacs_mule;
5847       coding->decoder = decode_coding_emacs_mule;
5848       coding->encoder = encode_coding_emacs_mule;
5849       coding->common_flags
5850         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5851       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5852           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5853         {
5854           Lisp_Object tail, safe_charsets;
5855           int max_charset_id = 0;
5856
5857           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5858                tail = XCDR (tail))
5859             if (max_charset_id < XFASTINT (XCAR (tail)))
5860               max_charset_id = XFASTINT (XCAR (tail));
5861           safe_charsets = make_uninit_string (max_charset_id + 1);
5862           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5863           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5864                tail = XCDR (tail))
5865             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5866           coding->max_charset_id = max_charset_id;
5867           coding->safe_charsets = SDATA (safe_charsets);
5868         }
5869       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5870       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5871     }
5872   else if (EQ (coding_type, Qshift_jis))
5873     {
5874       coding->detector = detect_coding_sjis;
5875       coding->decoder = decode_coding_sjis;
5876       coding->encoder = encode_coding_sjis;
5877       coding->common_flags
5878         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5879     }
5880   else if (EQ (coding_type, Qbig5))
5881     {
5882       coding->detector = detect_coding_big5;
5883       coding->decoder = decode_coding_big5;
5884       coding->encoder = encode_coding_big5;
5885       coding->common_flags
5886         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5887     }
5888   else                          /* EQ (coding_type, Qraw_text) */
5889     {
5890       coding->detector = NULL;
5891       coding->decoder = decode_coding_raw_text;
5892       coding->encoder = encode_coding_raw_text;
5893       if (! EQ (eol_type, Qunix))
5894         {
5895           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5896           if (! VECTORP (eol_type))
5897             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5898         }
5899
5900     }
5901
5902   return;
5903 }
5904
5905 /* Return a list of charsets supported by CODING.  */
5906
5907 Lisp_Object
5908 coding_charset_list (struct coding_system *coding)
5909 {
5910   Lisp_Object attrs, charset_list;
5911
5912   CODING_GET_INFO (coding, attrs, charset_list);
5913   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5914     {
5915       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5916
5917       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5918         charset_list = Viso_2022_charset_list;
5919     }
5920   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5921     {
5922       charset_list = Vemacs_mule_charset_list;
5923     }
5924   return charset_list;
5925 }
5926
5927
5928 /* Return a list of charsets supported by CODING-SYSTEM.  */
5929
5930 Lisp_Object
5931 coding_system_charset_list (Lisp_Object coding_system)
5932 {
5933   ptrdiff_t id;
5934   Lisp_Object attrs, charset_list;
5935
5936   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5937   attrs = CODING_ID_ATTRS (id);
5938
5939   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5940     {
5941       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5942
5943       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5944         charset_list = Viso_2022_charset_list;
5945       else
5946         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5947     }
5948   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5949     {
5950       charset_list = Vemacs_mule_charset_list;
5951     }
5952   else
5953     {
5954       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5955     }
5956   return charset_list;
5957 }
5958
5959
5960 /* Return raw-text or one of its subsidiaries that has the same
5961    eol_type as CODING-SYSTEM.  */
5962
5963 Lisp_Object
5964 raw_text_coding_system (Lisp_Object coding_system)
5965 {
5966   Lisp_Object spec, attrs;
5967   Lisp_Object eol_type, raw_text_eol_type;
5968
5969   if (NILP (coding_system))
5970     return Qraw_text;
5971   spec = CODING_SYSTEM_SPEC (coding_system);
5972   attrs = AREF (spec, 0);
5973
5974   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5975     return coding_system;
5976
5977   eol_type = AREF (spec, 2);
5978   if (VECTORP (eol_type))
5979     return Qraw_text;
5980   spec = CODING_SYSTEM_SPEC (Qraw_text);
5981   raw_text_eol_type = AREF (spec, 2);
5982   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5983           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5984           : AREF (raw_text_eol_type, 2));
5985 }
5986
5987 /* Return true if CODING corresponds to raw-text coding-system.  */
5988
5989 bool
5990 raw_text_coding_system_p (struct coding_system *coding)
5991 {
5992   return (coding->decoder == decode_coding_raw_text
5993           && coding->encoder == encode_coding_raw_text) ? true : false;
5994 }
5995
5996
5997 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5998    the subsidiary that has the same eol-spec as PARENT (if it is not
5999    nil and specifies end-of-line format) or the system's setting
6000    (system_eol_type).  */
6001
6002 Lisp_Object
6003 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6004 {
6005   Lisp_Object spec, eol_type;
6006
6007   if (NILP (coding_system))
6008     coding_system = Qraw_text;
6009   else
6010     CHECK_CODING_SYSTEM (coding_system);
6011   spec = CODING_SYSTEM_SPEC (coding_system);
6012   eol_type = AREF (spec, 2);
6013   if (VECTORP (eol_type))
6014     {
6015       Lisp_Object parent_eol_type;
6016
6017       if (! NILP (parent))
6018         {
6019           Lisp_Object parent_spec;
6020
6021           CHECK_CODING_SYSTEM (parent);
6022           parent_spec = CODING_SYSTEM_SPEC (parent);
6023           parent_eol_type = AREF (parent_spec, 2);
6024           if (VECTORP (parent_eol_type))
6025             parent_eol_type = system_eol_type;
6026         }
6027       else
6028         parent_eol_type = system_eol_type;
6029       if (EQ (parent_eol_type, Qunix))
6030         coding_system = AREF (eol_type, 0);
6031       else if (EQ (parent_eol_type, Qdos))
6032         coding_system = AREF (eol_type, 1);
6033       else if (EQ (parent_eol_type, Qmac))
6034         coding_system = AREF (eol_type, 2);
6035     }
6036   return coding_system;
6037 }
6038
6039
6040 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6041    decided for writing to a process.  If not, complement them, and
6042    return a new coding system.  */
6043
6044 Lisp_Object
6045 complement_process_encoding_system (Lisp_Object coding_system)
6046 {
6047   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6048   Lisp_Object spec, attrs;
6049   int i;
6050
6051   for (i = 0; i < 3; i++)
6052     {
6053       if (i == 1)
6054         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6055       else if (i == 2)
6056         coding_system = preferred_coding_system ();
6057       spec = CODING_SYSTEM_SPEC (coding_system);
6058       if (NILP (spec))
6059         continue;
6060       attrs = AREF (spec, 0);
6061       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6062         coding_base = CODING_ATTR_BASE_NAME (attrs);
6063       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6064         eol_base = coding_system;
6065       if (! NILP (coding_base) && ! NILP (eol_base))
6066         break;
6067     }
6068
6069   if (i > 0)
6070     /* The original CODING_SYSTEM didn't specify text-conversion or
6071        eol-conversion.  Be sure that we return a fully complemented
6072        coding system.  */
6073     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6074   return coding_system;
6075 }
6076
6077
6078 /* Emacs has a mechanism to automatically detect a coding system if it
6079    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6080    it's impossible to distinguish some coding systems accurately
6081    because they use the same range of codes.  So, at first, coding
6082    systems are grouped into the following categories:
6083
6084    o coding-category-emacs-mule
6085
6086         The category for a coding system which has the same code range
6087         as Emacs' internal format.  Assigned the coding-system (Lisp
6088         symbol) `emacs-mule' by default.
6089
6090    o coding-category-sjis
6091
6092         The category for a coding system which has the same code range
6093         as SJIS.  Assigned the coding-system (Lisp
6094         symbol) `japanese-shift-jis' by default.
6095
6096    o coding-category-iso-7
6097
6098         The category for a coding system which has the same code range
6099         as ISO2022 of 7-bit environment.  This doesn't use any locking
6100         shift and single shift functions.  This can encode/decode all
6101         charsets.  Assigned the coding-system (Lisp symbol)
6102         `iso-2022-7bit' by default.
6103
6104    o coding-category-iso-7-tight
6105
6106         Same as coding-category-iso-7 except that this can
6107         encode/decode only the specified charsets.
6108
6109    o coding-category-iso-8-1
6110
6111         The category for a coding system which has the same code range
6112         as ISO2022 of 8-bit environment and graphic plane 1 used only
6113         for DIMENSION1 charset.  This doesn't use any locking shift
6114         and single shift functions.  Assigned the coding-system (Lisp
6115         symbol) `iso-latin-1' by default.
6116
6117    o coding-category-iso-8-2
6118
6119         The category for a coding system which has the same code range
6120         as ISO2022 of 8-bit environment and graphic plane 1 used only
6121         for DIMENSION2 charset.  This doesn't use any locking shift
6122         and single shift functions.  Assigned the coding-system (Lisp
6123         symbol) `japanese-iso-8bit' by default.
6124
6125    o coding-category-iso-7-else
6126
6127         The category for a coding system which has the same code range
6128         as ISO2022 of 7-bit environment but uses locking shift or
6129         single shift functions.  Assigned the coding-system (Lisp
6130         symbol) `iso-2022-7bit-lock' by default.
6131
6132    o coding-category-iso-8-else
6133
6134         The category for a coding system which has the same code range
6135         as ISO2022 of 8-bit environment but uses locking shift or
6136         single shift functions.  Assigned the coding-system (Lisp
6137         symbol) `iso-2022-8bit-ss2' by default.
6138
6139    o coding-category-big5
6140
6141         The category for a coding system which has the same code range
6142         as BIG5.  Assigned the coding-system (Lisp symbol)
6143         `cn-big5' by default.
6144
6145    o coding-category-utf-8
6146
6147         The category for a coding system which has the same code range
6148         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6149         symbol) `utf-8' by default.
6150
6151    o coding-category-utf-16-be
6152
6153         The category for a coding system in which a text has an
6154         Unicode signature (cf. Unicode Standard) in the order of BIG
6155         endian at the head.  Assigned the coding-system (Lisp symbol)
6156         `utf-16-be' by default.
6157
6158    o coding-category-utf-16-le
6159
6160         The category for a coding system in which a text has an
6161         Unicode signature (cf. Unicode Standard) in the order of
6162         LITTLE endian at the head.  Assigned the coding-system (Lisp
6163         symbol) `utf-16-le' by default.
6164
6165    o coding-category-ccl
6166
6167         The category for a coding system of which encoder/decoder is
6168         written in CCL programs.  The default value is nil, i.e., no
6169         coding system is assigned.
6170
6171    o coding-category-binary
6172
6173         The category for a coding system not categorized in any of the
6174         above.  Assigned the coding-system (Lisp symbol)
6175         `no-conversion' by default.
6176
6177    Each of them is a Lisp symbol whose value is the actual
6178    `coding-system' (also a Lisp symbol) assigned to it by a user.
6179    What Emacs actually detects is the category of a coding system;
6180    it then uses the `coding-system' assigned to that category.  If
6181    more than one category is possible, Emacs selects the one with the
6182    highest priority.  The priorities of the categories are also
6183    specified by a user, in the Lisp variable `coding-category-list'.
6184
6185 */
6186
6187 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6188                                            int eol_seen);
6189
6190
6191 /* Return the number of ASCII characters at the head of the source.
6192    By side effects, set coding->head_ascii and update
6193    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6194    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6195    reliable only when all the source bytes are ASCII.  */
6196
6197 static ptrdiff_t
6198 check_ascii (struct coding_system *coding)
6199 {
6200   const unsigned char *src, *end;
6201   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6202   int eol_seen = coding->eol_seen;
6203
6204   coding_set_source (coding);
6205   src = coding->source;
6206   end = src + coding->src_bytes;
6207
6208   if (inhibit_eol_conversion
6209       || SYMBOLP (eol_type))
6210     {
6211       /* We don't have to check EOL format.  */
6212       while (src < end && !( *src & 0x80))
6213         {
6214           if (*src++ == '\n')
6215             eol_seen |= EOL_SEEN_LF;
6216         }
6217     }
6218   else
6219     {
6220       end--;                /* We look ahead one byte for "CR LF".  */
6221       while (src < end)
6222         {
6223           int c = *src;
6224
6225           if (c & 0x80)
6226             break;
6227           src++;
6228           if (c == '\r')
6229             {
6230               if (*src == '\n')
6231                 {
6232                   eol_seen |= EOL_SEEN_CRLF;
6233                   src++;
6234                 }
6235               else
6236                 eol_seen |= EOL_SEEN_CR;
6237             }
6238           else if (c == '\n')
6239             eol_seen |= EOL_SEEN_LF;
6240         }
6241       if (src == end)
6242         {
6243           int c = *src;
6244
6245           /* All bytes but the last one C are ASCII.  */
6246           if (! (c & 0x80))
6247             {
6248               if (c == '\r')
6249                 eol_seen |= EOL_SEEN_CR;
6250               else if (c  == '\n')
6251                 eol_seen |= EOL_SEEN_LF;
6252               src++;
6253             }
6254         }
6255     }
6256   coding->head_ascii = src - coding->source;
6257   coding->eol_seen = eol_seen;
6258   return (coding->head_ascii);
6259 }
6260
6261
6262 /* Return the number of characters at the source if all the bytes are
6263    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6264    effects, update coding->eol_seen.  The value of coding->eol_seen is
6265    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6266    the value is reliable only when all the source bytes are valid
6267    UTF-8.  */
6268
6269 static ptrdiff_t
6270 check_utf_8 (struct coding_system *coding)
6271 {
6272   const unsigned char *src, *end;
6273   int eol_seen;
6274   ptrdiff_t nchars = coding->head_ascii;
6275
6276   if (coding->head_ascii < 0)
6277     check_ascii (coding);
6278   else
6279     coding_set_source (coding);
6280   src = coding->source + coding->head_ascii;
6281   /* We look ahead one byte for CR LF.  */
6282   end = coding->source + coding->src_bytes - 1;
6283   eol_seen = coding->eol_seen;
6284   while (src < end)
6285     {
6286       int c = *src;
6287
6288       if (UTF_8_1_OCTET_P (*src))
6289         {
6290           src++;
6291           if (c < 0x20)
6292             {
6293               if (c == '\r')
6294                 {
6295                   if (*src == '\n')
6296                     {
6297                       eol_seen |= EOL_SEEN_CRLF;
6298                       src++;
6299                       nchars++;
6300                     }
6301                   else
6302                     eol_seen |= EOL_SEEN_CR;
6303                 }
6304               else if (c == '\n')
6305                 eol_seen |= EOL_SEEN_LF;
6306             }
6307         }
6308       else if (UTF_8_2_OCTET_LEADING_P (c))
6309         {
6310           if (c < 0xC2          /* overlong sequence */
6311               || src + 1 >= end
6312               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6313             return -1;
6314           src += 2;
6315         }
6316       else if (UTF_8_3_OCTET_LEADING_P (c))
6317         {
6318           if (src + 2 >= end
6319               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6320                     && UTF_8_EXTRA_OCTET_P (src[2])))
6321             return -1;
6322           c = (((c & 0xF) << 12)
6323                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6324           if (c < 0x800                       /* overlong sequence */
6325               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6326             return -1;
6327           src += 3;
6328         }
6329       else if (UTF_8_4_OCTET_LEADING_P (c))
6330         {
6331           if (src + 3 >= end
6332               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6333                     && UTF_8_EXTRA_OCTET_P (src[2])
6334                     && UTF_8_EXTRA_OCTET_P (src[3])))
6335             return -1;
6336           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6337                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6338           if (c < 0x10000       /* overlong sequence */
6339               || c >= 0x110000) /* non-Unicode character  */
6340             return -1;
6341           src += 4;
6342         }
6343       else
6344         return -1;
6345       nchars++;
6346     }
6347
6348   if (src == end)
6349     {
6350       if (! UTF_8_1_OCTET_P (*src))
6351         return -1;
6352       nchars++;
6353       if (*src == '\r')
6354         eol_seen |= EOL_SEEN_CR;
6355       else if (*src  == '\n')
6356         eol_seen |= EOL_SEEN_LF;
6357     }
6358   coding->eol_seen = eol_seen;
6359   return nchars;
6360 }
6361
6362
6363 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6364    SOURCE is encoded.  If CATEGORY is one of
6365    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6366    two-byte, else they are encoded by one-byte.
6367
6368    Return one of EOL_SEEN_XXX.  */
6369
6370 #define MAX_EOL_CHECK_COUNT 3
6371
6372 static int
6373 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6374             enum coding_category category)
6375 {
6376   const unsigned char *src = source, *src_end = src + src_bytes;
6377   unsigned char c;
6378   int total  = 0;
6379   int eol_seen = EOL_SEEN_NONE;
6380
6381   if ((1 << category) & CATEGORY_MASK_UTF_16)
6382     {
6383       bool msb = category == (coding_category_utf_16_le
6384                               | coding_category_utf_16_le_nosig);
6385       bool lsb = !msb;
6386
6387       while (src + 1 < src_end)
6388         {
6389           c = src[lsb];
6390           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6391             {
6392               int this_eol;
6393
6394               if (c == '\n')
6395                 this_eol = EOL_SEEN_LF;
6396               else if (src + 3 >= src_end
6397                        || src[msb + 2] != 0
6398                        || src[lsb + 2] != '\n')
6399                 this_eol = EOL_SEEN_CR;
6400               else
6401                 {
6402                   this_eol = EOL_SEEN_CRLF;
6403                   src += 2;
6404                 }
6405
6406               if (eol_seen == EOL_SEEN_NONE)
6407                 /* This is the first end-of-line.  */
6408                 eol_seen = this_eol;
6409               else if (eol_seen != this_eol)
6410                 {
6411                   /* The found type is different from what found before.
6412                      Allow for stray ^M characters in DOS EOL files.  */
6413                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6414                       || (eol_seen == EOL_SEEN_CRLF
6415                           && this_eol == EOL_SEEN_CR))
6416                     eol_seen = EOL_SEEN_CRLF;
6417                   else
6418                     {
6419                       eol_seen = EOL_SEEN_LF;
6420                       break;
6421                     }
6422                 }
6423               if (++total == MAX_EOL_CHECK_COUNT)
6424                 break;
6425             }
6426           src += 2;
6427         }
6428     }
6429   else
6430     while (src < src_end)
6431       {
6432         c = *src++;
6433         if (c == '\n' || c == '\r')
6434           {
6435             int this_eol;
6436
6437             if (c == '\n')
6438               this_eol = EOL_SEEN_LF;
6439             else if (src >= src_end || *src != '\n')
6440               this_eol = EOL_SEEN_CR;
6441             else
6442               this_eol = EOL_SEEN_CRLF, src++;
6443
6444             if (eol_seen == EOL_SEEN_NONE)
6445               /* This is the first end-of-line.  */
6446               eol_seen = this_eol;
6447             else if (eol_seen != this_eol)
6448               {
6449                 /* The found type is different from what found before.
6450                    Allow for stray ^M characters in DOS EOL files.  */
6451                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6452                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6453                   eol_seen = EOL_SEEN_CRLF;
6454                 else
6455                   {
6456                     eol_seen = EOL_SEEN_LF;
6457                     break;
6458                   }
6459               }
6460             if (++total == MAX_EOL_CHECK_COUNT)
6461               break;
6462           }
6463       }
6464   return eol_seen;
6465 }
6466
6467
6468 static Lisp_Object
6469 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6470 {
6471   Lisp_Object eol_type;
6472
6473   eol_type = CODING_ID_EOL_TYPE (coding->id);
6474   if (! VECTORP (eol_type))
6475     /* Already adjusted.  */
6476     return eol_type;
6477   if (eol_seen & EOL_SEEN_LF)
6478     {
6479       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6480       eol_type = Qunix;
6481     }
6482   else if (eol_seen & EOL_SEEN_CRLF)
6483     {
6484       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6485       eol_type = Qdos;
6486     }
6487   else if (eol_seen & EOL_SEEN_CR)
6488     {
6489       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6490       eol_type = Qmac;
6491     }
6492   return eol_type;
6493 }
6494
6495 /* Detect how a text specified in CODING is encoded.  If a coding
6496    system is detected, update fields of CODING by the detected coding
6497    system.  */
6498
6499 static void
6500 detect_coding (struct coding_system *coding)
6501 {
6502   const unsigned char *src, *src_end;
6503   unsigned int saved_mode = coding->mode;
6504   Lisp_Object found = Qnil;
6505   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6506
6507   coding->consumed = coding->consumed_char = 0;
6508   coding->produced = coding->produced_char = 0;
6509   coding_set_source (coding);
6510
6511   src_end = coding->source + coding->src_bytes;
6512
6513   coding->eol_seen = EOL_SEEN_NONE;
6514   /* If we have not yet decided the text encoding type, detect it
6515      now.  */
6516   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6517     {
6518       int c, i;
6519       struct coding_detection_info detect_info;
6520       bool null_byte_found = 0, eight_bit_found = 0;
6521       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6522                                        inhibit_null_byte_detection);
6523       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6524                                        inhibit_iso_escape_detection);
6525       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6526
6527       coding->head_ascii = 0;
6528       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6529       for (src = coding->source; src < src_end; src++)
6530         {
6531           c = *src;
6532           if (c & 0x80)
6533             {
6534               eight_bit_found = 1;
6535               if (null_byte_found)
6536                 break;
6537             }
6538           else if (c < 0x20)
6539             {
6540               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6541                   && ! inhibit_ied
6542                   && ! detect_info.checked)
6543                 {
6544                   if (detect_coding_iso_2022 (coding, &detect_info))
6545                     {
6546                       /* We have scanned the whole data.  */
6547                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6548                         {
6549                           /* We didn't find an 8-bit code.  We may
6550                              have found a null-byte, but it's very
6551                              rare that a binary file conforms to
6552                              ISO-2022.  */
6553                           src = src_end;
6554                           coding->head_ascii = src - coding->source;
6555                         }
6556                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6557                       break;
6558                     }
6559                 }
6560               else if (! c && !inhibit_nbd)
6561                 {
6562                   null_byte_found = 1;
6563                   if (eight_bit_found)
6564                     break;
6565                 }
6566               else if (! disable_ascii_optimization
6567                        && ! inhibit_eol_conversion)
6568                 {
6569                   if (c == '\r')
6570                     {
6571                       if (src < src_end && src[1] == '\n')
6572                         {
6573                           coding->eol_seen |= EOL_SEEN_CRLF;
6574                           src++;
6575                           if (! eight_bit_found)
6576                             coding->head_ascii++;
6577                         }
6578                       else
6579                         coding->eol_seen |= EOL_SEEN_CR;
6580                     }
6581                   else if (c == '\n')
6582                     {
6583                       coding->eol_seen |= EOL_SEEN_LF;
6584                     }
6585                 }
6586
6587               if (! eight_bit_found)
6588                 coding->head_ascii++;
6589             }
6590           else if (! eight_bit_found)
6591             coding->head_ascii++;
6592         }
6593
6594       if (null_byte_found || eight_bit_found
6595           || coding->head_ascii < coding->src_bytes
6596           || detect_info.found)
6597         {
6598           enum coding_category category;
6599           struct coding_system *this;
6600
6601           if (coding->head_ascii == coding->src_bytes)
6602             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6603             for (i = 0; i < coding_category_raw_text; i++)
6604               {
6605                 category = coding_priorities[i];
6606                 this = coding_categories + category;
6607                 if (detect_info.found & (1 << category))
6608                   break;
6609               }
6610           else
6611             {
6612               if (null_byte_found)
6613                 {
6614                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6615                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6616                 }
6617               else if (prefer_utf_8
6618                        && detect_coding_utf_8 (coding, &detect_info))
6619                 {
6620                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6621                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6622                 }
6623               for (i = 0; i < coding_category_raw_text; i++)
6624                 {
6625                   category = coding_priorities[i];
6626                   this = coding_categories + category;
6627                   /* Some of this->detector (e.g. detect_coding_sjis)
6628                      require this information.  */
6629                   coding->id = this->id;
6630                   if (this->id < 0)
6631                     {
6632                       /* No coding system of this category is defined.  */
6633                       detect_info.rejected |= (1 << category);
6634                     }
6635                   else if (category >= coding_category_raw_text)
6636                     continue;
6637                   else if (detect_info.checked & (1 << category))
6638                     {
6639                       if (detect_info.found & (1 << category))
6640                         break;
6641                     }
6642                   else if ((*(this->detector)) (coding, &detect_info)
6643                            && detect_info.found & (1 << category))
6644                     break;
6645                 }
6646             }
6647
6648           if (i < coding_category_raw_text)
6649             {
6650               if (category == coding_category_utf_8_auto)
6651                 {
6652                   Lisp_Object coding_systems;
6653
6654                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6655                                          coding_attr_utf_bom);
6656                   if (CONSP (coding_systems))
6657                     {
6658                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6659                         found = XCAR (coding_systems);
6660                       else
6661                         found = XCDR (coding_systems);
6662                     }
6663                   else
6664                     found = CODING_ID_NAME (this->id);
6665                 }
6666               else if (category == coding_category_utf_16_auto)
6667                 {
6668                   Lisp_Object coding_systems;
6669
6670                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6671                                          coding_attr_utf_bom);
6672                   if (CONSP (coding_systems))
6673                     {
6674                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6675                         found = XCAR (coding_systems);
6676                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6677                         found = XCDR (coding_systems);
6678                     }
6679                   else
6680                     found = CODING_ID_NAME (this->id);
6681                 }
6682               else
6683                 found = CODING_ID_NAME (this->id);
6684             }
6685           else if (null_byte_found)
6686             found = Qno_conversion;
6687           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6688                    == CATEGORY_MASK_ANY)
6689             found = Qraw_text;
6690           else if (detect_info.rejected)
6691             for (i = 0; i < coding_category_raw_text; i++)
6692               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6693                 {
6694                   this = coding_categories + coding_priorities[i];
6695                   found = CODING_ID_NAME (this->id);
6696                   break;
6697                 }
6698         }
6699     }
6700   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6701            == coding_category_utf_8_auto)
6702     {
6703       Lisp_Object coding_systems;
6704       struct coding_detection_info detect_info;
6705
6706       coding_systems
6707         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6708       detect_info.found = detect_info.rejected = 0;
6709       if (check_ascii (coding) == coding->src_bytes)
6710         {
6711           if (CONSP (coding_systems))
6712             found = XCDR (coding_systems);
6713         }
6714       else
6715         {
6716           if (CONSP (coding_systems)
6717               && detect_coding_utf_8 (coding, &detect_info))
6718             {
6719               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6720                 found = XCAR (coding_systems);
6721               else
6722                 found = XCDR (coding_systems);
6723             }
6724         }
6725     }
6726   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6727            == coding_category_utf_16_auto)
6728     {
6729       Lisp_Object coding_systems;
6730       struct coding_detection_info detect_info;
6731
6732       coding_systems
6733         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6734       detect_info.found = detect_info.rejected = 0;
6735       coding->head_ascii = 0;
6736       if (CONSP (coding_systems)
6737           && detect_coding_utf_16 (coding, &detect_info))
6738         {
6739           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6740             found = XCAR (coding_systems);
6741           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6742             found = XCDR (coding_systems);
6743         }
6744     }
6745
6746   if (! NILP (found))
6747     {
6748       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6749                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6750                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6751                            : EOL_SEEN_LF);
6752
6753       setup_coding_system (found, coding);
6754       if (specified_eol != EOL_SEEN_NONE)
6755         adjust_coding_eol_type (coding, specified_eol);
6756     }
6757
6758   coding->mode = saved_mode;
6759 }
6760
6761
6762 static void
6763 decode_eol (struct coding_system *coding)
6764 {
6765   Lisp_Object eol_type;
6766   unsigned char *p, *pbeg, *pend;
6767
6768   eol_type = CODING_ID_EOL_TYPE (coding->id);
6769   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6770     return;
6771
6772   if (NILP (coding->dst_object))
6773     pbeg = coding->destination;
6774   else
6775     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6776   pend = pbeg + coding->produced;
6777
6778   if (VECTORP (eol_type))
6779     {
6780       int eol_seen = EOL_SEEN_NONE;
6781
6782       for (p = pbeg; p < pend; p++)
6783         {
6784           if (*p == '\n')
6785             eol_seen |= EOL_SEEN_LF;
6786           else if (*p == '\r')
6787             {
6788               if (p + 1 < pend && *(p + 1) == '\n')
6789                 {
6790                   eol_seen |= EOL_SEEN_CRLF;
6791                   p++;
6792                 }
6793               else
6794                 eol_seen |= EOL_SEEN_CR;
6795             }
6796         }
6797       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6798       if ((eol_seen & EOL_SEEN_CRLF) != 0
6799           && (eol_seen & EOL_SEEN_CR) != 0
6800           && (eol_seen & EOL_SEEN_LF) == 0)
6801         eol_seen = EOL_SEEN_CRLF;
6802       else if (eol_seen != EOL_SEEN_NONE
6803           && eol_seen != EOL_SEEN_LF
6804           && eol_seen != EOL_SEEN_CRLF
6805           && eol_seen != EOL_SEEN_CR)
6806         eol_seen = EOL_SEEN_LF;
6807       if (eol_seen != EOL_SEEN_NONE)
6808         eol_type = adjust_coding_eol_type (coding, eol_seen);
6809     }
6810
6811   if (EQ (eol_type, Qmac))
6812     {
6813       for (p = pbeg; p < pend; p++)
6814         if (*p == '\r')
6815           *p = '\n';
6816     }
6817   else if (EQ (eol_type, Qdos))
6818     {
6819       ptrdiff_t n = 0;
6820       ptrdiff_t pos = coding->dst_pos;
6821       ptrdiff_t pos_byte = coding->dst_pos_byte;
6822       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6823
6824       /* This assertion is here instead of code, now deleted, that
6825          handled the NILP case, which no longer happens with the
6826          current codebase.  */
6827       eassert (!NILP (coding->dst_object));
6828
6829       while (pos_byte < pos_end)
6830         {
6831           int incr;
6832
6833           p = BYTE_POS_ADDR (pos_byte);
6834           if (coding->dst_multibyte)
6835             incr = BYTES_BY_CHAR_HEAD (*p);
6836           else
6837             incr = 1;
6838
6839           if (*p == '\r' && p[1] == '\n')
6840             {
6841               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6842               n++;
6843               pos_end--;
6844             }
6845           pos++;
6846           pos_byte += incr;
6847         }
6848       coding->produced -= n;
6849       coding->produced_char -= n;
6850     }
6851 }
6852
6853
6854 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6855    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6856    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6857 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6858
6859 /* Return a translation table (or list of them) from coding system
6860    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6861    not ENCODEP). */
6862
6863 static Lisp_Object
6864 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6865 {
6866   Lisp_Object standard, translation_table;
6867   Lisp_Object val;
6868
6869   if (NILP (Venable_character_translation))
6870     {
6871       if (max_lookup)
6872         *max_lookup = 0;
6873       return Qnil;
6874     }
6875   if (encodep)
6876     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6877       standard = Vstandard_translation_table_for_encode;
6878   else
6879     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6880       standard = Vstandard_translation_table_for_decode;
6881   if (NILP (translation_table))
6882     translation_table = standard;
6883   else
6884     {
6885       if (SYMBOLP (translation_table))
6886         translation_table = Fget (translation_table, Qtranslation_table);
6887       else if (CONSP (translation_table))
6888         {
6889           translation_table = Fcopy_sequence (translation_table);
6890           for (val = translation_table; CONSP (val); val = XCDR (val))
6891             if (SYMBOLP (XCAR (val)))
6892               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6893         }
6894       if (CHAR_TABLE_P (standard))
6895         {
6896           if (CONSP (translation_table))
6897             translation_table = nconc2 (translation_table, list1 (standard));
6898           else
6899             translation_table = list2 (translation_table, standard);
6900         }
6901     }
6902
6903   if (max_lookup)
6904     {
6905       *max_lookup = 1;
6906       if (CHAR_TABLE_P (translation_table)
6907           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6908         {
6909           val = XCHAR_TABLE (translation_table)->extras[1];
6910           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6911             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6912         }
6913       else if (CONSP (translation_table))
6914         {
6915           Lisp_Object tail;
6916
6917           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6918             if (CHAR_TABLE_P (XCAR (tail))
6919                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6920               {
6921                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6922                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6923                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6924               }
6925         }
6926     }
6927   return translation_table;
6928 }
6929
/* Look up character C in TABLE, which is a char-table or a list
   whose char-table elements are tried in order.  Whenever a table
   maps C to a character, C is replaced by that character and TRANS
   is reset to Qnil; in a list the lookup then continues with the
   next table.  A non-nil, non-character entry is left in TRANS (and
   ends the walk through a list).  C and TRANS must be lvalues.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tbl_tail = table;                           \
                                                                \
        while (CONSP (tbl_tail))                                \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tbl_tail)))                 \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tbl_tail), c);    \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tbl_tail = XCDR (tbl_tail);                         \
          }                                                     \
      }                                                         \
  } while (0)
6954
6955
6956 /* Return a translation of character(s) at BUF according to TRANS.
6957    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
6958    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
6959    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
6960    found, or Qt if BUF is too short to lookup characters in FROM.  As
6961    a side effect, if a translation is found, *NCHARS is set to the
6962    number of characters being translated.  */
6963
6964 static Lisp_Object
6965 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6966 {
6967   if (INTEGERP (trans) || VECTORP (trans))
6968     {
6969       *nchars = 1;
6970       return trans;
6971     }
6972   for (; CONSP (trans); trans = XCDR (trans))
6973     {
6974       Lisp_Object val = XCAR (trans);
6975       Lisp_Object from = XCAR (val);
6976       ptrdiff_t len = ASIZE (from);
6977       ptrdiff_t i;
6978
6979       for (i = 0; i < len; i++)
6980         {
6981           if (buf + i == buf_end)
6982             return Qt;
6983           if (XINT (AREF (from, i)) != buf[i])
6984             break;
6985         }
6986       if (i == len)
6987         {
6988           *nchars = len;
6989           return XCDR (val);
6990         }
6991     }
6992   return Qnil;
6993 }
6994
6995
6996 static int
6997 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6998                bool last_block)
6999 {
7000   unsigned char *dst = coding->destination + coding->produced;
7001   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7002   ptrdiff_t produced;
7003   ptrdiff_t produced_chars = 0;
7004   int carryover = 0;
7005
7006   if (! coding->chars_at_source)
7007     {
7008       /* Source characters are in coding->charbuf.  */
7009       int *buf = coding->charbuf;
7010       int *buf_end = buf + coding->charbuf_used;
7011
7012       if (EQ (coding->src_object, coding->dst_object)
7013           && ! NILP (coding->dst_object))
7014         {
7015           eassert (growable_destination (coding));
7016           coding_set_source (coding);
7017           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7018         }
7019
7020       while (buf < buf_end)
7021         {
7022           int c = *buf;
7023           ptrdiff_t i;
7024
7025           if (c >= 0)
7026             {
7027               ptrdiff_t from_nchars = 1, to_nchars = 1;
7028               Lisp_Object trans = Qnil;
7029
7030               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7031               if (! NILP (trans))
7032                 {
7033                   trans = get_translation (trans, buf, buf_end, &from_nchars);
7034                   if (INTEGERP (trans))
7035                     c = XINT (trans);
7036                   else if (VECTORP (trans))
7037                     {
7038                       to_nchars = ASIZE (trans);
7039                       c = XINT (AREF (trans, 0));
7040                     }
7041                   else if (EQ (trans, Qt) && ! last_block)
7042                     break;
7043                 }
7044
7045               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7046                 {
7047                   eassert (growable_destination (coding));
7048                   ptrdiff_t dst_size;
7049                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7050                                           &dst_size)
7051                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7052                     memory_full (SIZE_MAX);
7053                   dst = alloc_destination (coding, dst_size, dst);
7054                   if (EQ (coding->src_object, coding->dst_object))
7055                     {
7056                       coding_set_source (coding);
7057                       dst_end = (((unsigned char *) coding->source)
7058                                  + coding->consumed);
7059                     }
7060                   else
7061                     dst_end = coding->destination + coding->dst_bytes;
7062                 }
7063
7064               for (i = 0; i < to_nchars; i++)
7065                 {
7066                   if (i > 0)
7067                     c = XINT (AREF (trans, i));
7068                   if (coding->dst_multibyte
7069                       || ! CHAR_BYTE8_P (c))
7070                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7071                   else
7072                     *dst++ = CHAR_TO_BYTE8 (c);
7073                 }
7074               produced_chars += to_nchars;
7075               buf += from_nchars;
7076             }
7077           else
7078             /* This is an annotation datum.  (-C) is the length.  */
7079             buf += -c;
7080         }
7081       carryover = buf_end - buf;
7082     }
7083   else
7084     {
7085       /* Source characters are at coding->source.  */
7086       const unsigned char *src = coding->source;
7087       const unsigned char *src_end = src + coding->consumed;
7088
7089       if (EQ (coding->dst_object, coding->src_object))
7090         {
7091           eassert (growable_destination (coding));
7092           dst_end = (unsigned char *) src;
7093         }
7094       if (coding->src_multibyte != coding->dst_multibyte)
7095         {
7096           if (coding->src_multibyte)
7097             {
7098               bool multibytep = 1;
7099               ptrdiff_t consumed_chars = 0;
7100
7101               while (1)
7102                 {
7103                   const unsigned char *src_base = src;
7104                   int c;
7105
7106                   ONE_MORE_BYTE (c);
7107                   if (dst == dst_end)
7108                     {
7109                       eassert (growable_destination (coding));
7110                       if (EQ (coding->src_object, coding->dst_object))
7111                         dst_end = (unsigned char *) src;
7112                       if (dst == dst_end)
7113                         {
7114                           ptrdiff_t offset = src - coding->source;
7115
7116                           dst = alloc_destination (coding, src_end - src + 1,
7117                                                    dst);
7118                           dst_end = coding->destination + coding->dst_bytes;
7119                           coding_set_source (coding);
7120                           src = coding->source + offset;
7121                           src_end = coding->source + coding->consumed;
7122                           if (EQ (coding->src_object, coding->dst_object))
7123                             dst_end = (unsigned char *) src;
7124                         }
7125                     }
7126                   *dst++ = c;
7127                   produced_chars++;
7128                 }
7129             no_more_source:
7130               ;
7131             }
7132           else
7133             while (src < src_end)
7134               {
7135                 bool multibytep = 1;
7136                 int c = *src++;
7137
7138                 if (dst >= dst_end - 1)
7139                   {
7140                     eassert (growable_destination (coding));
7141                     if (EQ (coding->src_object, coding->dst_object))
7142                       dst_end = (unsigned char *) src;
7143                     if (dst >= dst_end - 1)
7144                       {
7145                         ptrdiff_t offset = src - coding->source;
7146                         ptrdiff_t more_bytes;
7147
7148                         if (EQ (coding->src_object, coding->dst_object))
7149                           more_bytes = ((src_end - src) / 2) + 2;
7150                         else
7151                           more_bytes = src_end - src + 2;
7152                         dst = alloc_destination (coding, more_bytes, dst);
7153                         dst_end = coding->destination + coding->dst_bytes;
7154                         coding_set_source (coding);
7155                         src = coding->source + offset;
7156                         src_end = coding->source + coding->consumed;
7157                         if (EQ (coding->src_object, coding->dst_object))
7158                           dst_end = (unsigned char *) src;
7159                       }
7160                   }
7161                 EMIT_ONE_BYTE (c);
7162               }
7163         }
7164       else
7165         {
7166           if (!EQ (coding->src_object, coding->dst_object))
7167             {
7168               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7169
7170               if (require > 0)
7171                 {
7172                   ptrdiff_t offset = src - coding->source;
7173
7174                   dst = alloc_destination (coding, require, dst);
7175                   coding_set_source (coding);
7176                   src = coding->source + offset;
7177                   src_end = coding->source + coding->consumed;
7178                 }
7179             }
7180           produced_chars = coding->consumed_char;
7181           while (src < src_end)
7182             *dst++ = *src++;
7183         }
7184     }
7185
7186   produced = dst - (coding->destination + coding->produced);
7187   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7188     insert_from_gap (produced_chars, produced, 0);
7189   coding->produced += produced;
7190   coding->produced_char += produced_chars;
7191   return carryover;
7192 }
7193
7194 /* Compose text in CODING->object according to the annotation data at
7195    CHARBUF.  CHARBUF is an array:
7196      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7197  */
7198
7199 static void
7200 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7201 {
7202   int len;
7203   ptrdiff_t to;
7204   enum composition_method method;
7205   Lisp_Object components;
7206
7207   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7208   to = pos + charbuf[2];
7209   method = (enum composition_method) (charbuf[4]);
7210
7211   if (method == COMPOSITION_RELATIVE)
7212     components = Qnil;
7213   else
7214     {
7215       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7216       int i, j;
7217
7218       if (method == COMPOSITION_WITH_RULE)
7219         len = charbuf[2] * 3 - 2;
7220       charbuf += MAX_ANNOTATION_LENGTH;
7221       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7222       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7223         {
7224           if (charbuf[i] >= 0)
7225             args[j] = make_number (charbuf[i]);
7226           else
7227             {
7228               i++;
7229               args[j] = make_number (charbuf[i] % 0x100);
7230             }
7231         }
7232       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7233     }
7234   compose_text (pos, to, components, Qnil, coding->dst_object);
7235 }
7236
7237
7238 /* Put `charset' property on text in CODING->object according to
7239    the annotation data at CHARBUF.  CHARBUF is an array:
7240      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7241  */
7242
7243 static void
7244 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7245 {
7246   ptrdiff_t from = pos - charbuf[2];
7247   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7248
7249   Fput_text_property (make_number (from), make_number (pos),
7250                       Qcharset, CHARSET_NAME (charset),
7251                       coding->dst_object);
7252 }
7253
7254 #define MAX_CHARBUF_SIZE 0x4000
/* The maximum number of extra units that decoding functions expect to
   be available in coding->charbuf.  Currently decode_coding_emacs_mule
   needs the following size, which is the largest requirement.  */
7258 #define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)
7259
/* Allocate the conversion work area of CODING with SAFE_ALLOCA and
   record its size (in units of int) in CODING->charbuf_size.  The
   area holds SIZE units plus MAX_CHARBUF_EXTRA_SIZE, capped at
   MAX_CHARBUF_SIZE.

   Because this expands to SAFE_ALLOCA, it is used in functions that
   declare USE_SAFE_ALLOCA and finish with SAFE_FREE.  CODING is
   parenthesized in the expansion so that any pointer expression may
   be passed, not just a plain identifier.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7267
7268 static void
7269 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7270 {
7271   int *charbuf = coding->charbuf;
7272   int *charbuf_end = charbuf + coding->charbuf_used;
7273
7274   if (NILP (coding->dst_object))
7275     return;
7276
7277   while (charbuf < charbuf_end)
7278     {
7279       if (*charbuf >= 0)
7280         pos++, charbuf++;
7281       else
7282         {
7283           int len = -*charbuf;
7284
7285           if (len > 2)
7286             switch (charbuf[1])
7287               {
7288               case CODING_ANNOTATE_COMPOSITION_MASK:
7289                 produce_composition (coding, charbuf, pos);
7290                 break;
7291               case CODING_ANNOTATE_CHARSET_MASK:
7292                 produce_charset (coding, charbuf, pos);
7293                 break;
7294               default:
7295                 break;
7296               }
7297           charbuf += len;
7298         }
7299     }
7300 }
7301
7302 /* Decode the data at CODING->src_object into CODING->dst_object.
7303    CODING->src_object is a buffer, a string, or nil.
7304    CODING->dst_object is a buffer.
7305
7306    If CODING->src_object is a buffer, it must be the current buffer.
7307    In this case, if CODING->src_pos is positive, it is a position of
7308    the source text in the buffer, otherwise, the source text is in the
7309    gap area of the buffer, and CODING->src_pos specifies the offset of
7310    the text from GPT (which must be the same as PT).  If this is the
7311    same buffer as CODING->dst_object, CODING->src_pos must be
7312    negative.
7313
7314    If CODING->src_object is a string, CODING->src_pos is an index to
7315    that string.
7316
7317    If CODING->src_object is nil, CODING->source must already point to
7318    the non-relocatable memory area.  In this case, CODING->src_pos is
7319    an offset from CODING->source.
7320
7321    The decoded data is inserted at the current point of the buffer
7322    CODING->dst_object.
7323 */
7324
7325 static void
7326 decode_coding (struct coding_system *coding)
7327 {
7328   Lisp_Object attrs;
7329   Lisp_Object undo_list;
7330   Lisp_Object translation_table;
7331   struct ccl_spec cclspec;
7332   int carryover;
7333   int i;
7334
7335   USE_SAFE_ALLOCA;
7336
7337   if (BUFFERP (coding->src_object)
7338       && coding->src_pos > 0
7339       && coding->src_pos < GPT
7340       && coding->src_pos + coding->src_chars > GPT)
7341     move_gap_both (coding->src_pos, coding->src_pos_byte);
7342
7343   undo_list = Qt;
7344   if (BUFFERP (coding->dst_object))
7345     {
7346       set_buffer_internal (XBUFFER (coding->dst_object));
7347       if (GPT != PT)
7348         move_gap_both (PT, PT_BYTE);
7349
7350       /* We must disable undo_list in order to record the whole insert
7351          transaction via record_insert at the end.  But doing so also
7352          disables the recording of the first change to the undo_list.
7353          Therefore we check for first change here and record it via
7354          record_first_change if needed.  */
7355       if (MODIFF <= SAVE_MODIFF)
7356         record_first_change ();
7357
7358       undo_list = BVAR (current_buffer, undo_list);
7359       bset_undo_list (current_buffer, Qt);
7360     }
7361
7362   coding->consumed = coding->consumed_char = 0;
7363   coding->produced = coding->produced_char = 0;
7364   coding->chars_at_source = 0;
7365   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7366
7367   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7368
7369   attrs = CODING_ID_ATTRS (coding->id);
7370   translation_table = get_translation_table (attrs, 0, NULL);
7371
7372   carryover = 0;
7373   if (coding->decoder == decode_coding_ccl)
7374     {
7375       coding->spec.ccl = &cclspec;
7376       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7377     }
7378   do
7379     {
7380       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7381
7382       coding_set_source (coding);
7383       coding->annotated = 0;
7384       coding->charbuf_used = carryover;
7385       (*(coding->decoder)) (coding);
7386       coding_set_destination (coding);
7387       carryover = produce_chars (coding, translation_table, 0);
7388       if (coding->annotated)
7389         produce_annotation (coding, pos);
7390       for (i = 0; i < carryover; i++)
7391         coding->charbuf[i]
7392           = coding->charbuf[coding->charbuf_used - carryover + i];
7393     }
7394   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7395          || (coding->consumed < coding->src_bytes
7396              && (coding->result == CODING_RESULT_SUCCESS
7397                  || coding->result == CODING_RESULT_INVALID_SRC)));
7398
7399   if (carryover > 0)
7400     {
7401       coding_set_destination (coding);
7402       coding->charbuf_used = carryover;
7403       produce_chars (coding, translation_table, 1);
7404     }
7405
7406   coding->carryover_bytes = 0;
7407   if (coding->consumed < coding->src_bytes)
7408     {
7409       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7410       const unsigned char *src;
7411
7412       coding_set_source (coding);
7413       coding_set_destination (coding);
7414       src = coding->source + coding->consumed;
7415
7416       if (coding->mode & CODING_MODE_LAST_BLOCK)
7417         {
7418           /* Flush out unprocessed data as binary chars.  We are sure
7419              that the number of data is less than the size of
7420              coding->charbuf.  */
7421           coding->charbuf_used = 0;
7422           coding->chars_at_source = 0;
7423
7424           while (nbytes-- > 0)
7425             {
7426               int c;
7427
7428               /* Copy raw bytes in their 2-byte forms from multibyte
7429                  text as single characters.  */
7430               if (coding->src_multibyte
7431                   && CHAR_BYTE8_HEAD_P (*src) && nbytes > 0)
7432                 {
7433                   c = STRING_CHAR_ADVANCE (src);
7434                   nbytes--;
7435                 }
7436               else
7437                 {
7438                   c = *src++;
7439
7440                   if (c & 0x80)
7441                     c = BYTE8_TO_CHAR (c);
7442                 }
7443               coding->charbuf[coding->charbuf_used++] = c;
7444             }
7445           produce_chars (coding, Qnil, 1);
7446         }
7447       else
7448         {
7449           /* Record unprocessed bytes in coding->carryover.  We are
7450              sure that the number of data is less than the size of
7451              coding->carryover.  */
7452           unsigned char *p = coding->carryover;
7453
7454           if (nbytes > sizeof coding->carryover)
7455             nbytes = sizeof coding->carryover;
7456           coding->carryover_bytes = nbytes;
7457           while (nbytes-- > 0)
7458             *p++ = *src++;
7459         }
7460       coding->consumed = coding->src_bytes;
7461     }
7462
7463   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7464       && !inhibit_eol_conversion)
7465     decode_eol (coding);
7466   if (BUFFERP (coding->dst_object))
7467     {
7468       bset_undo_list (current_buffer, undo_list);
7469       record_insert (coding->dst_pos, coding->produced_char);
7470     }
7471
7472   SAFE_FREE ();
7473 }
7474
7475
7476 /* Extract an annotation datum from a composition starting at POS and
7477    ending before LIMIT of CODING->src_object (buffer or string), store
7478    the data in BUF, set *STOP to a starting position of the next
7479    composition (if any) or to LIMIT, and return the address of the
7480    next element of BUF.
7481
7482    If such an annotation is not found, set *STOP to a starting
7483    position of a composition after POS (if any) or to LIMIT, and
7484    return BUF.  */
7485
7486 static int *
7487 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7488                                struct coding_system *coding, int *buf,
7489                                ptrdiff_t *stop)
7490 {
7491   ptrdiff_t start, end;
7492   Lisp_Object prop;
7493
7494   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7495       || end > limit)
7496     *stop = limit;
7497   else if (start > pos)
7498     *stop = start;
7499   else
7500     {
7501       if (start == pos)
7502         {
7503           /* We found a composition.  Store the corresponding
7504              annotation data in BUF.  */
7505           int *head = buf;
7506           enum composition_method method = composition_method (prop);
7507           int nchars = COMPOSITION_LENGTH (prop);
7508
7509           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7510           if (method != COMPOSITION_RELATIVE)
7511             {
7512               Lisp_Object components;
7513               ptrdiff_t i, len, i_byte;
7514
7515               components = COMPOSITION_COMPONENTS (prop);
7516               if (VECTORP (components))
7517                 {
7518                   len = ASIZE (components);
7519                   for (i = 0; i < len; i++)
7520                     *buf++ = XINT (AREF (components, i));
7521                 }
7522               else if (STRINGP (components))
7523                 {
7524                   len = SCHARS (components);
7525                   i = i_byte = 0;
7526                   while (i < len)
7527                     {
7528                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7529                       buf++;
7530                     }
7531                 }
7532               else if (INTEGERP (components))
7533                 {
7534                   len = 1;
7535                   *buf++ = XINT (components);
7536                 }
7537               else if (CONSP (components))
7538                 {
7539                   for (len = 0; CONSP (components);
7540                        len++, components = XCDR (components))
7541                     *buf++ = XINT (XCAR (components));
7542                 }
7543               else
7544                 emacs_abort ();
7545               *head -= len;
7546             }
7547         }
7548
7549       if (find_composition (end, limit, &start, &end, &prop,
7550                             coding->src_object)
7551           && end <= limit)
7552         *stop = start;
7553       else
7554         *stop = limit;
7555     }
7556   return buf;
7557 }
7558
7559
7560 /* Extract an annotation datum from a text property `charset' at POS of
7561    CODING->src_object (buffer of string), store the data in BUF, set
7562    *STOP to the position where the value of `charset' property changes
7563    (limiting by LIMIT), and return the address of the next element of
7564    BUF.
7565
7566    If the property value is nil, set *STOP to the position where the
7567    property value is non-nil (limiting by LIMIT), and return BUF.  */
7568
7569 static int *
7570 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7571                            struct coding_system *coding, int *buf,
7572                            ptrdiff_t *stop)
7573 {
7574   Lisp_Object val, next;
7575   int id;
7576
7577   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7578   if (! NILP (val) && CHARSETP (val))
7579     id = XINT (CHARSET_SYMBOL_ID (val));
7580   else
7581     id = -1;
7582   ADD_CHARSET_DATA (buf, 0, id);
7583   next = Fnext_single_property_change (make_number (pos), Qcharset,
7584                                        coding->src_object,
7585                                        make_number (limit));
7586   *stop = XINT (next);
7587   return buf;
7588 }
7589
7590
7591 static void
7592 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7593                int max_lookup)
7594 {
7595   int *buf = coding->charbuf;
7596   int *buf_end = coding->charbuf + coding->charbuf_size;
7597   const unsigned char *src = coding->source + coding->consumed;
7598   const unsigned char *src_end = coding->source + coding->src_bytes;
7599   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7600   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7601   bool multibytep = coding->src_multibyte;
7602   Lisp_Object eol_type;
7603   int c;
7604   ptrdiff_t stop, stop_composition, stop_charset;
7605   int *lookup_buf = NULL;
7606
7607   if (! NILP (translation_table))
7608     lookup_buf = alloca (sizeof (int) * max_lookup);
7609
7610   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7611   if (VECTORP (eol_type))
7612     eol_type = Qunix;
7613
7614   /* Note: composition handling is not yet implemented.  */
7615   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7616
7617   if (NILP (coding->src_object))
7618     stop = stop_composition = stop_charset = end_pos;
7619   else
7620     {
7621       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7622         stop = stop_composition = pos;
7623       else
7624         stop = stop_composition = end_pos;
7625       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7626         stop = stop_charset = pos;
7627       else
7628         stop_charset = end_pos;
7629     }
7630
7631   /* Compensate for CRLF and conversion.  */
7632   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7633   while (buf < buf_end)
7634     {
7635       Lisp_Object trans;
7636
7637       if (pos == stop)
7638         {
7639           if (pos == end_pos)
7640             break;
7641           if (pos == stop_composition)
7642             buf = handle_composition_annotation (pos, end_pos, coding,
7643                                                  buf, &stop_composition);
7644           if (pos == stop_charset)
7645             buf = handle_charset_annotation (pos, end_pos, coding,
7646                                              buf, &stop_charset);
7647           stop = (stop_composition < stop_charset
7648                   ? stop_composition : stop_charset);
7649         }
7650
7651       if (! multibytep)
7652         {
7653           int bytes;
7654
7655           if (coding->encoder == encode_coding_raw_text
7656               || coding->encoder == encode_coding_ccl)
7657             c = *src++, pos++;
7658           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7659             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7660           else
7661             c = BYTE8_TO_CHAR (*src), src++, pos++;
7662         }
7663       else
7664         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7665       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7666         c = '\n';
7667       if (! EQ (eol_type, Qunix))
7668         {
7669           if (c == '\n')
7670             {
7671               if (EQ (eol_type, Qdos))
7672                 *buf++ = '\r';
7673               else
7674                 c = '\r';
7675             }
7676         }
7677
7678       trans = Qnil;
7679       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7680       if (NILP (trans))
7681         *buf++ = c;
7682       else
7683         {
7684           ptrdiff_t from_nchars = 1, to_nchars = 1;
7685           int *lookup_buf_end;
7686           const unsigned char *p = src;
7687           int i;
7688
7689           lookup_buf[0] = c;
7690           for (i = 1; i < max_lookup && p < src_end; i++)
7691             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7692           lookup_buf_end = lookup_buf + i;
7693           trans = get_translation (trans, lookup_buf, lookup_buf_end,
7694                                    &from_nchars);
7695           if (INTEGERP (trans))
7696             c = XINT (trans);
7697           else if (VECTORP (trans))
7698             {
7699               to_nchars = ASIZE (trans);
7700               if (buf_end - buf < to_nchars)
7701                 break;
7702               c = XINT (AREF (trans, 0));
7703             }
7704           else
7705             break;
7706           *buf++ = c;
7707           for (i = 1; i < to_nchars; i++)
7708             *buf++ = XINT (AREF (trans, i));
7709           for (i = 1; i < from_nchars; i++, pos++)
7710             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7711         }
7712     }
7713
7714   coding->consumed = src - coding->source;
7715   coding->consumed_char = pos - coding->src_pos;
7716   coding->charbuf_used = buf - coding->charbuf;
7717   coding->chars_at_source = 0;
7718 }
7719
7720
7721 /* Encode the text at CODING->src_object into CODING->dst_object.
7722    CODING->src_object is a buffer or a string.
7723    CODING->dst_object is a buffer or nil.
7724
7725    If CODING->src_object is a buffer, it must be the current buffer.
7726    In this case, if CODING->src_pos is positive, it is a position of
7727    the source text in the buffer, otherwise. the source text is in the
7728    gap area of the buffer, and coding->src_pos specifies the offset of
7729    the text from GPT (which must be the same as PT).  If this is the
7730    same buffer as CODING->dst_object, CODING->src_pos must be
7731    negative and CODING should not have `pre-write-conversion'.
7732
7733    If CODING->src_object is a string, CODING should not have
7734    `pre-write-conversion'.
7735
7736    If CODING->dst_object is a buffer, the encoded data is inserted at
7737    the current point of that buffer.
7738
7739    If CODING->dst_object is nil, the encoded data is placed at the
7740    memory area specified by CODING->destination.  */
7741
7742 static void
7743 encode_coding (struct coding_system *coding)
7744 {
7745   Lisp_Object attrs;
7746   Lisp_Object translation_table;
7747   int max_lookup;
7748   struct ccl_spec cclspec;
7749
7750   USE_SAFE_ALLOCA;
7751
7752   attrs = CODING_ID_ATTRS (coding->id);
7753   if (coding->encoder == encode_coding_raw_text)
7754     translation_table = Qnil, max_lookup = 0;
7755   else
7756     translation_table = get_translation_table (attrs, 1, &max_lookup);
7757
7758   if (BUFFERP (coding->dst_object))
7759     {
7760       set_buffer_internal (XBUFFER (coding->dst_object));
7761       coding->dst_multibyte
7762         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7763     }
7764
7765   coding->consumed = coding->consumed_char = 0;
7766   coding->produced = coding->produced_char = 0;
7767   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7768
7769   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7770
7771   if (coding->encoder == encode_coding_ccl)
7772     {
7773       coding->spec.ccl = &cclspec;
7774       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7775     }
7776   do {
7777     coding_set_source (coding);
7778     consume_chars (coding, translation_table, max_lookup);
7779     coding_set_destination (coding);
7780     (*(coding->encoder)) (coding);
7781   } while (coding->consumed_char < coding->src_chars);
7782
7783   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7784     insert_from_gap (coding->produced_char, coding->produced, 0);
7785
7786   SAFE_FREE ();
7787 }
7788
7789
7790 /* Name (or base name) of work buffer for code conversion.  */
7791 static Lisp_Object Vcode_conversion_workbuf_name;
7792
7793 /* A working buffer used by the top level conversion.  Once it is
7794    created, it is never destroyed.  It has the name
7795    Vcode_conversion_workbuf_name.  The other working buffers are
7796    destroyed after the use is finished, and their names are modified
7797    versions of Vcode_conversion_workbuf_name.  */
7798 static Lisp_Object Vcode_conversion_reused_workbuf;
7799
7800 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7801 static bool reused_workbuf_in_use;
7802
7803
7804 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7805    multibyteness of returning buffer.  */
7806
7807 static Lisp_Object
7808 make_conversion_work_buffer (bool multibyte)
7809 {
7810   Lisp_Object name, workbuf;
7811   struct buffer *current;
7812
7813   if (reused_workbuf_in_use)
7814     {
7815       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7816       workbuf = Fget_buffer_create (name);
7817     }
7818   else
7819     {
7820       reused_workbuf_in_use = 1;
7821       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7822         Vcode_conversion_reused_workbuf
7823           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7824       workbuf = Vcode_conversion_reused_workbuf;
7825     }
7826   current = current_buffer;
7827   set_buffer_internal (XBUFFER (workbuf));
7828   /* We can't allow modification hooks to run in the work buffer.  For
7829      instance, directory_files_internal assumes that file decoding
7830      doesn't compile new regexps.  */
7831   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7832   Ferase_buffer ();
7833   bset_undo_list (current_buffer, Qt);
7834   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7835   set_buffer_internal (current);
7836   return workbuf;
7837 }
7838
7839
7840 static void
7841 code_conversion_restore (Lisp_Object arg)
7842 {
7843   Lisp_Object current, workbuf;
7844
7845   current = XCAR (arg);
7846   workbuf = XCDR (arg);
7847   if (! NILP (workbuf))
7848     {
7849       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7850         reused_workbuf_in_use = 0;
7851       else
7852         Fkill_buffer (workbuf);
7853     }
7854   set_buffer_internal (XBUFFER (current));
7855 }
7856
7857 Lisp_Object
7858 code_conversion_save (bool with_work_buf, bool multibyte)
7859 {
7860   Lisp_Object workbuf = Qnil;
7861
7862   if (with_work_buf)
7863     workbuf = make_conversion_work_buffer (multibyte);
7864   record_unwind_protect (code_conversion_restore,
7865                          Fcons (Fcurrent_buffer (), workbuf));
7866   return workbuf;
7867 }
7868
7869 static void
7870 coding_restore_undo_list (Lisp_Object arg)
7871 {
7872   Lisp_Object undo_list = XCAR (arg);
7873   struct buffer *buf = XBUFFER (XCDR (arg));
7874
7875   bset_undo_list (buf, undo_list);
7876 }
7877
7878 void
7879 decode_coding_gap (struct coding_system *coding,
7880                    ptrdiff_t chars, ptrdiff_t bytes)
7881 {
7882   ptrdiff_t count = SPECPDL_INDEX ();
7883   Lisp_Object attrs;
7884
7885   coding->src_object = Fcurrent_buffer ();
7886   coding->src_chars = chars;
7887   coding->src_bytes = bytes;
7888   coding->src_pos = -chars;
7889   coding->src_pos_byte = -bytes;
7890   coding->src_multibyte = chars < bytes;
7891   coding->dst_object = coding->src_object;
7892   coding->dst_pos = PT;
7893   coding->dst_pos_byte = PT_BYTE;
7894   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7895
7896   coding->head_ascii = -1;
7897   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7898   coding->eol_seen = EOL_SEEN_NONE;
7899   if (CODING_REQUIRE_DETECTION (coding))
7900     detect_coding (coding);
7901   attrs = CODING_ID_ATTRS (coding->id);
7902   if (! disable_ascii_optimization
7903       && ! coding->src_multibyte
7904       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7905       && NILP (CODING_ATTR_POST_READ (attrs))
7906       && NILP (get_translation_table (attrs, 0, NULL)))
7907     {
7908       chars = coding->head_ascii;
7909       if (chars < 0)
7910         chars = check_ascii (coding);
7911       if (chars != bytes)
7912         {
7913           /* There exists a non-ASCII byte.  */
7914           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7915               && coding->detected_utf8_bytes == coding->src_bytes)
7916             {
7917               if (coding->detected_utf8_chars >= 0)
7918                 chars = coding->detected_utf8_chars;
7919               else
7920                 chars = check_utf_8 (coding);
7921               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7922                   && coding->head_ascii == 0
7923                   && coding->source[0] == UTF_8_BOM_1
7924                   && coding->source[1] == UTF_8_BOM_2
7925                   && coding->source[2] == UTF_8_BOM_3)
7926                 {
7927                   chars--;
7928                   bytes -= 3;
7929                   coding->src_bytes -= 3;
7930                 }
7931             }
7932           else
7933             chars = -1;
7934         }
7935       if (chars >= 0)
7936         {
7937           Lisp_Object eol_type;
7938
7939           eol_type = CODING_ID_EOL_TYPE (coding->id);
7940           if (VECTORP (eol_type))
7941             {
7942               if (coding->eol_seen != EOL_SEEN_NONE)
7943                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7944             }
7945           if (EQ (eol_type, Qmac))
7946             {
7947               unsigned char *src_end = GAP_END_ADDR;
7948               unsigned char *src = src_end - coding->src_bytes;
7949
7950               while (src < src_end)
7951                 {
7952                   if (*src++ == '\r')
7953                     src[-1] = '\n';
7954                 }
7955             }
7956           else if (EQ (eol_type, Qdos))
7957             {
7958               unsigned char *src = GAP_END_ADDR;
7959               unsigned char *src_beg = src - coding->src_bytes;
7960               unsigned char *dst = src;
7961               ptrdiff_t diff;
7962
7963               while (src_beg < src)
7964                 {
7965                   *--dst = *--src;
7966                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7967                     src--;
7968                 }
7969               diff = dst - src;
7970               bytes -= diff;
7971               chars -= diff;
7972             }
7973           coding->produced = bytes;
7974           coding->produced_char = chars;
7975           insert_from_gap (chars, bytes, 1);
7976           return;
7977         }
7978     }
7979   code_conversion_save (0, 0);
7980
7981   coding->mode |= CODING_MODE_LAST_BLOCK;
7982   current_buffer->text->inhibit_shrinking = 1;
7983   decode_coding (coding);
7984   current_buffer->text->inhibit_shrinking = 0;
7985
7986   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7987     {
7988       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7989       Lisp_Object val;
7990       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
7991       ptrdiff_t count1 = SPECPDL_INDEX ();
7992
7993       record_unwind_protect (coding_restore_undo_list,
7994                              Fcons (undo_list, Fcurrent_buffer ()));
7995       bset_undo_list (current_buffer, Qt);
7996       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7997       val = call1 (CODING_ATTR_POST_READ (attrs),
7998                    make_number (coding->produced_char));
7999       CHECK_NATNUM (val);
8000       coding->produced_char += Z - prev_Z;
8001       coding->produced += Z_BYTE - prev_Z_BYTE;
8002       unbind_to (count1, Qnil);
8003     }
8004
8005   unbind_to (count, Qnil);
8006 }
8007
8008
8009 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8010    SRC_OBJECT into DST_OBJECT by coding context CODING.
8011
8012    SRC_OBJECT is a buffer, a string, or Qnil.
8013
8014    If it is a buffer, the text is at point of the buffer.  FROM and TO
8015    are positions in the buffer.
8016
8017    If it is a string, the text is at the beginning of the string.
8018    FROM and TO are indices to the string.
8019
8020    If it is nil, the text is at coding->source.  FROM and TO are
8021    indices to coding->source.
8022
8023    DST_OBJECT is a buffer, Qt, or Qnil.
8024
8025    If it is a buffer, the decoded text is inserted at point of the
8026    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8027    is deleted.
8028
8029    If it is Qt, a string is made from the decoded text, and
8030    set in CODING->dst_object.
8031
8032    If it is Qnil, the decoded text is stored at CODING->destination.
8033    The caller must allocate CODING->dst_bytes bytes at
8034    CODING->destination by xmalloc.  If the decoded text is longer than
8035    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8036  */
8037
8038 void
8039 decode_coding_object (struct coding_system *coding,
8040                       Lisp_Object src_object,
8041                       ptrdiff_t from, ptrdiff_t from_byte,
8042                       ptrdiff_t to, ptrdiff_t to_byte,
8043                       Lisp_Object dst_object)
8044 {
8045   ptrdiff_t count = SPECPDL_INDEX ();
8046   unsigned char *destination UNINIT;
8047   ptrdiff_t dst_bytes UNINIT;
8048   ptrdiff_t chars = to - from;
8049   ptrdiff_t bytes = to_byte - from_byte;
8050   Lisp_Object attrs;
8051   ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
8052   bool need_marker_adjustment = 0;
8053   Lisp_Object old_deactivate_mark;
8054
8055   old_deactivate_mark = Vdeactivate_mark;
8056
8057   if (NILP (dst_object))
8058     {
8059       destination = coding->destination;
8060       dst_bytes = coding->dst_bytes;
8061     }
8062
8063   coding->src_object = src_object;
8064   coding->src_chars = chars;
8065   coding->src_bytes = bytes;
8066   coding->src_multibyte = chars < bytes;
8067
8068   if (STRINGP (src_object))
8069     {
8070       coding->src_pos = from;
8071       coding->src_pos_byte = from_byte;
8072     }
8073   else if (BUFFERP (src_object))
8074     {
8075       set_buffer_internal (XBUFFER (src_object));
8076       if (from != GPT)
8077         move_gap_both (from, from_byte);
8078       if (EQ (src_object, dst_object))
8079         {
8080           struct Lisp_Marker *tail;
8081
8082           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8083             {
8084               tail->need_adjustment
8085                 = tail->charpos == (tail->insertion_type ? from : to);
8086               need_marker_adjustment |= tail->need_adjustment;
8087             }
8088           saved_pt = PT, saved_pt_byte = PT_BYTE;
8089           TEMP_SET_PT_BOTH (from, from_byte);
8090           current_buffer->text->inhibit_shrinking = 1;
8091           del_range_both (from, from_byte, to, to_byte, 1);
8092           coding->src_pos = -chars;
8093           coding->src_pos_byte = -bytes;
8094         }
8095       else
8096         {
8097           coding->src_pos = from;
8098           coding->src_pos_byte = from_byte;
8099         }
8100     }
8101
8102   if (CODING_REQUIRE_DETECTION (coding))
8103     detect_coding (coding);
8104   attrs = CODING_ID_ATTRS (coding->id);
8105
8106   if (EQ (dst_object, Qt)
8107       || (! NILP (CODING_ATTR_POST_READ (attrs))
8108           && NILP (dst_object)))
8109     {
8110       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8111       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8112       coding->dst_pos = BEG;
8113       coding->dst_pos_byte = BEG_BYTE;
8114     }
8115   else if (BUFFERP (dst_object))
8116     {
8117       code_conversion_save (0, 0);
8118       coding->dst_object = dst_object;
8119       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8120       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8121       coding->dst_multibyte
8122         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8123     }
8124   else
8125     {
8126       code_conversion_save (0, 0);
8127       coding->dst_object = Qnil;
8128       /* Most callers presume this will return a multibyte result, and they
8129          won't use `binary' or `raw-text' anyway, so let's not worry about
8130          CODING_FOR_UNIBYTE.  */
8131       coding->dst_multibyte = 1;
8132     }
8133
8134   decode_coding (coding);
8135
8136   if (BUFFERP (coding->dst_object))
8137     set_buffer_internal (XBUFFER (coding->dst_object));
8138
8139   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8140     {
8141       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8142       Lisp_Object val;
8143       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
8144       ptrdiff_t count1 = SPECPDL_INDEX ();
8145
8146       record_unwind_protect (coding_restore_undo_list,
8147                              Fcons (undo_list, Fcurrent_buffer ()));
8148       bset_undo_list (current_buffer, Qt);
8149       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8150       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8151                         make_number (coding->produced_char));
8152       CHECK_NATNUM (val);
8153       coding->produced_char += Z - prev_Z;
8154       coding->produced += Z_BYTE - prev_Z_BYTE;
8155       unbind_to (count1, Qnil);
8156     }
8157
8158   if (EQ (dst_object, Qt))
8159     {
8160       coding->dst_object = Fbuffer_string ();
8161     }
8162   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8163     {
8164       set_buffer_internal (XBUFFER (coding->dst_object));
8165       if (dst_bytes < coding->produced)
8166         {
8167           eassert (coding->produced > 0);
8168           destination = xrealloc (destination, coding->produced);
8169           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8170             move_gap_both (BEGV, BEGV_BYTE);
8171           memcpy (destination, BEGV_ADDR, coding->produced);
8172           coding->destination = destination;
8173         }
8174     }
8175
8176   if (saved_pt >= 0)
8177     {
8178       /* This is the case of:
8179          (BUFFERP (src_object) && EQ (src_object, dst_object))
8180          As we have moved PT while replacing the original buffer
8181          contents, we must recover it now.  */
8182       set_buffer_internal (XBUFFER (src_object));
8183       current_buffer->text->inhibit_shrinking = 0;
8184       if (saved_pt < from)
8185         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8186       else if (saved_pt < from + chars)
8187         TEMP_SET_PT_BOTH (from, from_byte);
8188       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8189         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8190                           saved_pt_byte + (coding->produced - bytes));
8191       else
8192         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8193                           saved_pt_byte + (coding->produced - bytes));
8194
8195       if (need_marker_adjustment)
8196         {
8197           struct Lisp_Marker *tail;
8198
8199           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8200             if (tail->need_adjustment)
8201               {
8202                 tail->need_adjustment = 0;
8203                 if (tail->insertion_type)
8204                   {
8205                     tail->bytepos = from_byte;
8206                     tail->charpos = from;
8207                   }
8208                 else
8209                   {
8210                     tail->bytepos = from_byte + coding->produced;
8211                     tail->charpos
8212                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8213                          ? tail->bytepos : from + coding->produced_char);
8214                   }
8215               }
8216         }
8217     }
8218
8219   Vdeactivate_mark = old_deactivate_mark;
8220   unbind_to (count, coding->dst_object);
8221 }
8222
8223
8224 void
8225 encode_coding_object (struct coding_system *coding,
8226                       Lisp_Object src_object,
8227                       ptrdiff_t from, ptrdiff_t from_byte,
8228                       ptrdiff_t to, ptrdiff_t to_byte,
8229                       Lisp_Object dst_object)
8230 {
8231   ptrdiff_t count = SPECPDL_INDEX ();
8232   ptrdiff_t chars = to - from;
8233   ptrdiff_t bytes = to_byte - from_byte;
8234   Lisp_Object attrs;
8235   ptrdiff_t saved_pt = -1, saved_pt_byte;
8236   bool need_marker_adjustment = 0;
8237   bool kill_src_buffer = 0;
8238   Lisp_Object old_deactivate_mark;
8239
8240   old_deactivate_mark = Vdeactivate_mark;
8241
8242   coding->src_object = src_object;
8243   coding->src_chars = chars;
8244   coding->src_bytes = bytes;
8245   coding->src_multibyte = chars < bytes;
8246
8247   attrs = CODING_ID_ATTRS (coding->id);
8248
8249   if (EQ (src_object, dst_object))
8250     {
8251       struct Lisp_Marker *tail;
8252
8253       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8254         {
8255           tail->need_adjustment
8256             = tail->charpos == (tail->insertion_type ? from : to);
8257           need_marker_adjustment |= tail->need_adjustment;
8258         }
8259     }
8260
8261   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8262     {
8263       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8264       set_buffer_internal (XBUFFER (coding->src_object));
8265       if (STRINGP (src_object))
8266         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8267       else if (BUFFERP (src_object))
8268         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8269       else
8270         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8271
8272       if (EQ (src_object, dst_object))
8273         {
8274           set_buffer_internal (XBUFFER (src_object));
8275           saved_pt = PT, saved_pt_byte = PT_BYTE;
8276           del_range_both (from, from_byte, to, to_byte, 1);
8277           set_buffer_internal (XBUFFER (coding->src_object));
8278         }
8279
8280       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8281                   make_number (BEG), make_number (Z));
8282       if (XBUFFER (coding->src_object) != current_buffer)
8283         kill_src_buffer = 1;
8284       coding->src_object = Fcurrent_buffer ();
8285       if (BEG != GPT)
8286         move_gap_both (BEG, BEG_BYTE);
8287       coding->src_chars = Z - BEG;
8288       coding->src_bytes = Z_BYTE - BEG_BYTE;
8289       coding->src_pos = BEG;
8290       coding->src_pos_byte = BEG_BYTE;
8291       coding->src_multibyte = Z < Z_BYTE;
8292     }
8293   else if (STRINGP (src_object))
8294     {
8295       code_conversion_save (0, 0);
8296       coding->src_pos = from;
8297       coding->src_pos_byte = from_byte;
8298     }
8299   else if (BUFFERP (src_object))
8300     {
8301       code_conversion_save (0, 0);
8302       set_buffer_internal (XBUFFER (src_object));
8303       if (EQ (src_object, dst_object))
8304         {
8305           saved_pt = PT, saved_pt_byte = PT_BYTE;
8306           coding->src_object = del_range_1 (from, to, 1, 1);
8307           coding->src_pos = 0;
8308           coding->src_pos_byte = 0;
8309         }
8310       else
8311         {
8312           if (from < GPT && to >= GPT)
8313             move_gap_both (from, from_byte);
8314           coding->src_pos = from;
8315           coding->src_pos_byte = from_byte;
8316         }
8317     }
8318   else
8319     {
8320       code_conversion_save (0, 0);
8321       coding->src_pos = from;
8322       coding->src_pos_byte = from_byte;
8323     }
8324
8325   if (BUFFERP (dst_object))
8326     {
8327       coding->dst_object = dst_object;
8328       if (EQ (src_object, dst_object))
8329         {
8330           coding->dst_pos = from;
8331           coding->dst_pos_byte = from_byte;
8332         }
8333       else
8334         {
8335           struct buffer *current = current_buffer;
8336
8337           set_buffer_temp (XBUFFER (dst_object));
8338           coding->dst_pos = PT;
8339           coding->dst_pos_byte = PT_BYTE;
8340           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8341           set_buffer_temp (current);
8342         }
8343       coding->dst_multibyte
8344         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8345     }
8346   else if (EQ (dst_object, Qt))
8347     {
8348       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8349       coding->dst_object = Qnil;
8350       coding->destination = xmalloc (dst_bytes);
8351       coding->dst_bytes = dst_bytes;
8352       coding->dst_multibyte = 0;
8353     }
8354   else
8355     {
8356       coding->dst_object = Qnil;
8357       coding->dst_multibyte = 0;
8358     }
8359
8360   encode_coding (coding);
8361
8362   if (EQ (dst_object, Qt))
8363     {
8364       if (BUFFERP (coding->dst_object))
8365         coding->dst_object = Fbuffer_string ();
8366       else if (coding->raw_destination)
8367         /* This is used to avoid creating huge Lisp string.
8368            NOTE: caller who sets `raw_destination' is also
8369            responsible for freeing `destination' buffer.  */
8370         coding->dst_object = Qnil;
8371       else
8372         {
8373           coding->dst_object
8374             = make_unibyte_string ((char *) coding->destination,
8375                                    coding->produced);
8376           xfree (coding->destination);
8377         }
8378     }
8379
8380   if (saved_pt >= 0)
8381     {
8382       /* This is the case of:
8383          (BUFFERP (src_object) && EQ (src_object, dst_object))
8384          As we have moved PT while replacing the original buffer
8385          contents, we must recover it now.  */
8386       set_buffer_internal (XBUFFER (src_object));
8387       if (saved_pt < from)
8388         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8389       else if (saved_pt < from + chars)
8390         TEMP_SET_PT_BOTH (from, from_byte);
8391       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8392         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8393                           saved_pt_byte + (coding->produced - bytes));
8394       else
8395         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8396                           saved_pt_byte + (coding->produced - bytes));
8397
8398       if (need_marker_adjustment)
8399         {
8400           struct Lisp_Marker *tail;
8401
8402           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8403             if (tail->need_adjustment)
8404               {
8405                 tail->need_adjustment = 0;
8406                 if (tail->insertion_type)
8407                   {
8408                     tail->bytepos = from_byte;
8409                     tail->charpos = from;
8410                   }
8411                 else
8412                   {
8413                     tail->bytepos = from_byte + coding->produced;
8414                     tail->charpos
8415                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8416                          ? tail->bytepos : from + coding->produced_char);
8417                   }
8418               }
8419         }
8420     }
8421
8422   if (kill_src_buffer)
8423     Fkill_buffer (coding->src_object);
8424
8425   Vdeactivate_mark = old_deactivate_mark;
8426   unbind_to (count, Qnil);
8427 }
8428
8429
8430 Lisp_Object
8431 preferred_coding_system (void)
8432 {
8433   int id = coding_categories[coding_priorities[0]].id;
8434
8435   return CODING_ID_NAME (id);
8436 }
8437
8438 #if defined (WINDOWSNT) || defined (CYGWIN)
8439
8440 Lisp_Object
8441 from_unicode (Lisp_Object str)
8442 {
8443   CHECK_STRING (str);
8444   if (!STRING_MULTIBYTE (str) &&
8445       SBYTES (str) & 1)
8446     {
8447       str = Fsubstring (str, make_number (0), make_number (-1));
8448     }
8449
8450   return code_convert_string_norecord (str, Qutf_16le, 0);
8451 }
8452
8453 Lisp_Object
8454 from_unicode_buffer (const wchar_t *wstr)
8455 {
8456   /* We get one of the two final null bytes for free.  */
8457   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
8458   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
8459   return from_unicode (str);
8460 }
8461
8462 wchar_t *
8463 to_unicode (Lisp_Object str, Lisp_Object *buf)
8464 {
8465   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8466   /* We need to make another copy (in addition to the one made by
8467      code_convert_string_norecord) to ensure that the final string is
8468      _doubly_ zero terminated --- that is, that the string is
8469      terminated by two zero bytes and one utf-16le null character.
8470      Because strings are already terminated with a single zero byte,
8471      we just add one additional zero. */
8472   str = make_uninit_string (SBYTES (*buf) + 1);
8473   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8474   SDATA (str) [SBYTES (*buf)] = '\0';
8475   *buf = str;
8476   return WCSDATA (*buf);
8477 }
8478
8479 #endif /* WINDOWSNT || CYGWIN */
8480
8481 \f
8482 #ifdef emacs
8483 /*** 11. Emacs Lisp library functions ***/
8484
8485 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8486        doc: /* Return t if OBJECT is nil or a coding-system.
8487 See the documentation of `define-coding-system' for information
8488 about coding-system objects.  */)
8489   (Lisp_Object object)
8490 {
8491   if (NILP (object)
8492       || CODING_SYSTEM_ID (object) >= 0)
8493     return Qt;
8494   if (! SYMBOLP (object)
8495       || NILP (Fget (object, Qcoding_system_define_form)))
8496     return Qnil;
8497   return Qt;
8498 }
8499
8500 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8501        Sread_non_nil_coding_system, 1, 1, 0,
8502        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8503   (Lisp_Object prompt)
8504 {
8505   Lisp_Object val;
8506   do
8507     {
8508       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8509                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8510     }
8511   while (SCHARS (val) == 0);
8512   return (Fintern (val, Qnil));
8513 }
8514
8515 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8516        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8517 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8518 Ignores case when completing coding systems (all Emacs coding systems
8519 are lower-case).  */)
8520   (Lisp_Object prompt, Lisp_Object default_coding_system)
8521 {
8522   Lisp_Object val;
8523   ptrdiff_t count = SPECPDL_INDEX ();
8524
8525   if (SYMBOLP (default_coding_system))
8526     default_coding_system = SYMBOL_NAME (default_coding_system);
8527   specbind (Qcompletion_ignore_case, Qt);
8528   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8529                           Qt, Qnil, Qcoding_system_history,
8530                           default_coding_system, Qnil);
8531   unbind_to (count, Qnil);
8532   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8533 }
8534
8535 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8536        1, 1, 0,
8537        doc: /* Check validity of CODING-SYSTEM.
8538 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8539 It is valid if it is nil or a symbol defined as a coding system by the
8540 function `define-coding-system'.  */)
8541   (Lisp_Object coding_system)
8542 {
8543   Lisp_Object define_form;
8544
8545   define_form = Fget (coding_system, Qcoding_system_define_form);
8546   if (! NILP (define_form))
8547     {
8548       Fput (coding_system, Qcoding_system_define_form, Qnil);
8549       safe_eval (define_form);
8550     }
8551   if (!NILP (Fcoding_system_p (coding_system)))
8552     return coding_system;
8553   xsignal1 (Qcoding_system_error, coding_system);
8554 }
8555
8556 \f
8557 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8558    HIGHEST, return the coding system of the highest
8559    priority among the detected coding systems.  Otherwise return a
8560    list of detected coding systems sorted by their priorities.  If
8561    MULTIBYTEP, it is assumed that the bytes are in correct
8562    multibyte form but contain only ASCII and eight-bit chars.
8563    Otherwise, the bytes are raw bytes.
8564
8565    CODING-SYSTEM controls the detection as below:
8566
8567    If it is nil, detect both text-format and eol-format.  If the
8568    text-format part of CODING-SYSTEM is already specified
8569    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8570    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8571    detect only text-format.  */
8572
8573 Lisp_Object
8574 detect_coding_system (const unsigned char *src,
8575                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8576                       bool highest, bool multibytep,
8577                       Lisp_Object coding_system)
8578 {
8579   const unsigned char *src_end = src + src_bytes;
8580   Lisp_Object attrs, eol_type;
8581   Lisp_Object val = Qnil;
8582   struct coding_system coding;
8583   ptrdiff_t id;
8584   struct coding_detection_info detect_info;
8585   enum coding_category base_category;
8586   bool null_byte_found = 0, eight_bit_found = 0;
8587
8588   if (NILP (coding_system))
8589     coding_system = Qundecided;
8590   setup_coding_system (coding_system, &coding);
8591   attrs = CODING_ID_ATTRS (coding.id);
8592   eol_type = CODING_ID_EOL_TYPE (coding.id);
8593   coding_system = CODING_ATTR_BASE_NAME (attrs);
8594
8595   coding.source = src;
8596   coding.src_chars = src_chars;
8597   coding.src_bytes = src_bytes;
8598   coding.src_multibyte = multibytep;
8599   coding.consumed = 0;
8600   coding.mode |= CODING_MODE_LAST_BLOCK;
8601   coding.head_ascii = 0;
8602
8603   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8604
8605   /* At first, detect text-format if necessary.  */
8606   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8607   if (base_category == coding_category_undecided)
8608     {
8609       enum coding_category category UNINIT;
8610       struct coding_system *this UNINIT;
8611       int c, i;
8612       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8613                                        inhibit_null_byte_detection);
8614       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8615                                        inhibit_iso_escape_detection);
8616       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8617
8618       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8619       for (; src < src_end; src++)
8620         {
8621           c = *src;
8622           if (c & 0x80)
8623             {
8624               eight_bit_found = 1;
8625               if (null_byte_found)
8626                 break;
8627             }
8628           else if (c < 0x20)
8629             {
8630               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8631                   && ! inhibit_ied
8632                   && ! detect_info.checked)
8633                 {
8634                   if (detect_coding_iso_2022 (&coding, &detect_info))
8635                     {
8636                       /* We have scanned the whole data.  */
8637                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8638                         {
8639                           /* We didn't find an 8-bit code.  We may
8640                              have found a null-byte, but it's very
8641                              rare that a binary file confirm to
8642                              ISO-2022.  */
8643                           src = src_end;
8644                           coding.head_ascii = src - coding.source;
8645                         }
8646                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8647                       break;
8648                     }
8649                 }
8650               else if (! c && !inhibit_nbd)
8651                 {
8652                   null_byte_found = 1;
8653                   if (eight_bit_found)
8654                     break;
8655                 }
8656               if (! eight_bit_found)
8657                 coding.head_ascii++;
8658             }
8659           else if (! eight_bit_found)
8660             coding.head_ascii++;
8661         }
8662
8663       if (null_byte_found || eight_bit_found
8664           || coding.head_ascii < coding.src_bytes
8665           || detect_info.found)
8666         {
8667           if (coding.head_ascii == coding.src_bytes)
8668             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8669             for (i = 0; i < coding_category_raw_text; i++)
8670               {
8671                 category = coding_priorities[i];
8672                 this = coding_categories + category;
8673                 if (detect_info.found & (1 << category))
8674                   break;
8675               }
8676           else
8677             {
8678               if (null_byte_found)
8679                 {
8680                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8681                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8682                 }
8683               else if (prefer_utf_8
8684                        && detect_coding_utf_8 (&coding, &detect_info))
8685                 {
8686                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8687                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8688                 }
8689               for (i = 0; i < coding_category_raw_text; i++)
8690                 {
8691                   category = coding_priorities[i];
8692                   this = coding_categories + category;
8693
8694                   if (this->id < 0)
8695                     {
8696                       /* No coding system of this category is defined.  */
8697                       detect_info.rejected |= (1 << category);
8698                     }
8699                   else if (category >= coding_category_raw_text)
8700                     continue;
8701                   else if (detect_info.checked & (1 << category))
8702                     {
8703                       if (highest
8704                           && (detect_info.found & (1 << category)))
8705                         break;
8706                     }
8707                   else if ((*(this->detector)) (&coding, &detect_info)
8708                            && highest
8709                            && (detect_info.found & (1 << category)))
8710                     {
8711                       if (category == coding_category_utf_16_auto)
8712                         {
8713                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8714                             category = coding_category_utf_16_le;
8715                           else
8716                             category = coding_category_utf_16_be;
8717                         }
8718                       break;
8719                     }
8720                 }
8721             }
8722         }
8723
8724       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8725           || null_byte_found)
8726         {
8727           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8728           id = CODING_SYSTEM_ID (Qno_conversion);
8729           val = list1 (make_number (id));
8730         }
8731       else if (! detect_info.rejected && ! detect_info.found)
8732         {
8733           detect_info.found = CATEGORY_MASK_ANY;
8734           id = coding_categories[coding_category_undecided].id;
8735           val = list1 (make_number (id));
8736         }
8737       else if (highest)
8738         {
8739           if (detect_info.found)
8740             {
8741               detect_info.found = 1 << category;
8742               val = list1 (make_number (this->id));
8743             }
8744           else
8745             for (i = 0; i < coding_category_raw_text; i++)
8746               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8747                 {
8748                   detect_info.found = 1 << coding_priorities[i];
8749                   id = coding_categories[coding_priorities[i]].id;
8750                   val = list1 (make_number (id));
8751                   break;
8752                 }
8753         }
8754       else
8755         {
8756           int mask = detect_info.rejected | detect_info.found;
8757           int found = 0;
8758
8759           for (i = coding_category_raw_text - 1; i >= 0; i--)
8760             {
8761               category = coding_priorities[i];
8762               if (! (mask & (1 << category)))
8763                 {
8764                   found |= 1 << category;
8765                   id = coding_categories[category].id;
8766                   if (id >= 0)
8767                     val = list1 (make_number (id));
8768                 }
8769             }
8770           for (i = coding_category_raw_text - 1; i >= 0; i--)
8771             {
8772               category = coding_priorities[i];
8773               if (detect_info.found & (1 << category))
8774                 {
8775                   id = coding_categories[category].id;
8776                   val = Fcons (make_number (id), val);
8777                 }
8778             }
8779           detect_info.found |= found;
8780         }
8781     }
8782   else if (base_category == coding_category_utf_8_auto)
8783     {
8784       if (detect_coding_utf_8 (&coding, &detect_info))
8785         {
8786           struct coding_system *this;
8787
8788           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8789             this = coding_categories + coding_category_utf_8_sig;
8790           else
8791             this = coding_categories + coding_category_utf_8_nosig;
8792           val = list1 (make_number (this->id));
8793         }
8794     }
8795   else if (base_category == coding_category_utf_16_auto)
8796     {
8797       if (detect_coding_utf_16 (&coding, &detect_info))
8798         {
8799           struct coding_system *this;
8800
8801           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8802             this = coding_categories + coding_category_utf_16_le;
8803           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8804             this = coding_categories + coding_category_utf_16_be;
8805           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8806             this = coding_categories + coding_category_utf_16_be_nosig;
8807           else
8808             this = coding_categories + coding_category_utf_16_le_nosig;
8809           val = list1 (make_number (this->id));
8810         }
8811     }
8812   else
8813     {
8814       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8815       val = list1 (make_number (coding.id));
8816     }
8817
8818   /* Then, detect eol-format if necessary.  */
8819   {
8820     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8821     Lisp_Object tail;
8822
8823     if (VECTORP (eol_type))
8824       {
8825         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8826           {
8827             if (null_byte_found)
8828               normal_eol = EOL_SEEN_LF;
8829             else
8830               normal_eol = detect_eol (coding.source, src_bytes,
8831                                        coding_category_raw_text);
8832           }
8833         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8834                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8835           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8836                                       coding_category_utf_16_be);
8837         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8838                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8839           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8840                                       coding_category_utf_16_le);
8841       }
8842     else
8843       {
8844         if (EQ (eol_type, Qunix))
8845           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8846         else if (EQ (eol_type, Qdos))
8847           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8848         else
8849           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8850       }
8851
8852     for (tail = val; CONSP (tail); tail = XCDR (tail))
8853       {
8854         enum coding_category category;
8855         int this_eol;
8856
8857         id = XINT (XCAR (tail));
8858         attrs = CODING_ID_ATTRS (id);
8859         category = XINT (CODING_ATTR_CATEGORY (attrs));
8860         eol_type = CODING_ID_EOL_TYPE (id);
8861         if (VECTORP (eol_type))
8862           {
8863             if (category == coding_category_utf_16_be
8864                 || category == coding_category_utf_16_be_nosig)
8865               this_eol = utf_16_be_eol;
8866             else if (category == coding_category_utf_16_le
8867                      || category == coding_category_utf_16_le_nosig)
8868               this_eol = utf_16_le_eol;
8869             else
8870               this_eol = normal_eol;
8871
8872             if (this_eol == EOL_SEEN_LF)
8873               XSETCAR (tail, AREF (eol_type, 0));
8874             else if (this_eol == EOL_SEEN_CRLF)
8875               XSETCAR (tail, AREF (eol_type, 1));
8876             else if (this_eol == EOL_SEEN_CR)
8877               XSETCAR (tail, AREF (eol_type, 2));
8878             else
8879               XSETCAR (tail, CODING_ID_NAME (id));
8880           }
8881         else
8882           XSETCAR (tail, CODING_ID_NAME (id));
8883       }
8884   }
8885
8886   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8887 }
8888
8889
8890 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8891        2, 3, 0,
8892        doc: /* Detect coding system of the text in the region between START and END.
8893 Return a list of possible coding systems ordered by priority.
8894 The coding systems to try and their priorities follows what
8895 the function `coding-system-priority-list' (which see) returns.
8896
8897 If only ASCII characters are found (except for such ISO-2022 control
8898 characters as ESC), it returns a list of single element `undecided'
8899 or its subsidiary coding system according to a detected end-of-line
8900 format.
8901
8902 If optional argument HIGHEST is non-nil, return the coding system of
8903 highest priority.  */)
8904   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8905 {
8906   ptrdiff_t from, to;
8907   ptrdiff_t from_byte, to_byte;
8908
8909   validate_region (&start, &end);
8910   from = XINT (start), to = XINT (end);
8911   from_byte = CHAR_TO_BYTE (from);
8912   to_byte = CHAR_TO_BYTE (to);
8913
8914   if (from < GPT && to >= GPT)
8915     move_gap_both (to, to_byte);
8916
8917   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8918                                to - from, to_byte - from_byte,
8919                                !NILP (highest),
8920                                !NILP (BVAR (current_buffer
8921                                       , enable_multibyte_characters)),
8922                                Qnil);
8923 }
8924
8925 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8926        1, 2, 0,
8927        doc: /* Detect coding system of the text in STRING.
8928 Return a list of possible coding systems ordered by priority.
8929 The coding systems to try and their priorities follows what
8930 the function `coding-system-priority-list' (which see) returns.
8931
8932 If only ASCII characters are found (except for such ISO-2022 control
8933 characters as ESC), it returns a list of single element `undecided'
8934 or its subsidiary coding system according to a detected end-of-line
8935 format.
8936
8937 If optional argument HIGHEST is non-nil, return the coding system of
8938 highest priority.  */)
8939   (Lisp_Object string, Lisp_Object highest)
8940 {
8941   CHECK_STRING (string);
8942
8943   return detect_coding_system (SDATA (string),
8944                                SCHARS (string), SBYTES (string),
8945                                !NILP (highest), STRING_MULTIBYTE (string),
8946                                Qnil);
8947 }
8948
8949
8950 static bool
8951 char_encodable_p (int c, Lisp_Object attrs)
8952 {
8953   Lisp_Object tail;
8954   struct charset *charset;
8955   Lisp_Object translation_table;
8956
8957   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8958   if (! NILP (translation_table))
8959     c = translate_char (translation_table, c);
8960   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8961        CONSP (tail); tail = XCDR (tail))
8962     {
8963       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8964       if (CHAR_CHARSET_P (c, charset))
8965         break;
8966     }
8967   return (! NILP (tail));
8968 }
8969
8970
8971 /* Return a list of coding systems that safely encode the text between
8972    START and END.  If EXCLUDE is non-nil, it is a list of coding
8973    systems not to check.  The returned list doesn't contain any such
8974    coding systems.  In any case, if the text contains only ASCII or is
8975    unibyte, return t.  */
8976
8977 DEFUN ("find-coding-systems-region-internal",
8978        Ffind_coding_systems_region_internal,
8979        Sfind_coding_systems_region_internal, 2, 3, 0,
8980        doc: /* Internal use only.  */)
8981   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8982 {
8983   Lisp_Object coding_attrs_list, safe_codings;
8984   ptrdiff_t start_byte, end_byte;
8985   const unsigned char *p, *pbeg, *pend;
8986   int c;
8987   Lisp_Object tail, elt, work_table;
8988
8989   if (STRINGP (start))
8990     {
8991       if (!STRING_MULTIBYTE (start)
8992           || SCHARS (start) == SBYTES (start))
8993         return Qt;
8994       start_byte = 0;
8995       end_byte = SBYTES (start);
8996     }
8997   else
8998     {
8999       CHECK_NUMBER_COERCE_MARKER (start);
9000       CHECK_NUMBER_COERCE_MARKER (end);
9001       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9002         args_out_of_range (start, end);
9003       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9004         return Qt;
9005       start_byte = CHAR_TO_BYTE (XINT (start));
9006       end_byte = CHAR_TO_BYTE (XINT (end));
9007       if (XINT (end) - XINT (start) == end_byte - start_byte)
9008         return Qt;
9009
9010       if (XINT (start) < GPT && XINT (end) > GPT)
9011         {
9012           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9013             move_gap_both (XINT (start), start_byte);
9014           else
9015             move_gap_both (XINT (end), end_byte);
9016         }
9017     }
9018
9019   coding_attrs_list = Qnil;
9020   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9021     if (NILP (exclude)
9022         || NILP (Fmemq (XCAR (tail), exclude)))
9023       {
9024         Lisp_Object attrs;
9025
9026         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9027         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9028           {
9029             ASET (attrs, coding_attr_trans_tbl,
9030                   get_translation_table (attrs, 1, NULL));
9031             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9032           }
9033       }
9034
9035   if (STRINGP (start))
9036     p = pbeg = SDATA (start);
9037   else
9038     p = pbeg = BYTE_POS_ADDR (start_byte);
9039   pend = p + (end_byte - start_byte);
9040
9041   while (p < pend && ASCII_CHAR_P (*p)) p++;
9042   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9043
9044   work_table = Fmake_char_table (Qnil, Qnil);
9045   while (p < pend)
9046     {
9047       if (ASCII_CHAR_P (*p))
9048         p++;
9049       else
9050         {
9051           c = STRING_CHAR_ADVANCE (p);
9052           if (!NILP (char_table_ref (work_table, c)))
9053             /* This character was already checked.  Ignore it.  */
9054             continue;
9055
9056           charset_map_loaded = 0;
9057           for (tail = coding_attrs_list; CONSP (tail);)
9058             {
9059               elt = XCAR (tail);
9060               if (NILP (elt))
9061                 tail = XCDR (tail);
9062               else if (char_encodable_p (c, elt))
9063                 tail = XCDR (tail);
9064               else if (CONSP (XCDR (tail)))
9065                 {
9066                   XSETCAR (tail, XCAR (XCDR (tail)));
9067                   XSETCDR (tail, XCDR (XCDR (tail)));
9068                 }
9069               else
9070                 {
9071                   XSETCAR (tail, Qnil);
9072                   tail = XCDR (tail);
9073                 }
9074             }
9075           if (charset_map_loaded)
9076             {
9077               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9078
9079               if (STRINGP (start))
9080                 pbeg = SDATA (start);
9081               else
9082                 pbeg = BYTE_POS_ADDR (start_byte);
9083               p = pbeg + p_offset;
9084               pend = pbeg + pend_offset;
9085             }
9086           char_table_set (work_table, c, Qt);
9087         }
9088     }
9089
9090   safe_codings = list2 (Qraw_text, Qno_conversion);
9091   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9092     if (! NILP (XCAR (tail)))
9093       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9094
9095   return safe_codings;
9096 }
9097
9098
9099 DEFUN ("unencodable-char-position", Funencodable_char_position,
9100        Sunencodable_char_position, 3, 5, 0,
9101        doc: /* Return position of first un-encodable character in a region.
9102 START and END specify the region and CODING-SYSTEM specifies the
9103 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9104
9105 If optional 4th argument COUNT is non-nil, it specifies at most how
9106 many un-encodable characters to search.  In this case, the value is a
9107 list of positions.
9108
9109 If optional 5th argument STRING is non-nil, it is a string to search
9110 for un-encodable characters.  In that case, START and END are indexes
9111 to the string and treated as in `substring'.  */)
9112   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9113    Lisp_Object count, Lisp_Object string)
9114 {
9115   EMACS_INT n;
9116   struct coding_system coding;
9117   Lisp_Object attrs, charset_list, translation_table;
9118   Lisp_Object positions;
9119   ptrdiff_t from, to;
9120   const unsigned char *p, *stop, *pend;
9121   bool ascii_compatible;
9122
9123   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9124   attrs = CODING_ID_ATTRS (coding.id);
9125   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9126     return Qnil;
9127   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9128   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9129   translation_table = get_translation_table (attrs, 1, NULL);
9130
9131   if (NILP (string))
9132     {
9133       validate_region (&start, &end);
9134       from = XINT (start);
9135       to = XINT (end);
9136       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9137           || (ascii_compatible
9138               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9139         return Qnil;
9140       p = CHAR_POS_ADDR (from);
9141       pend = CHAR_POS_ADDR (to);
9142       if (from < GPT && to >= GPT)
9143         stop = GPT_ADDR;
9144       else
9145         stop = pend;
9146     }
9147   else
9148     {
9149       CHECK_STRING (string);
9150       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9151       if (! STRING_MULTIBYTE (string))
9152         return Qnil;
9153       p = SDATA (string) + string_char_to_byte (string, from);
9154       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9155       if (ascii_compatible && (to - from) == (pend - p))
9156         return Qnil;
9157     }
9158
9159   if (NILP (count))
9160     n = 1;
9161   else
9162     {
9163       CHECK_NATNUM (count);
9164       n = XINT (count);
9165     }
9166
9167   positions = Qnil;
9168   charset_map_loaded = 0;
9169   while (1)
9170     {
9171       int c;
9172
9173       if (ascii_compatible)
9174         while (p < stop && ASCII_CHAR_P (*p))
9175           p++, from++;
9176       if (p >= stop)
9177         {
9178           if (p >= pend)
9179             break;
9180           stop = pend;
9181           p = GAP_END_ADDR;
9182         }
9183
9184       c = STRING_CHAR_ADVANCE (p);
9185       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9186           && ! char_charset (translate_char (translation_table, c),
9187                              charset_list, NULL))
9188         {
9189           positions = Fcons (make_number (from), positions);
9190           n--;
9191           if (n == 0)
9192             break;
9193         }
9194
9195       from++;
9196       if (charset_map_loaded && NILP (string))
9197         {
9198           p = CHAR_POS_ADDR (from);
9199           pend = CHAR_POS_ADDR (to);
9200           if (from < GPT && to >= GPT)
9201             stop = GPT_ADDR;
9202           else
9203             stop = pend;
9204           charset_map_loaded = 0;
9205         }
9206     }
9207
9208   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9209 }
9210
9211
9212 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9213        Scheck_coding_systems_region, 3, 3, 0,
9214        doc: /* Check if the region is encodable by coding systems.
9215
9216 START and END are buffer positions specifying the region.
9217 CODING-SYSTEM-LIST is a list of coding systems to check.
9218
9219 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9220 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9221 whole region, POS0, POS1, ... are buffer positions where non-encodable
9222 characters are found.
9223
9224 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9225 value is nil.
9226
9227 START may be a string.  In that case, check if the string is
9228 encodable, and the value contains indices to the string instead of
9229 buffer positions.  END is ignored.
9230
9231 If the current buffer (or START if it is a string) is unibyte, the value
9232 is nil.  */)
9233   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9234 {
9235   Lisp_Object list;
9236   ptrdiff_t start_byte, end_byte;
9237   ptrdiff_t pos;
9238   const unsigned char *p, *pbeg, *pend;
9239   int c;
9240   Lisp_Object tail, elt, attrs;
9241
9242   if (STRINGP (start))
9243     {
9244       if (!STRING_MULTIBYTE (start)
9245           || SCHARS (start) == SBYTES (start))
9246         return Qnil;
9247       start_byte = 0;
9248       end_byte = SBYTES (start);
9249       pos = 0;
9250     }
9251   else
9252     {
9253       CHECK_NUMBER_COERCE_MARKER (start);
9254       CHECK_NUMBER_COERCE_MARKER (end);
9255       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9256         args_out_of_range (start, end);
9257       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9258         return Qnil;
9259       start_byte = CHAR_TO_BYTE (XINT (start));
9260       end_byte = CHAR_TO_BYTE (XINT (end));
9261       if (XINT (end) - XINT (start) == end_byte - start_byte)
9262         return Qnil;
9263
9264       if (XINT (start) < GPT && XINT (end) > GPT)
9265         {
9266           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9267             move_gap_both (XINT (start), start_byte);
9268           else
9269             move_gap_both (XINT (end), end_byte);
9270         }
9271       pos = XINT (start);
9272     }
9273
9274   list = Qnil;
9275   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9276     {
9277       elt = XCAR (tail);
9278       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9279       ASET (attrs, coding_attr_trans_tbl,
9280             get_translation_table (attrs, 1, NULL));
9281       list = Fcons (list2 (elt, attrs), list);
9282     }
9283
9284   if (STRINGP (start))
9285     p = pbeg = SDATA (start);
9286   else
9287     p = pbeg = BYTE_POS_ADDR (start_byte);
9288   pend = p + (end_byte - start_byte);
9289
9290   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9291   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9292
9293   while (p < pend)
9294     {
9295       if (ASCII_CHAR_P (*p))
9296         p++;
9297       else
9298         {
9299           c = STRING_CHAR_ADVANCE (p);
9300
9301           charset_map_loaded = 0;
9302           for (tail = list; CONSP (tail); tail = XCDR (tail))
9303             {
9304               elt = XCDR (XCAR (tail));
9305               if (! char_encodable_p (c, XCAR (elt)))
9306                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9307             }
9308           if (charset_map_loaded)
9309             {
9310               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9311
9312               if (STRINGP (start))
9313                 pbeg = SDATA (start);
9314               else
9315                 pbeg = BYTE_POS_ADDR (start_byte);
9316               p = pbeg + p_offset;
9317               pend = pbeg + pend_offset;
9318             }
9319         }
9320       pos++;
9321     }
9322
9323   tail = list;
9324   list = Qnil;
9325   for (; CONSP (tail); tail = XCDR (tail))
9326     {
9327       elt = XCAR (tail);
9328       if (CONSP (XCDR (XCDR (elt))))
9329         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9330                       list);
9331     }
9332
9333   return list;
9334 }
9335
9336
9337 static Lisp_Object
9338 code_convert_region (Lisp_Object start, Lisp_Object end,
9339                      Lisp_Object coding_system, Lisp_Object dst_object,
9340                      bool encodep, bool norecord)
9341 {
9342   struct coding_system coding;
9343   ptrdiff_t from, from_byte, to, to_byte;
9344   Lisp_Object src_object;
9345
9346   if (NILP (coding_system))
9347     coding_system = Qno_conversion;
9348   else
9349     CHECK_CODING_SYSTEM (coding_system);
9350   src_object = Fcurrent_buffer ();
9351   if (NILP (dst_object))
9352     dst_object = src_object;
9353   else if (! EQ (dst_object, Qt))
9354     CHECK_BUFFER (dst_object);
9355
9356   validate_region (&start, &end);
9357   from = XFASTINT (start);
9358   from_byte = CHAR_TO_BYTE (from);
9359   to = XFASTINT (end);
9360   to_byte = CHAR_TO_BYTE (to);
9361
9362   setup_coding_system (coding_system, &coding);
9363   coding.mode |= CODING_MODE_LAST_BLOCK;
9364
9365   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9366     {
9367       struct buffer *buf = XBUFFER (dst_object);
9368       ptrdiff_t buf_pt = BUF_PT (buf);
9369
9370       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9371     }
9372
9373   if (encodep)
9374     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9375                           dst_object);
9376   else
9377     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9378                           dst_object);
9379   if (! norecord)
9380     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9381
9382   return (BUFFERP (dst_object)
9383           ? make_number (coding.produced_char)
9384           : coding.dst_object);
9385 }
9386
9387
9388 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9389        3, 4, "r\nzCoding system: ",
9390        doc: /* Decode the current region from the specified coding system.
9391 When called from a program, takes four arguments:
9392         START, END, CODING-SYSTEM, and DESTINATION.
9393 START and END are buffer positions.
9394
9395 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9396 If nil, the region between START and END is replaced by the decoded text.
9397 If buffer, the decoded text is inserted in that buffer after point (point
9398 does not move).
9399 In those cases, the length of the decoded text is returned.
9400 If DESTINATION is t, the decoded text is returned.
9401
9402 This function sets `last-coding-system-used' to the precise coding system
9403 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9404 not fully specified.)  */)
9405   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9406 {
9407   return code_convert_region (start, end, coding_system, destination, 0, 0);
9408 }
9409
9410 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9411        3, 4, "r\nzCoding system: ",
9412        doc: /* Encode the current region by specified coding system.
9413 When called from a program, takes four arguments:
9414         START, END, CODING-SYSTEM and DESTINATION.
9415 START and END are buffer positions.
9416
9417 Optional 4th argument DESTINATION specifies where the encoded text goes.
9418 If nil, the region between START and END is replaced by the encoded text.
9419 If buffer, the encoded text is inserted in that buffer after point (point
9420 does not move).
9421 In those cases, the length of the encoded text is returned.
9422 If DESTINATION is t, the encoded text is returned.
9423
9424 This function sets `last-coding-system-used' to the precise coding system
9425 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9426 not fully specified.)  */)
9427   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9428 {
9429   return code_convert_region (start, end, coding_system, destination, 1, 0);
9430 }
9431
9432 Lisp_Object
9433 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9434                      Lisp_Object dst_object, bool encodep, bool nocopy,
9435                      bool norecord)
9436 {
9437   struct coding_system coding;
9438   ptrdiff_t chars, bytes;
9439
9440   CHECK_STRING (string);
9441   if (NILP (coding_system))
9442     {
9443       if (! norecord)
9444         Vlast_coding_system_used = Qno_conversion;
9445       if (NILP (dst_object))
9446         return (nocopy ? Fcopy_sequence (string) : string);
9447     }
9448
9449   if (NILP (coding_system))
9450     coding_system = Qno_conversion;
9451   else
9452     CHECK_CODING_SYSTEM (coding_system);
9453   if (NILP (dst_object))
9454     dst_object = Qt;
9455   else if (! EQ (dst_object, Qt))
9456     CHECK_BUFFER (dst_object);
9457
9458   setup_coding_system (coding_system, &coding);
9459   coding.mode |= CODING_MODE_LAST_BLOCK;
9460   chars = SCHARS (string);
9461   bytes = SBYTES (string);
9462
9463   if (BUFFERP (dst_object))
9464     {
9465       struct buffer *buf = XBUFFER (dst_object);
9466       ptrdiff_t buf_pt = BUF_PT (buf);
9467
9468       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9469     }
9470
9471   if (encodep)
9472     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9473   else
9474     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9475   if (! norecord)
9476     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9477
9478   return (BUFFERP (dst_object)
9479           ? make_number (coding.produced_char)
9480           : coding.dst_object);
9481 }
9482
9483
9484 /* Encode or decode STRING according to CODING_SYSTEM.
9485    Do not set Vlast_coding_system_used.
9486
9487    This function is called only from macros DECODE_FILE and
9488    ENCODE_FILE, thus we ignore character composition.  */
9489
9490 Lisp_Object
9491 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9492                               bool encodep)
9493 {
9494   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9495 }
9496
9497 /* Encode or decode a file name, to or from a unibyte string suitable
9498    for passing to C library functions.  */
9499 Lisp_Object
9500 decode_file_name (Lisp_Object fname)
9501 {
9502 #ifdef WINDOWSNT
9503   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9504      converts the file names either to UTF-16LE or to the system ANSI
9505      codepage internally, depending on the underlying OS; see w32.c.  */
9506   if (! NILP (Fcoding_system_p (Qutf_8)))
9507     return code_convert_string_norecord (fname, Qutf_8, 0);
9508   return fname;
9509 #else  /* !WINDOWSNT */
9510   if (! NILP (Vfile_name_coding_system))
9511     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9512   else if (! NILP (Vdefault_file_name_coding_system))
9513     return code_convert_string_norecord (fname,
9514                                          Vdefault_file_name_coding_system, 0);
9515   else
9516     return fname;
9517 #endif
9518 }
9519
9520 Lisp_Object
9521 encode_file_name (Lisp_Object fname)
9522 {
9523   /* This is especially important during bootstrap and dumping, when
9524      file-name encoding is not yet known, and therefore any non-ASCII
9525      file names are unibyte strings, and could only be thrashed if we
9526      try to encode them.  */
9527   if (!STRING_MULTIBYTE (fname))
9528     return fname;
9529 #ifdef WINDOWSNT
9530   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9531      converts the file names either to UTF-16LE or to the system ANSI
9532      codepage internally, depending on the underlying OS; see w32.c.  */
9533   if (! NILP (Fcoding_system_p (Qutf_8)))
9534     return code_convert_string_norecord (fname, Qutf_8, 1);
9535   return fname;
9536 #else  /* !WINDOWSNT */
9537   if (! NILP (Vfile_name_coding_system))
9538     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9539   else if (! NILP (Vdefault_file_name_coding_system))
9540     return code_convert_string_norecord (fname,
9541                                          Vdefault_file_name_coding_system, 1);
9542   else
9543     return fname;
9544 #endif
9545 }
9546
9547 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9548        2, 4, 0,
9549        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9550
9551 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9552 if the decoding operation is trivial.
9553
9554 Optional fourth arg BUFFER non-nil means that the decoded text is
9555 inserted in that buffer after point (point does not move).  In this
9556 case, the return value is the length of the decoded text.
9557
9558 This function sets `last-coding-system-used' to the precise coding system
9559 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9560 not fully specified.)  */)
9561   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9562 {
9563   return code_convert_string (string, coding_system, buffer,
9564                               0, ! NILP (nocopy), 0);
9565 }
9566
9567 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9568        2, 4, 0,
9569        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9570
9571 Optional third arg NOCOPY non-nil means it is OK to return STRING
9572 itself if the encoding operation is trivial.
9573
9574 Optional fourth arg BUFFER non-nil means that the encoded text is
9575 inserted in that buffer after point (point does not move).  In this
9576 case, the return value is the length of the encoded text.
9577
9578 This function sets `last-coding-system-used' to the precise coding system
9579 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9580 not fully specified.)  */)
9581   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9582 {
9583   return code_convert_string (string, coding_system, buffer,
9584                               1, ! NILP (nocopy), 0);
9585 }
9586
9587 \f
9588 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9589        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9590 Return the corresponding character.  */)
9591   (Lisp_Object code)
9592 {
9593   Lisp_Object spec, attrs, val;
9594   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9595   EMACS_INT ch;
9596   int c;
9597
9598   CHECK_NATNUM (code);
9599   ch = XFASTINT (code);
9600   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9601   attrs = AREF (spec, 0);
9602
9603   if (ASCII_CHAR_P (ch)
9604       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9605     return code;
9606
9607   val = CODING_ATTR_CHARSET_LIST (attrs);
9608   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9609   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9610   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9611
9612   if (ch <= 0x7F)
9613     {
9614       c = ch;
9615       charset = charset_roman;
9616     }
9617   else if (ch >= 0xA0 && ch < 0xDF)
9618     {
9619       c = ch - 0x80;
9620       charset = charset_kana;
9621     }
9622   else
9623     {
9624       EMACS_INT c1 = ch >> 8;
9625       int c2 = ch & 0xFF;
9626
9627       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9628           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9629         error ("Invalid code: %"pI"d", ch);
9630       c = ch;
9631       SJIS_TO_JIS (c);
9632       charset = charset_kanji;
9633     }
9634   c = DECODE_CHAR (charset, c);
9635   if (c < 0)
9636     error ("Invalid code: %"pI"d", ch);
9637   return make_number (c);
9638 }
9639
9640
9641 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9642        doc: /* Encode a Japanese character CH to shift_jis encoding.
9643 Return the corresponding code in SJIS.  */)
9644   (Lisp_Object ch)
9645 {
9646   Lisp_Object spec, attrs, charset_list;
9647   int c;
9648   struct charset *charset;
9649   unsigned code;
9650
9651   CHECK_CHARACTER (ch);
9652   c = XFASTINT (ch);
9653   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9654   attrs = AREF (spec, 0);
9655
9656   if (ASCII_CHAR_P (c)
9657       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9658     return ch;
9659
9660   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9661   charset = char_charset (c, charset_list, &code);
9662   if (code == CHARSET_INVALID_CODE (charset))
9663     error ("Can't encode by shift_jis encoding: %c", c);
9664   JIS_TO_SJIS (code);
9665
9666   return make_number (code);
9667 }
9668
9669 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9670        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9671 Return the corresponding character.  */)
9672   (Lisp_Object code)
9673 {
9674   Lisp_Object spec, attrs, val;
9675   struct charset *charset_roman, *charset_big5, *charset;
9676   EMACS_INT ch;
9677   int c;
9678
9679   CHECK_NATNUM (code);
9680   ch = XFASTINT (code);
9681   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9682   attrs = AREF (spec, 0);
9683
9684   if (ASCII_CHAR_P (ch)
9685       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9686     return code;
9687
9688   val = CODING_ATTR_CHARSET_LIST (attrs);
9689   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9690   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9691
9692   if (ch <= 0x7F)
9693     {
9694       c = ch;
9695       charset = charset_roman;
9696     }
9697   else
9698     {
9699       EMACS_INT b1 = ch >> 8;
9700       int b2 = ch & 0x7F;
9701       if (b1 < 0xA1 || b1 > 0xFE
9702           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9703         error ("Invalid code: %"pI"d", ch);
9704       c = ch;
9705       charset = charset_big5;
9706     }
9707   c = DECODE_CHAR (charset, c);
9708   if (c < 0)
9709     error ("Invalid code: %"pI"d", ch);
9710   return make_number (c);
9711 }
9712
9713 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9714        doc: /* Encode the Big5 character CH to BIG5 coding system.
9715 Return the corresponding character code in Big5.  */)
9716   (Lisp_Object ch)
9717 {
9718   Lisp_Object spec, attrs, charset_list;
9719   struct charset *charset;
9720   int c;
9721   unsigned code;
9722
9723   CHECK_CHARACTER (ch);
9724   c = XFASTINT (ch);
9725   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9726   attrs = AREF (spec, 0);
9727   if (ASCII_CHAR_P (c)
9728       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9729     return ch;
9730
9731   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9732   charset = char_charset (c, charset_list, &code);
9733   if (code == CHARSET_INVALID_CODE (charset))
9734     error ("Can't encode by Big5 encoding: %c", c);
9735
9736   return make_number (code);
9737 }
9738
9739 \f
9740 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9741        Sset_terminal_coding_system_internal, 1, 2, 0,
9742        doc: /* Internal use only.  */)
9743   (Lisp_Object coding_system, Lisp_Object terminal)
9744 {
9745   struct terminal *term = decode_live_terminal (terminal);
9746   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9747   CHECK_SYMBOL (coding_system);
9748   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9749   /* We had better not send unsafe characters to terminal.  */
9750   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9751   /* Character composition should be disabled.  */
9752   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9753   terminal_coding->src_multibyte = 1;
9754   terminal_coding->dst_multibyte = 0;
9755   tset_charset_list
9756     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9757             ? coding_charset_list (terminal_coding)
9758             : list1 (make_number (charset_ascii))));
9759   return Qnil;
9760 }
9761
9762 DEFUN ("set-safe-terminal-coding-system-internal",
9763        Fset_safe_terminal_coding_system_internal,
9764        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9765        doc: /* Internal use only.  */)
9766   (Lisp_Object coding_system)
9767 {
9768   CHECK_SYMBOL (coding_system);
9769   setup_coding_system (Fcheck_coding_system (coding_system),
9770                        &safe_terminal_coding);
9771   /* Character composition should be disabled.  */
9772   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9773   safe_terminal_coding.src_multibyte = 1;
9774   safe_terminal_coding.dst_multibyte = 0;
9775   return Qnil;
9776 }
9777
9778 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9779        Sterminal_coding_system, 0, 1, 0,
9780        doc: /* Return coding system specified for terminal output on the given terminal.
9781 TERMINAL may be a terminal object, a frame, or nil for the selected
9782 frame's terminal device.  */)
9783   (Lisp_Object terminal)
9784 {
9785   struct coding_system *terminal_coding
9786     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9787   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9788
9789   /* For backward compatibility, return nil if it is `undecided'.  */
9790   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9791 }
9792
9793 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9794        Sset_keyboard_coding_system_internal, 1, 2, 0,
9795        doc: /* Internal use only.  */)
9796   (Lisp_Object coding_system, Lisp_Object terminal)
9797 {
9798   struct terminal *t = decode_live_terminal (terminal);
9799   CHECK_SYMBOL (coding_system);
9800   if (NILP (coding_system))
9801     coding_system = Qno_conversion;
9802   else
9803     Fcheck_coding_system (coding_system);
9804   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9805   /* Character composition should be disabled.  */
9806   TERMINAL_KEYBOARD_CODING (t)->common_flags
9807     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9808   return Qnil;
9809 }
9810
9811 DEFUN ("keyboard-coding-system",
9812        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9813        doc: /* Return coding system specified for decoding keyboard input.  */)
9814   (Lisp_Object terminal)
9815 {
9816   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9817                          (decode_live_terminal (terminal))->id);
9818 }
9819
9820 \f
9821 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9822        Sfind_operation_coding_system,  1, MANY, 0,
9823        doc: /* Choose a coding system for an operation based on the target name.
9824 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9825 DECODING-SYSTEM is the coding system to use for decoding
9826 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9827 for encoding (in case OPERATION does encoding).
9828
9829 The first argument OPERATION specifies an I/O primitive:
9830   For file I/O, `insert-file-contents' or `write-region'.
9831   For process I/O, `call-process', `call-process-region', or `start-process'.
9832   For network I/O, `open-network-stream'.
9833
9834 The remaining arguments should be the same arguments that were passed
9835 to the primitive.  Depending on which primitive, one of those arguments
9836 is selected as the TARGET.  For example, if OPERATION does file I/O,
9837 whichever argument specifies the file name is TARGET.
9838
9839 TARGET has a meaning which depends on OPERATION:
9840   For file I/O, TARGET is a file name (except for the special case below).
9841   For process I/O, TARGET is a process name.
9842   For network I/O, TARGET is a service name or a port number.
9843
9844 This function looks up what is specified for TARGET in
9845 `file-coding-system-alist', `process-coding-system-alist',
9846 or `network-coding-system-alist' depending on OPERATION.
9847 They may specify a coding system, a cons of coding systems,
9848 or a function symbol to call.
9849 In the last case, we call the function with one argument,
9850 which is a list of all the arguments given to this function.
9851 If the function can't decide a coding system, it can return
9852 `undecided' so that the normal code-detection is performed.
9853
9854 If OPERATION is `insert-file-contents', the argument corresponding to
9855 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9856 file name to look up, and BUFFER is a buffer that contains the file's
9857 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9858 function to call for FILENAME, that function should examine the
9859 contents of BUFFER instead of reading the file.
9860
9861 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9862   (ptrdiff_t nargs, Lisp_Object *args)
9863 {
9864   Lisp_Object operation, target_idx, target, val;
9865   register Lisp_Object chain;
9866
9867   if (nargs < 2)
9868     error ("Too few arguments");
9869   operation = args[0];
9870   if (!SYMBOLP (operation)
9871       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9872     error ("Invalid first argument");
9873   if (nargs <= 1 + XFASTINT (target_idx))
9874     error ("Too few arguments for operation `%s'",
9875            SDATA (SYMBOL_NAME (operation)));
9876   target = args[XFASTINT (target_idx) + 1];
9877   if (!(STRINGP (target)
9878         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9879             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9880         || (EQ (operation, Qopen_network_stream)
9881             && (INTEGERP (target) || EQ (target, Qt)))))
9882     error ("Invalid argument %"pI"d of operation `%s'",
9883            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9884   if (CONSP (target))
9885     target = XCAR (target);
9886
9887   chain = ((EQ (operation, Qinsert_file_contents)
9888             || EQ (operation, Qwrite_region))
9889            ? Vfile_coding_system_alist
9890            : (EQ (operation, Qopen_network_stream)
9891               ? Vnetwork_coding_system_alist
9892               : Vprocess_coding_system_alist));
9893   if (NILP (chain))
9894     return Qnil;
9895
9896   for (; CONSP (chain); chain = XCDR (chain))
9897     {
9898       Lisp_Object elt;
9899
9900       elt = XCAR (chain);
9901       if (CONSP (elt)
9902           && ((STRINGP (target)
9903                && STRINGP (XCAR (elt))
9904                && fast_string_match (XCAR (elt), target) >= 0)
9905               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9906         {
9907           val = XCDR (elt);
9908           /* Here, if VAL is both a valid coding system and a valid
9909              function symbol, we return VAL as a coding system.  */
9910           if (CONSP (val))
9911             return val;
9912           if (! SYMBOLP (val))
9913             return Qnil;
9914           if (! NILP (Fcoding_system_p (val)))
9915             return Fcons (val, val);
9916           if (! NILP (Ffboundp (val)))
9917             {
9918               /* We use call1 rather than safe_call1
9919                  so as to get bug reports about functions called here
9920                  which don't handle the current interface.  */
9921               val = call1 (val, Flist (nargs, args));
9922               if (CONSP (val))
9923                 return val;
9924               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9925                 return Fcons (val, val);
9926             }
9927           return Qnil;
9928         }
9929     }
9930   return Qnil;
9931 }
9932
9933 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9934        Sset_coding_system_priority, 0, MANY, 0,
9935        doc: /* Assign higher priority to the coding systems given as arguments.
9936 If multiple coding systems belong to the same category,
9937 all but the first one are ignored.
9938
9939 usage: (set-coding-system-priority &rest coding-systems)  */)
9940   (ptrdiff_t nargs, Lisp_Object *args)
9941 {
9942   ptrdiff_t i, j;
9943   bool changed[coding_category_max];
9944   enum coding_category priorities[coding_category_max];
9945
9946   memset (changed, 0, sizeof changed);
9947
9948   for (i = j = 0; i < nargs; i++)
9949     {
9950       enum coding_category category;
9951       Lisp_Object spec, attrs;
9952
9953       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9954       attrs = AREF (spec, 0);
9955       category = XINT (CODING_ATTR_CATEGORY (attrs));
9956       if (changed[category])
9957         /* Ignore this coding system because a coding system of the
9958            same category already had a higher priority.  */
9959         continue;
9960       changed[category] = 1;
9961       priorities[j++] = category;
9962       if (coding_categories[category].id >= 0
9963           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9964         setup_coding_system (args[i], &coding_categories[category]);
9965       Fset (AREF (Vcoding_category_table, category), args[i]);
9966     }
9967
9968   /* Now we have decided top J priorities.  Reflect the order of the
9969      original priorities to the remaining priorities.  */
9970
9971   for (i = j, j = 0; i < coding_category_max; i++, j++)
9972     {
9973       while (j < coding_category_max
9974              && changed[coding_priorities[j]])
9975         j++;
9976       if (j == coding_category_max)
9977         emacs_abort ();
9978       priorities[i] = coding_priorities[j];
9979     }
9980
9981   memcpy (coding_priorities, priorities, sizeof priorities);
9982
9983   /* Update `coding-category-list'.  */
9984   Vcoding_category_list = Qnil;
9985   for (i = coding_category_max; i-- > 0; )
9986     Vcoding_category_list
9987       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9988                Vcoding_category_list);
9989
9990   return Qnil;
9991 }
9992
9993 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9994        Scoding_system_priority_list, 0, 1, 0,
9995        doc: /* Return a list of coding systems ordered by their priorities.
9996 The list contains a subset of coding systems; i.e. coding systems
9997 assigned to each coding category (see `coding-category-list').
9998
9999 HIGHESTP non-nil means just return the highest priority one.  */)
10000   (Lisp_Object highestp)
10001 {
10002   int i;
10003   Lisp_Object val;
10004
10005   for (i = 0, val = Qnil; i < coding_category_max; i++)
10006     {
10007       enum coding_category category = coding_priorities[i];
10008       int id = coding_categories[category].id;
10009       Lisp_Object attrs;
10010
10011       if (id < 0)
10012         continue;
10013       attrs = CODING_ID_ATTRS (id);
10014       if (! NILP (highestp))
10015         return CODING_ATTR_BASE_NAME (attrs);
10016       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10017     }
10018   return Fnreverse (val);
10019 }
10020
10021 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10022
10023 static Lisp_Object
10024 make_subsidiaries (Lisp_Object base)
10025 {
10026   Lisp_Object subsidiaries;
10027   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10028   USE_SAFE_ALLOCA;
10029   char *buf = SAFE_ALLOCA (base_name_len + 6);
10030   int i;
10031
10032   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10033   subsidiaries = make_uninit_vector (3);
10034   for (i = 0; i < 3; i++)
10035     {
10036       strcpy (buf + base_name_len, suffixes[i]);
10037       ASET (subsidiaries, i, intern (buf));
10038     }
10039   SAFE_FREE ();
10040   return subsidiaries;
10041 }
10042
10043
10044 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10045        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10046        doc: /* For internal use only.
10047 usage: (define-coding-system-internal ...)  */)
10048   (ptrdiff_t nargs, Lisp_Object *args)
10049 {
10050   Lisp_Object name;
10051   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10052   Lisp_Object attrs;            /* Vector of attributes.  */
10053   Lisp_Object eol_type;
10054   Lisp_Object aliases;
10055   Lisp_Object coding_type, charset_list, safe_charsets;
10056   enum coding_category category;
10057   Lisp_Object tail, val;
10058   int max_charset_id = 0;
10059   int i;
10060
10061   if (nargs < coding_arg_max)
10062     goto short_args;
10063
10064   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10065
10066   name = args[coding_arg_name];
10067   CHECK_SYMBOL (name);
10068   ASET (attrs, coding_attr_base_name, name);
10069
10070   val = args[coding_arg_mnemonic];
10071   if (! STRINGP (val))
10072     CHECK_CHARACTER (val);
10073   ASET (attrs, coding_attr_mnemonic, val);
10074
10075   coding_type = args[coding_arg_coding_type];
10076   CHECK_SYMBOL (coding_type);
10077   ASET (attrs, coding_attr_type, coding_type);
10078
10079   charset_list = args[coding_arg_charset_list];
10080   if (SYMBOLP (charset_list))
10081     {
10082       if (EQ (charset_list, Qiso_2022))
10083         {
10084           if (! EQ (coding_type, Qiso_2022))
10085             error ("Invalid charset-list");
10086           charset_list = Viso_2022_charset_list;
10087         }
10088       else if (EQ (charset_list, Qemacs_mule))
10089         {
10090           if (! EQ (coding_type, Qemacs_mule))
10091             error ("Invalid charset-list");
10092           charset_list = Vemacs_mule_charset_list;
10093         }
10094       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10095         {
10096           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10097             error ("Invalid charset-list");
10098           if (max_charset_id < XFASTINT (XCAR (tail)))
10099             max_charset_id = XFASTINT (XCAR (tail));
10100         }
10101     }
10102   else
10103     {
10104       charset_list = Fcopy_sequence (charset_list);
10105       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10106         {
10107           struct charset *charset;
10108
10109           val = XCAR (tail);
10110           CHECK_CHARSET_GET_CHARSET (val, charset);
10111           if (EQ (coding_type, Qiso_2022)
10112               ? CHARSET_ISO_FINAL (charset) < 0
10113               : EQ (coding_type, Qemacs_mule)
10114               ? CHARSET_EMACS_MULE_ID (charset) < 0
10115               : 0)
10116             error ("Can't handle charset `%s'",
10117                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10118
10119           XSETCAR (tail, make_number (charset->id));
10120           if (max_charset_id < charset->id)
10121             max_charset_id = charset->id;
10122         }
10123     }
10124   ASET (attrs, coding_attr_charset_list, charset_list);
10125
10126   safe_charsets = make_uninit_string (max_charset_id + 1);
10127   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10128   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10129     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10130   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10131
10132   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10133
10134   val = args[coding_arg_decode_translation_table];
10135   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10136     CHECK_SYMBOL (val);
10137   ASET (attrs, coding_attr_decode_tbl, val);
10138
10139   val = args[coding_arg_encode_translation_table];
10140   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10141     CHECK_SYMBOL (val);
10142   ASET (attrs, coding_attr_encode_tbl, val);
10143
10144   val = args[coding_arg_post_read_conversion];
10145   CHECK_SYMBOL (val);
10146   ASET (attrs, coding_attr_post_read, val);
10147
10148   val = args[coding_arg_pre_write_conversion];
10149   CHECK_SYMBOL (val);
10150   ASET (attrs, coding_attr_pre_write, val);
10151
10152   val = args[coding_arg_default_char];
10153   if (NILP (val))
10154     ASET (attrs, coding_attr_default_char, make_number (' '));
10155   else
10156     {
10157       CHECK_CHARACTER (val);
10158       ASET (attrs, coding_attr_default_char, val);
10159     }
10160
10161   val = args[coding_arg_for_unibyte];
10162   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10163
10164   val = args[coding_arg_plist];
10165   CHECK_LIST (val);
10166   ASET (attrs, coding_attr_plist, val);
10167
10168   if (EQ (coding_type, Qcharset))
10169     {
10170       /* Generate a lisp vector of 256 elements.  Each element is nil,
10171          integer, or a list of charset IDs.
10172
10173          If Nth element is nil, the byte code N is invalid in this
10174          coding system.
10175
10176          If Nth element is a number NUM, N is the first byte of a
10177          charset whose ID is NUM.
10178
10179          If Nth element is a list of charset IDs, N is the first byte
10180          of one of them.  The list is sorted by dimensions of the
10181          charsets.  A charset of smaller dimension comes first. */
10182       val = Fmake_vector (make_number (256), Qnil);
10183
10184       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10185         {
10186           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10187           int dim = CHARSET_DIMENSION (charset);
10188           int idx = (dim - 1) * 4;
10189
10190           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10191             ASET (attrs, coding_attr_ascii_compat, Qt);
10192
10193           for (i = charset->code_space[idx];
10194                i <= charset->code_space[idx + 1]; i++)
10195             {
10196               Lisp_Object tmp, tmp2;
10197               int dim2;
10198
10199               tmp = AREF (val, i);
10200               if (NILP (tmp))
10201                 tmp = XCAR (tail);
10202               else if (NUMBERP (tmp))
10203                 {
10204                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10205                   if (dim < dim2)
10206                     tmp = list2 (XCAR (tail), tmp);
10207                   else
10208                     tmp = list2 (tmp, XCAR (tail));
10209                 }
10210               else
10211                 {
10212                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10213                     {
10214                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10215                       if (dim < dim2)
10216                         break;
10217                     }
10218                   if (NILP (tmp2))
10219                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10220                   else
10221                     {
10222                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10223                       XSETCAR (tmp2, XCAR (tail));
10224                     }
10225                 }
10226               ASET (val, i, tmp);
10227             }
10228         }
10229       ASET (attrs, coding_attr_charset_valids, val);
10230       category = coding_category_charset;
10231     }
10232   else if (EQ (coding_type, Qccl))
10233     {
10234       Lisp_Object valids;
10235
10236       if (nargs < coding_arg_ccl_max)
10237         goto short_args;
10238
10239       val = args[coding_arg_ccl_decoder];
10240       CHECK_CCL_PROGRAM (val);
10241       if (VECTORP (val))
10242         val = Fcopy_sequence (val);
10243       ASET (attrs, coding_attr_ccl_decoder, val);
10244
10245       val = args[coding_arg_ccl_encoder];
10246       CHECK_CCL_PROGRAM (val);
10247       if (VECTORP (val))
10248         val = Fcopy_sequence (val);
10249       ASET (attrs, coding_attr_ccl_encoder, val);
10250
10251       val = args[coding_arg_ccl_valids];
10252       valids = Fmake_string (make_number (256), make_number (0));
10253       for (tail = val; CONSP (tail); tail = XCDR (tail))
10254         {
10255           int from, to;
10256
10257           val = XCAR (tail);
10258           if (INTEGERP (val))
10259             {
10260               if (! (0 <= XINT (val) && XINT (val) <= 255))
10261                 args_out_of_range_3 (val, make_number (0), make_number (255));
10262               from = to = XINT (val);
10263             }
10264           else
10265             {
10266               CHECK_CONS (val);
10267               CHECK_NATNUM_CAR (val);
10268               CHECK_NUMBER_CDR (val);
10269               if (XINT (XCAR (val)) > 255)
10270                 args_out_of_range_3 (XCAR (val),
10271                                      make_number (0), make_number (255));
10272               from = XINT (XCAR (val));
10273               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10274                 args_out_of_range_3 (XCDR (val),
10275                                      XCAR (val), make_number (255));
10276               to = XINT (XCDR (val));
10277             }
10278           for (i = from; i <= to; i++)
10279             SSET (valids, i, 1);
10280         }
10281       ASET (attrs, coding_attr_ccl_valids, valids);
10282
10283       category = coding_category_ccl;
10284     }
10285   else if (EQ (coding_type, Qutf_16))
10286     {
10287       Lisp_Object bom, endian;
10288
10289       ASET (attrs, coding_attr_ascii_compat, Qnil);
10290
10291       if (nargs < coding_arg_utf16_max)
10292         goto short_args;
10293
10294       bom = args[coding_arg_utf16_bom];
10295       if (! NILP (bom) && ! EQ (bom, Qt))
10296         {
10297           CHECK_CONS (bom);
10298           val = XCAR (bom);
10299           CHECK_CODING_SYSTEM (val);
10300           val = XCDR (bom);
10301           CHECK_CODING_SYSTEM (val);
10302         }
10303       ASET (attrs, coding_attr_utf_bom, bom);
10304
10305       endian = args[coding_arg_utf16_endian];
10306       CHECK_SYMBOL (endian);
10307       if (NILP (endian))
10308         endian = Qbig;
10309       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10310         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10311       ASET (attrs, coding_attr_utf_16_endian, endian);
10312
10313       category = (CONSP (bom)
10314                   ? coding_category_utf_16_auto
10315                   : NILP (bom)
10316                   ? (EQ (endian, Qbig)
10317                      ? coding_category_utf_16_be_nosig
10318                      : coding_category_utf_16_le_nosig)
10319                   : (EQ (endian, Qbig)
10320                      ? coding_category_utf_16_be
10321                      : coding_category_utf_16_le));
10322     }
10323   else if (EQ (coding_type, Qiso_2022))
10324     {
10325       Lisp_Object initial, reg_usage, request, flags;
10326
10327       if (nargs < coding_arg_iso2022_max)
10328         goto short_args;
10329
10330       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10331       CHECK_VECTOR (initial);
10332       for (i = 0; i < 4; i++)
10333         {
10334           val = AREF (initial, i);
10335           if (! NILP (val))
10336             {
10337               struct charset *charset;
10338
10339               CHECK_CHARSET_GET_CHARSET (val, charset);
10340               ASET (initial, i, make_number (CHARSET_ID (charset)));
10341               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10342                 ASET (attrs, coding_attr_ascii_compat, Qt);
10343             }
10344           else
10345             ASET (initial, i, make_number (-1));
10346         }
10347
10348       reg_usage = args[coding_arg_iso2022_reg_usage];
10349       CHECK_CONS (reg_usage);
10350       CHECK_NUMBER_CAR (reg_usage);
10351       CHECK_NUMBER_CDR (reg_usage);
10352
10353       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10354       for (tail = request; CONSP (tail); tail = XCDR (tail))
10355         {
10356           int id;
10357           Lisp_Object tmp1;
10358
10359           val = XCAR (tail);
10360           CHECK_CONS (val);
10361           tmp1 = XCAR (val);
10362           CHECK_CHARSET_GET_ID (tmp1, id);
10363           CHECK_NATNUM_CDR (val);
10364           if (XINT (XCDR (val)) >= 4)
10365             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10366           XSETCAR (val, make_number (id));
10367         }
10368
10369       flags = args[coding_arg_iso2022_flags];
10370       CHECK_NATNUM (flags);
10371       i = XINT (flags) & INT_MAX;
10372       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10373         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10374       flags = make_number (i);
10375
10376       ASET (attrs, coding_attr_iso_initial, initial);
10377       ASET (attrs, coding_attr_iso_usage, reg_usage);
10378       ASET (attrs, coding_attr_iso_request, request);
10379       ASET (attrs, coding_attr_iso_flags, flags);
10380       setup_iso_safe_charsets (attrs);
10381
10382       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10383         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10384                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10385                     ? coding_category_iso_7_else
10386                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10387                     ? coding_category_iso_7
10388                     : coding_category_iso_7_tight);
10389       else
10390         {
10391           int id = XINT (AREF (initial, 1));
10392
10393           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10394                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10395                        || id < 0)
10396                       ? coding_category_iso_8_else
10397                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10398                       ? coding_category_iso_8_1
10399                       : coding_category_iso_8_2);
10400         }
10401       if (category != coding_category_iso_8_1
10402           && category != coding_category_iso_8_2)
10403         ASET (attrs, coding_attr_ascii_compat, Qnil);
10404     }
10405   else if (EQ (coding_type, Qemacs_mule))
10406     {
10407       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10408         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10409       ASET (attrs, coding_attr_ascii_compat, Qt);
10410       category = coding_category_emacs_mule;
10411     }
10412   else if (EQ (coding_type, Qshift_jis))
10413     {
10414
10415       struct charset *charset;
10416
10417       if (XINT (Flength (charset_list)) != 3
10418           && XINT (Flength (charset_list)) != 4)
10419         error ("There should be three or four charsets");
10420
10421       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10422       if (CHARSET_DIMENSION (charset) != 1)
10423         error ("Dimension of charset %s is not one",
10424                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10425       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10426         ASET (attrs, coding_attr_ascii_compat, Qt);
10427
10428       charset_list = XCDR (charset_list);
10429       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10430       if (CHARSET_DIMENSION (charset) != 1)
10431         error ("Dimension of charset %s is not one",
10432                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10433
10434       charset_list = XCDR (charset_list);
10435       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10436       if (CHARSET_DIMENSION (charset) != 2)
10437         error ("Dimension of charset %s is not two",
10438                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10439
10440       charset_list = XCDR (charset_list);
10441       if (! NILP (charset_list))
10442         {
10443           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10444           if (CHARSET_DIMENSION (charset) != 2)
10445             error ("Dimension of charset %s is not two",
10446                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10447         }
10448
10449       category = coding_category_sjis;
10450       Vsjis_coding_system = name;
10451     }
10452   else if (EQ (coding_type, Qbig5))
10453     {
10454       struct charset *charset;
10455
10456       if (XINT (Flength (charset_list)) != 2)
10457         error ("There should be just two charsets");
10458
10459       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10460       if (CHARSET_DIMENSION (charset) != 1)
10461         error ("Dimension of charset %s is not one",
10462                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10463       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10464         ASET (attrs, coding_attr_ascii_compat, Qt);
10465
10466       charset_list = XCDR (charset_list);
10467       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10468       if (CHARSET_DIMENSION (charset) != 2)
10469         error ("Dimension of charset %s is not two",
10470                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10471
10472       category = coding_category_big5;
10473       Vbig5_coding_system = name;
10474     }
10475   else if (EQ (coding_type, Qraw_text))
10476     {
10477       category = coding_category_raw_text;
10478       ASET (attrs, coding_attr_ascii_compat, Qt);
10479     }
10480   else if (EQ (coding_type, Qutf_8))
10481     {
10482       Lisp_Object bom;
10483
10484       if (nargs < coding_arg_utf8_max)
10485         goto short_args;
10486
10487       bom = args[coding_arg_utf8_bom];
10488       if (! NILP (bom) && ! EQ (bom, Qt))
10489         {
10490           CHECK_CONS (bom);
10491           val = XCAR (bom);
10492           CHECK_CODING_SYSTEM (val);
10493           val = XCDR (bom);
10494           CHECK_CODING_SYSTEM (val);
10495         }
10496       ASET (attrs, coding_attr_utf_bom, bom);
10497       if (NILP (bom))
10498         ASET (attrs, coding_attr_ascii_compat, Qt);
10499
10500       category = (CONSP (bom) ? coding_category_utf_8_auto
10501                   : NILP (bom) ? coding_category_utf_8_nosig
10502                   : coding_category_utf_8_sig);
10503     }
10504   else if (EQ (coding_type, Qundecided))
10505     {
10506       if (nargs < coding_arg_undecided_max)
10507         goto short_args;
10508       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10509             args[coding_arg_undecided_inhibit_null_byte_detection]);
10510       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10511             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10512       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10513             args[coding_arg_undecided_prefer_utf_8]);
10514       category = coding_category_undecided;
10515     }
10516   else
10517     error ("Invalid coding system type: %s",
10518            SDATA (SYMBOL_NAME (coding_type)));
10519
10520   ASET (attrs, coding_attr_category, make_number (category));
10521   ASET (attrs, coding_attr_plist,
10522         Fcons (QCcategory,
10523                Fcons (AREF (Vcoding_category_table, category),
10524                       CODING_ATTR_PLIST (attrs))));
10525   ASET (attrs, coding_attr_plist,
10526         Fcons (QCascii_compatible_p,
10527                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10528                       CODING_ATTR_PLIST (attrs))));
10529
10530   eol_type = args[coding_arg_eol_type];
10531   if (! NILP (eol_type)
10532       && ! EQ (eol_type, Qunix)
10533       && ! EQ (eol_type, Qdos)
10534       && ! EQ (eol_type, Qmac))
10535     error ("Invalid eol-type");
10536
10537   aliases = list1 (name);
10538
10539   if (NILP (eol_type))
10540     {
10541       eol_type = make_subsidiaries (name);
10542       for (i = 0; i < 3; i++)
10543         {
10544           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10545
10546           this_name = AREF (eol_type, i);
10547           this_aliases = list1 (this_name);
10548           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10549           this_spec = make_uninit_vector (3);
10550           ASET (this_spec, 0, attrs);
10551           ASET (this_spec, 1, this_aliases);
10552           ASET (this_spec, 2, this_eol_type);
10553           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10554           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10555           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist, Qnil);
10556           if (NILP (val))
10557             Vcoding_system_alist
10558               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10559                        Vcoding_system_alist);
10560         }
10561     }
10562
10563   spec_vec = make_uninit_vector (3);
10564   ASET (spec_vec, 0, attrs);
10565   ASET (spec_vec, 1, aliases);
10566   ASET (spec_vec, 2, eol_type);
10567
10568   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10569   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10570   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist, Qnil);
10571   if (NILP (val))
10572     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10573                                   Vcoding_system_alist);
10574
10575   {
10576     int id = coding_categories[category].id;
10577
10578     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10579       setup_coding_system (name, &coding_categories[category]);
10580   }
10581
10582   return Qnil;
10583
10584  short_args:
10585   Fsignal (Qwrong_number_of_arguments,
10586            Fcons (intern ("define-coding-system-internal"),
10587                   make_number (nargs)));
10588 }
10589
10590
10591 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10592        3, 3, 0,
10593        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10594   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10595 {
10596   Lisp_Object spec, attrs;
10597
10598   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10599   attrs = AREF (spec, 0);
10600   if (EQ (prop, QCmnemonic))
10601     {
10602       if (! STRINGP (val))
10603         CHECK_CHARACTER (val);
10604       ASET (attrs, coding_attr_mnemonic, val);
10605     }
10606   else if (EQ (prop, QCdefault_char))
10607     {
10608       if (NILP (val))
10609         val = make_number (' ');
10610       else
10611         CHECK_CHARACTER (val);
10612       ASET (attrs, coding_attr_default_char, val);
10613     }
10614   else if (EQ (prop, QCdecode_translation_table))
10615     {
10616       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10617         CHECK_SYMBOL (val);
10618       ASET (attrs, coding_attr_decode_tbl, val);
10619     }
10620   else if (EQ (prop, QCencode_translation_table))
10621     {
10622       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10623         CHECK_SYMBOL (val);
10624       ASET (attrs, coding_attr_encode_tbl, val);
10625     }
10626   else if (EQ (prop, QCpost_read_conversion))
10627     {
10628       CHECK_SYMBOL (val);
10629       ASET (attrs, coding_attr_post_read, val);
10630     }
10631   else if (EQ (prop, QCpre_write_conversion))
10632     {
10633       CHECK_SYMBOL (val);
10634       ASET (attrs, coding_attr_pre_write, val);
10635     }
10636   else if (EQ (prop, QCascii_compatible_p))
10637     {
10638       ASET (attrs, coding_attr_ascii_compat, val);
10639     }
10640
10641   ASET (attrs, coding_attr_plist,
10642         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10643   return val;
10644 }
10645
10646
10647 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10648        Sdefine_coding_system_alias, 2, 2, 0,
10649        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10650   (Lisp_Object alias, Lisp_Object coding_system)
10651 {
10652   Lisp_Object spec, aliases, eol_type, val;
10653
10654   CHECK_SYMBOL (alias);
10655   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10656   aliases = AREF (spec, 1);
10657   /* ALIASES should be a list of length more than zero, and the first
10658      element is a base coding system.  Append ALIAS at the tail of the
10659      list.  */
10660   while (!NILP (XCDR (aliases)))
10661     aliases = XCDR (aliases);
10662   XSETCDR (aliases, list1 (alias));
10663
10664   eol_type = AREF (spec, 2);
10665   if (VECTORP (eol_type))
10666     {
10667       Lisp_Object subsidiaries;
10668       int i;
10669
10670       subsidiaries = make_subsidiaries (alias);
10671       for (i = 0; i < 3; i++)
10672         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10673                                      AREF (eol_type, i));
10674     }
10675
10676   Fputhash (alias, spec, Vcoding_system_hash_table);
10677   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10678   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist, Qnil);
10679   if (NILP (val))
10680     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10681                                   Vcoding_system_alist);
10682
10683   return Qnil;
10684 }
10685
10686 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10687        1, 1, 0,
10688        doc: /* Return the base of CODING-SYSTEM.
10689 Any alias or subsidiary coding system is not a base coding system.  */)
10690   (Lisp_Object coding_system)
10691 {
10692   Lisp_Object spec, attrs;
10693
10694   if (NILP (coding_system))
10695     return (Qno_conversion);
10696   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10697   attrs = AREF (spec, 0);
10698   return CODING_ATTR_BASE_NAME (attrs);
10699 }
10700
10701 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10702        1, 1, 0,
10703        doc: /* Return the property list of CODING-SYSTEM.  */)
10704   (Lisp_Object coding_system)
10705 {
10706   Lisp_Object spec, attrs;
10707
10708   if (NILP (coding_system))
10709     coding_system = Qno_conversion;
10710   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10711   attrs = AREF (spec, 0);
10712   return CODING_ATTR_PLIST (attrs);
10713 }
10714
10715
10716 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10717        1, 1, 0,
10718        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10719   (Lisp_Object coding_system)
10720 {
10721   Lisp_Object spec;
10722
10723   if (NILP (coding_system))
10724     coding_system = Qno_conversion;
10725   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10726   return AREF (spec, 1);
10727 }
10728
10729 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10730        Scoding_system_eol_type, 1, 1, 0,
10731        doc: /* Return eol-type of CODING-SYSTEM.
10732 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10733
10734 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10735 and CR respectively.
10736
10737 A vector value indicates that a format of end-of-line should be
10738 detected automatically.  Nth element of the vector is the subsidiary
10739 coding system whose eol-type is N.  */)
10740   (Lisp_Object coding_system)
10741 {
10742   Lisp_Object spec, eol_type;
10743   int n;
10744
10745   if (NILP (coding_system))
10746     coding_system = Qno_conversion;
10747   if (! CODING_SYSTEM_P (coding_system))
10748     return Qnil;
10749   spec = CODING_SYSTEM_SPEC (coding_system);
10750   eol_type = AREF (spec, 2);
10751   if (VECTORP (eol_type))
10752     return Fcopy_sequence (eol_type);
10753   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10754   return make_number (n);
10755 }
10756
10757 #endif /* emacs */
10758
10759 \f
10760 /*** 12. Postamble ***/
10761
10762 void
10763 init_coding_once (void)
10764 {
10765   int i;
10766
10767   for (i = 0; i < coding_category_max; i++)
10768     {
10769       coding_categories[i].id = -1;
10770       coding_priorities[i] = i;
10771     }
10772
10773   /* ISO2022 specific initialize routine.  */
10774   for (i = 0; i < 0x20; i++)
10775     iso_code_class[i] = ISO_control_0;
10776   for (i = 0x21; i < 0x7F; i++)
10777     iso_code_class[i] = ISO_graphic_plane_0;
10778   for (i = 0x80; i < 0xA0; i++)
10779     iso_code_class[i] = ISO_control_1;
10780   for (i = 0xA1; i < 0xFF; i++)
10781     iso_code_class[i] = ISO_graphic_plane_1;
10782   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10783   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10784   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10785   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10786   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10787   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10788   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10789   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10790   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10791
10792   for (i = 0; i < 256; i++)
10793     {
10794       emacs_mule_bytes[i] = 1;
10795     }
10796   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10797   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10798   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10799   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10800 }
10801
10802 #ifdef emacs
10803
10804 void
10805 syms_of_coding (void)
10806 {
10807   staticpro (&Vcoding_system_hash_table);
10808   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10809
10810   staticpro (&Vsjis_coding_system);
10811   Vsjis_coding_system = Qnil;
10812
10813   staticpro (&Vbig5_coding_system);
10814   Vbig5_coding_system = Qnil;
10815
10816   staticpro (&Vcode_conversion_reused_workbuf);
10817   Vcode_conversion_reused_workbuf = Qnil;
10818
10819   staticpro (&Vcode_conversion_workbuf_name);
10820   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10821
10822   reused_workbuf_in_use = 0;
10823
10824   DEFSYM (Qcharset, "charset");
10825   DEFSYM (Qtarget_idx, "target-idx");
10826   DEFSYM (Qcoding_system_history, "coding-system-history");
10827   Fset (Qcoding_system_history, Qnil);
10828
10829   /* Target FILENAME is the first argument.  */
10830   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10831   /* Target FILENAME is the third argument.  */
10832   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10833
10834   DEFSYM (Qcall_process, "call-process");
10835   /* Target PROGRAM is the first argument.  */
10836   Fput (Qcall_process, Qtarget_idx, make_number (0));
10837
10838   DEFSYM (Qcall_process_region, "call-process-region");
10839   /* Target PROGRAM is the third argument.  */
10840   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10841
10842   DEFSYM (Qstart_process, "start-process");
10843   /* Target PROGRAM is the third argument.  */
10844   Fput (Qstart_process, Qtarget_idx, make_number (2));
10845
10846   DEFSYM (Qopen_network_stream, "open-network-stream");
10847   /* Target SERVICE is the fourth argument.  */
10848   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10849
10850   DEFSYM (Qunix, "unix");
10851   DEFSYM (Qdos, "dos");
10852   DEFSYM (Qmac, "mac");
10853
10854   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10855   DEFSYM (Qundecided, "undecided");
10856   DEFSYM (Qno_conversion, "no-conversion");
10857   DEFSYM (Qraw_text, "raw-text");
10858
10859   DEFSYM (Qiso_2022, "iso-2022");
10860
10861   DEFSYM (Qutf_8, "utf-8");
10862   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10863
10864 #if defined (WINDOWSNT) || defined (CYGWIN)
10865   /* No, not utf-16-le: that one has a BOM.  */
10866   DEFSYM (Qutf_16le, "utf-16le");
10867 #endif
10868
10869   DEFSYM (Qutf_16, "utf-16");
10870   DEFSYM (Qbig, "big");
10871   DEFSYM (Qlittle, "little");
10872
10873   DEFSYM (Qshift_jis, "shift-jis");
10874   DEFSYM (Qbig5, "big5");
10875
10876   DEFSYM (Qcoding_system_p, "coding-system-p");
10877
10878   /* Error signaled when there's a problem with detecting a coding system.  */
10879   DEFSYM (Qcoding_system_error, "coding-system-error");
10880   Fput (Qcoding_system_error, Qerror_conditions,
10881         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10882   Fput (Qcoding_system_error, Qerror_message,
10883         build_pure_c_string ("Invalid coding system"));
10884
10885   DEFSYM (Qtranslation_table, "translation-table");
10886   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10887   DEFSYM (Qtranslation_table_id, "translation-table-id");
10888
10889   /* Coding system emacs-mule and raw-text are for converting only
10890      end-of-line format.  */
10891   DEFSYM (Qemacs_mule, "emacs-mule");
10892
10893   DEFSYM (QCcategory, ":category");
10894   DEFSYM (QCmnemonic, ":mnemonic");
10895   DEFSYM (QCdefault_char, ":default-char");
10896   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10897   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10898   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10899   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10900   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10901
10902   Vcoding_category_table
10903     = Fmake_vector (make_number (coding_category_max), Qnil);
10904   staticpro (&Vcoding_category_table);
10905   /* Followings are target of code detection.  */
10906   ASET (Vcoding_category_table, coding_category_iso_7,
10907         intern_c_string ("coding-category-iso-7"));
10908   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10909         intern_c_string ("coding-category-iso-7-tight"));
10910   ASET (Vcoding_category_table, coding_category_iso_8_1,
10911         intern_c_string ("coding-category-iso-8-1"));
10912   ASET (Vcoding_category_table, coding_category_iso_8_2,
10913         intern_c_string ("coding-category-iso-8-2"));
10914   ASET (Vcoding_category_table, coding_category_iso_7_else,
10915         intern_c_string ("coding-category-iso-7-else"));
10916   ASET (Vcoding_category_table, coding_category_iso_8_else,
10917         intern_c_string ("coding-category-iso-8-else"));
10918   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10919         intern_c_string ("coding-category-utf-8-auto"));
10920   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10921         intern_c_string ("coding-category-utf-8"));
10922   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10923         intern_c_string ("coding-category-utf-8-sig"));
10924   ASET (Vcoding_category_table, coding_category_utf_16_be,
10925         intern_c_string ("coding-category-utf-16-be"));
10926   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10927         intern_c_string ("coding-category-utf-16-auto"));
10928   ASET (Vcoding_category_table, coding_category_utf_16_le,
10929         intern_c_string ("coding-category-utf-16-le"));
10930   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10931         intern_c_string ("coding-category-utf-16-be-nosig"));
10932   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10933         intern_c_string ("coding-category-utf-16-le-nosig"));
10934   ASET (Vcoding_category_table, coding_category_charset,
10935         intern_c_string ("coding-category-charset"));
10936   ASET (Vcoding_category_table, coding_category_sjis,
10937         intern_c_string ("coding-category-sjis"));
10938   ASET (Vcoding_category_table, coding_category_big5,
10939         intern_c_string ("coding-category-big5"));
10940   ASET (Vcoding_category_table, coding_category_ccl,
10941         intern_c_string ("coding-category-ccl"));
10942   ASET (Vcoding_category_table, coding_category_emacs_mule,
10943         intern_c_string ("coding-category-emacs-mule"));
10944   /* Followings are NOT target of code detection.  */
10945   ASET (Vcoding_category_table, coding_category_raw_text,
10946         intern_c_string ("coding-category-raw-text"));
10947   ASET (Vcoding_category_table, coding_category_undecided,
10948         intern_c_string ("coding-category-undecided"));
10949
10950   DEFSYM (Qinsufficient_source, "insufficient-source");
10951   DEFSYM (Qinvalid_source, "invalid-source");
10952   DEFSYM (Qinterrupted, "interrupted");
10953
10954   /* If a symbol has this property, evaluate the value to define the
10955      symbol as a coding system.  */
10956   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10957
10958   defsubr (&Scoding_system_p);
10959   defsubr (&Sread_coding_system);
10960   defsubr (&Sread_non_nil_coding_system);
10961   defsubr (&Scheck_coding_system);
10962   defsubr (&Sdetect_coding_region);
10963   defsubr (&Sdetect_coding_string);
10964   defsubr (&Sfind_coding_systems_region_internal);
10965   defsubr (&Sunencodable_char_position);
10966   defsubr (&Scheck_coding_systems_region);
10967   defsubr (&Sdecode_coding_region);
10968   defsubr (&Sencode_coding_region);
10969   defsubr (&Sdecode_coding_string);
10970   defsubr (&Sencode_coding_string);
10971   defsubr (&Sdecode_sjis_char);
10972   defsubr (&Sencode_sjis_char);
10973   defsubr (&Sdecode_big5_char);
10974   defsubr (&Sencode_big5_char);
10975   defsubr (&Sset_terminal_coding_system_internal);
10976   defsubr (&Sset_safe_terminal_coding_system_internal);
10977   defsubr (&Sterminal_coding_system);
10978   defsubr (&Sset_keyboard_coding_system_internal);
10979   defsubr (&Skeyboard_coding_system);
10980   defsubr (&Sfind_operation_coding_system);
10981   defsubr (&Sset_coding_system_priority);
10982   defsubr (&Sdefine_coding_system_internal);
10983   defsubr (&Sdefine_coding_system_alias);
10984   defsubr (&Scoding_system_put);
10985   defsubr (&Scoding_system_base);
10986   defsubr (&Scoding_system_plist);
10987   defsubr (&Scoding_system_aliases);
10988   defsubr (&Scoding_system_eol_type);
10989   defsubr (&Scoding_system_priority_list);
10990
10991   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10992                doc: /* List of coding systems.
10993
10994 Do not alter the value of this variable manually.  This variable should be
10995 updated by the functions `define-coding-system' and
10996 `define-coding-system-alias'.  */);
10997   Vcoding_system_list = Qnil;
10998
10999   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11000                doc: /* Alist of coding system names.
11001 Each element is one element list of coding system name.
11002 This variable is given to `completing-read' as COLLECTION argument.
11003
11004 Do not alter the value of this variable manually.  This variable should be
11005 updated by the functions `make-coding-system' and
11006 `define-coding-system-alias'.  */);
11007   Vcoding_system_alist = Qnil;
11008
11009   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11010                doc: /* List of coding-categories (symbols) ordered by priority.
11011
11012 On detecting a coding system, Emacs tries code detection algorithms
11013 associated with each coding-category one by one in this order.  When
11014 one algorithm agrees with a byte sequence of source text, the coding
11015 system bound to the corresponding coding-category is selected.
11016
11017 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11018   {
11019     int i;
11020
11021     Vcoding_category_list = Qnil;
11022     for (i = coding_category_max - 1; i >= 0; i--)
11023       Vcoding_category_list
11024         = Fcons (AREF (Vcoding_category_table, i),
11025                  Vcoding_category_list);
11026   }
11027
11028   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11029                doc: /* Specify the coding system for read operations.
11030 It is useful to bind this variable with `let', but do not set it globally.
11031 If the value is a coding system, it is used for decoding on read operation.
11032 If not, an appropriate element is used from one of the coding system alists.
11033 There are three such tables: `file-coding-system-alist',
11034 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11035   Vcoding_system_for_read = Qnil;
11036
11037   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11038                doc: /* Specify the coding system for write operations.
11039 Programs bind this variable with `let', but you should not set it globally.
11040 If the value is a coding system, it is used for encoding of output,
11041 when writing it to a file and when sending it to a file or subprocess.
11042
11043 If this does not specify a coding system, an appropriate element
11044 is used from one of the coding system alists.
11045 There are three such tables: `file-coding-system-alist',
11046 `process-coding-system-alist', and `network-coding-system-alist'.
11047 For output to files, if the above procedure does not specify a coding system,
11048 the value of `buffer-file-coding-system' is used.  */);
11049   Vcoding_system_for_write = Qnil;
11050
11051   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11052                doc: /*
11053 Coding system used in the latest file or process I/O.  */);
11054   Vlast_coding_system_used = Qnil;
11055
11056   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11057                doc: /*
11058 Error status of the last code conversion.
11059
11060 When an error was detected in the last code conversion, this variable
11061 is set to one of the following symbols.
11062   `insufficient-source'
11063   `inconsistent-eol'
11064   `invalid-source'
11065   `interrupted'
11066   `insufficient-memory'
11067 When no error was detected, the value doesn't change.  So, to check
11068 the error status of a code conversion by this variable, you must
11069 explicitly set this variable to nil before performing code
11070 conversion.  */);
11071   Vlast_code_conversion_error = Qnil;
11072
11073   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11074                doc: /*
11075 Non-nil means always inhibit code conversion of end-of-line format.
11076 See info node `Coding Systems' and info node `Text and Binary' concerning
11077 such conversion.  */);
11078   inhibit_eol_conversion = 0;
11079
11080   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11081                doc: /*
11082 Non-nil means process buffer inherits coding system of process output.
11083 Bind it to t if the process output is to be treated as if it were a file
11084 read from some filesystem.  */);
11085   inherit_process_coding_system = 0;
11086
11087   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11088                doc: /*
11089 Alist to decide a coding system to use for a file I/O operation.
11090 The format is ((PATTERN . VAL) ...),
11091 where PATTERN is a regular expression matching a file name,
11092 VAL is a coding system, a cons of coding systems, or a function symbol.
11093 If VAL is a coding system, it is used for both decoding and encoding
11094 the file contents.
11095 If VAL is a cons of coding systems, the car part is used for decoding,
11096 and the cdr part is used for encoding.
11097 If VAL is a function symbol, the function must return a coding system
11098 or a cons of coding systems which are used as above.  The function is
11099 called with an argument that is a list of the arguments with which
11100 `find-operation-coding-system' was called.  If the function can't decide
11101 a coding system, it can return `undecided' so that the normal
11102 code-detection is performed.
11103
11104 See also the function `find-operation-coding-system'
11105 and the variable `auto-coding-alist'.  */);
11106   Vfile_coding_system_alist = Qnil;
11107
11108   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11109                doc: /*
11110 Alist to decide a coding system to use for a process I/O operation.
11111 The format is ((PATTERN . VAL) ...),
11112 where PATTERN is a regular expression matching a program name,
11113 VAL is a coding system, a cons of coding systems, or a function symbol.
11114 If VAL is a coding system, it is used for both decoding what received
11115 from the program and encoding what sent to the program.
11116 If VAL is a cons of coding systems, the car part is used for decoding,
11117 and the cdr part is used for encoding.
11118 If VAL is a function symbol, the function must return a coding system
11119 or a cons of coding systems which are used as above.
11120
11121 See also the function `find-operation-coding-system'.  */);
11122   Vprocess_coding_system_alist = Qnil;
11123
11124   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11125                doc: /*
11126 Alist to decide a coding system to use for a network I/O operation.
11127 The format is ((PATTERN . VAL) ...),
11128 where PATTERN is a regular expression matching a network service name
11129 or is a port number to connect to,
11130 VAL is a coding system, a cons of coding systems, or a function symbol.
11131 If VAL is a coding system, it is used for both decoding what received
11132 from the network stream and encoding what sent to the network stream.
11133 If VAL is a cons of coding systems, the car part is used for decoding,
11134 and the cdr part is used for encoding.
11135 If VAL is a function symbol, the function must return a coding system
11136 or a cons of coding systems which are used as above.
11137
11138 See also the function `find-operation-coding-system'.  */);
11139   Vnetwork_coding_system_alist = Qnil;
11140
11141   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11142                doc: /* Coding system to use with system messages.
11143 Also used for decoding keyboard input on X Window system, and for
11144 encoding standard output and error streams.  */);
11145   Vlocale_coding_system = Qnil;
11146
11147   /* The eol mnemonics are reset in startup.el system-dependently.  */
11148   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11149                doc: /*
11150 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11151   eol_mnemonic_unix = build_pure_c_string (":");
11152
11153   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11154                doc: /*
11155 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11156   eol_mnemonic_dos = build_pure_c_string ("\\");
11157
11158   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11159                doc: /*
11160 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11161   eol_mnemonic_mac = build_pure_c_string ("/");
11162
11163   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11164                doc: /*
11165 String displayed in mode line when end-of-line format is not yet determined.  */);
11166   eol_mnemonic_undecided = build_pure_c_string (":");
11167
11168   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11169                doc: /*
11170 Non-nil enables character translation while encoding and decoding.  */);
11171   Venable_character_translation = Qt;
11172
11173   DEFVAR_LISP ("standard-translation-table-for-decode",
11174                Vstandard_translation_table_for_decode,
11175                doc: /* Table for translating characters while decoding.  */);
11176   Vstandard_translation_table_for_decode = Qnil;
11177
11178   DEFVAR_LISP ("standard-translation-table-for-encode",
11179                Vstandard_translation_table_for_encode,
11180                doc: /* Table for translating characters while encoding.  */);
11181   Vstandard_translation_table_for_encode = Qnil;
11182
11183   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11184                doc: /* Alist of charsets vs revision numbers.
11185 While encoding, if a charset (car part of an element) is found,
11186 designate it with the escape sequence identifying revision (cdr part
11187 of the element).  */);
11188   Vcharset_revision_table = Qnil;
11189
11190   DEFVAR_LISP ("default-process-coding-system",
11191                Vdefault_process_coding_system,
11192                doc: /* Cons of coding systems used for process I/O by default.
11193 The car part is used for decoding a process output,
11194 the cdr part is used for encoding a text to be sent to a process.  */);
11195   Vdefault_process_coding_system = Qnil;
11196
11197   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11198                doc: /*
11199 Table of extra Latin codes in the range 128..159 (inclusive).
11200 This is a vector of length 256.
11201 If Nth element is non-nil, the existence of code N in a file
11202 \(or output of subprocess) doesn't prevent it to be detected as
11203 a coding system of ISO 2022 variant which has a flag
11204 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11205 or reading output of a subprocess.
11206 Only 128th through 159th elements have a meaning.  */);
11207   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11208
11209   DEFVAR_LISP ("select-safe-coding-system-function",
11210                Vselect_safe_coding_system_function,
11211                doc: /*
11212 Function to call to select safe coding system for encoding a text.
11213
11214 If set, this function is called to force a user to select a proper
11215 coding system which can encode the text in the case that a default
11216 coding system used in each operation can't encode the text.  The
11217 function should take care that the buffer is not modified while
11218 the coding system is being selected.
11219
11220 The default value is `select-safe-coding-system' (which see).  */);
11221   Vselect_safe_coding_system_function = Qnil;
11222
11223   DEFVAR_BOOL ("coding-system-require-warning",
11224                coding_system_require_warning,
11225                doc: /* Internal use only.
11226 If non-nil, on writing a file, `select-safe-coding-system-function' is
11227 called even if `coding-system-for-write' is non-nil.  The command
11228 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11229   coding_system_require_warning = 0;
11230
11231
11232   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11233                inhibit_iso_escape_detection,
11234                doc: /*
11235 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11236
11237 When Emacs reads text, it tries to detect how the text is encoded.
11238 This code detection is sensitive to escape sequences.  If Emacs sees
11239 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11240 of the ISO2022 encodings, and decodes text by the corresponding coding
11241 system (e.g. `iso-2022-7bit').
11242
11243 However, there may be a case that you want to read escape sequences in
11244 a file as is.  In such a case, you can set this variable to non-nil.
11245 Then the code detection will ignore any escape sequences, and no text is
11246 detected as encoded in some ISO-2022 encoding.  The result is that all
11247 escape sequences become visible in a buffer.
11248
11249 The default value is nil, and it is strongly recommended not to change
11250 it.  That is because many Emacs Lisp source files that contain
11251 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11252 in Emacs's distribution, and they won't be decoded correctly on
11253 reading if you suppress escape sequence detection.
11254
11255 The other way to read escape sequences in a file without decoding is
11256 to explicitly specify some coding system that doesn't use ISO-2022
11257 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11258   inhibit_iso_escape_detection = 0;
11259
11260   DEFVAR_BOOL ("inhibit-null-byte-detection",
11261                inhibit_null_byte_detection,
11262                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11263 By default, Emacs treats it as binary data, and does not attempt to
11264 decode it.  The effect is as if you specified `no-conversion' for
11265 reading that text.
11266
11267 Set this to non-nil when a regular text happens to include null bytes.
11268 Examples are Index nodes of Info files and null-byte delimited output
11269 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11270 decode text as usual.  */);
11271   inhibit_null_byte_detection = 0;
11272
11273   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11274                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11275 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11276   disable_ascii_optimization = 0;
11277
11278   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11279                doc: /* Char table for translating self-inserting characters.
11280 This is applied to the result of input methods, not their input.
11281 See also `keyboard-translate-table'.
11282
11283 Use of this variable for character code unification was rendered
11284 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11285 internal character representation.  */);
11286   Vtranslation_table_for_input = Qnil;
11287
11288   Lisp_Object args[coding_arg_undecided_max];
11289   memclear (args, sizeof args);
11290
11291   Lisp_Object plist[] =
11292     {
11293       QCname,
11294       args[coding_arg_name] = Qno_conversion,
11295       QCmnemonic,
11296       args[coding_arg_mnemonic] = make_number ('='),
11297       intern_c_string (":coding-type"),
11298       args[coding_arg_coding_type] = Qraw_text,
11299       QCascii_compatible_p,
11300       args[coding_arg_ascii_compatible_p] = Qt,
11301       QCdefault_char,
11302       args[coding_arg_default_char] = make_number (0),
11303       intern_c_string (":for-unibyte"),
11304       args[coding_arg_for_unibyte] = Qt,
11305       intern_c_string (":docstring"),
11306       (build_pure_c_string
11307        ("Do no conversion.\n"
11308         "\n"
11309         "When you visit a file with this coding, the file is read into a\n"
11310         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11311         "character.")),
11312       intern_c_string (":eol-type"),
11313       args[coding_arg_eol_type] = Qunix,
11314     };
11315   args[coding_arg_plist] = CALLMANY (Flist, plist);
11316   Fdefine_coding_system_internal (coding_arg_max, args);
11317
11318   plist[1] = args[coding_arg_name] = Qundecided;
11319   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11320   plist[5] = args[coding_arg_coding_type] = Qundecided;
11321   /* This is already set.
11322      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11323   plist[8] = intern_c_string (":charset-list");
11324   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11325   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11326   plist[13] = build_pure_c_string ("No conversion on encoding, "
11327                                    "automatic conversion on decoding.");
11328   plist[15] = args[coding_arg_eol_type] = Qnil;
11329   args[coding_arg_plist] = CALLMANY (Flist, plist);
11330   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11331   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11332   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11333
11334   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11335
11336   for (int i = 0; i < coding_category_max; i++)
11337     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11338
11339 #if defined (DOS_NT)
11340   system_eol_type = Qdos;
11341 #else
11342   system_eol_type = Qunix;
11343 #endif
11344   staticpro (&system_eol_type);
11345 }
11346 #endif /* emacs */