src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2018 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking
  62   up Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
           /* A byte that cannot appear in XXX ends the scan and rejects
              the category below.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
           /* Positive evidence: remember that something specific to XXX
              was actually seen, so that the category is reported as
              found only in that case.  */
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
       /* CHARBUF is a plain `int *', so the end of the pending characters
          is computed from it directly.  */
 265   int *charbuf_end = charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
     /* NOTE(review): presumably the hash table that maps a coding system
        symbol to its attribute vector -- confirm against the definition
        of CODING_ID_ATTRS, which is not visible in this part of the
        file.  */
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 /* Format of end-of-line decided by system.  This is Qunix on
 305    Unix and Mac, Qdos on DOS/Windows.
 306    This has an effect only for external encoding (i.e. for output to
 307    file and process), not for in-buffer or Lisp string encoding.  */
 308 static Lisp_Object system_eol_type;
 309
 310 #ifdef emacs
 311
 312 /* Coding-systems are handed between Emacs Lisp programs and C internal
 313    routines by the following three variables.
        NOTE(review): only one variable, safe_terminal_coding, is defined
        inside this #ifdef; the "three variables" wording looks stale.  */
 314 /* Coding system to be used to encode text for terminal display when
 315    terminal coding system is nil.  */
 316 struct coding_system safe_terminal_coding;
 317
 318 #endif /* emacs */
 319
 320 /* Two special coding systems.  */
 321 static Lisp_Object Vsjis_coding_system;
 322 static Lisp_Object Vbig5_coding_system;
 323
 324 /* ISO2022 section */
 325
     /* The integer stored at index REG of the vector found in the
        coding_attr_iso_initial slot of CODING's attribute vector.  */
 326 #define CODING_ISO_INITIAL(coding, reg)                 \
 327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 328                      coding_attr_iso_initial),          \
 329                reg)))
 330
 331
     /* The value of CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID
        is greater than CODING->max_charset_id or the stored value is 255.
        CHARSET_ID is evaluated more than once, so it must not have side
        effects.  */
 332 #define CODING_ISO_REQUEST(coding, charset_id)          \
 333   (((charset_id) <= (coding)->max_charset_id            \
 334     ? ((coding)->safe_charsets[charset_id] != 255       \
 335        ? (coding)->safe_charsets[charset_id]            \
 336        : -1)                                            \
 337     : -1))
 338
 339
     /* Accessors for the members of CODING->spec.iso_2022.  Each expands
        to the member itself, so it can also be assigned to.  */
 340 #define CODING_ISO_FLAGS(coding)        \
 341   ((coding)->spec.iso_2022.flags)
 342 #define CODING_ISO_DESIGNATION(coding, reg)     \
 343   ((coding)->spec.iso_2022.current_designation[reg])
 344 #define CODING_ISO_INVOCATION(coding, plane)    \
 345   ((coding)->spec.iso_2022.current_invocation[plane])
 346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 347   ((coding)->spec.iso_2022.single_shifting)
 348 #define CODING_ISO_BOL(coding)  \
 349   ((coding)->spec.iso_2022.bol)
     /* The designation of the register currently invoked for PLANE, or -1
        if the invocation recorded for PLANE is negative.  */
 350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
     /* Unlike the accessors above, this one yields the address of the
        cmp_status member.  */
 353 #define CODING_ISO_CMP_STATUS(coding)   \
 354   (&(coding)->spec.iso_2022.cmp_status)
 355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
 360 /* Control characters of ISO2022.  */
 361                         /* code */      /* function */
 362 #define ISO_CODE_SO     0x0E            /* shift-out */
 363 #define ISO_CODE_SI     0x0F            /* shift-in */
 364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 365 #define ISO_CODE_ESC    0x1B            /* escape */
 366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 369
 370 /* Every 1-byte code of ISO2022 is classified into one of the
 371    following classes.  */
 372 enum iso_code_class_type
 373   {
 374     ISO_control_0,              /* Control codes in the range
 375                                    0x00..0x1F and 0x7F, except for the
 376                                    following 4 codes.
                                        NOTE(review): 0x7F is also named by
                                        ISO_0x20_or_0x7F below; confirm
                                        which class it really gets.  */
 377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 381     ISO_control_1,              /* Control codes in the range
 382                                    0x80..0x9F, except for the
 383                                    following 3 codes.  */
 384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 391   };
 392
 393 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 394     `iso-flags' attribute of an iso2022 coding system.  */
 395
 396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 397    instead of the correct short-form sequence (e.g. ESC $ A).  */
 398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 399
 400 /* If set, reset graphic planes and registers at end-of-line to the
 401    initial state.  */
 402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 403
 404 /* If set, reset graphic planes and registers before any control
 405    characters to the initial state.  */
 406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 407
 408 /* If set, encode by 7-bit environment.  */
 409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 410
 411 /* If set, use locking-shift function.  */
 412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 413
 414 /* If set, use single-shift function.  Overrides
 415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 417
 418 /* If set, use designation escape sequence.  */
 419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 420
 421 /* If set, produce revision number sequence.  */
 422 #define CODING_ISO_FLAG_REVISION        0x0080
 423
 424 /* If set, produce ISO6429's direction specifying sequence.  */
 425 #define CODING_ISO_FLAG_DIRECTION       0x0100
 426
 427 /* If set, assume designation states are reset at beginning of line on
 428    output.  */
 429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 430
 431 /* If set, designation sequence should be placed at beginning of line
 432    on output.  */
 433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 434
 435 /* If set, do not encode unsafe characters on output.  */
 436 #define CODING_ISO_FLAG_SAFE            0x0800
 437
 438 /* If set, extra latin codes (128..159) are accepted as a valid code
 439    on input.  */
 440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 441
     /* NOTE(review): the remaining flags carry no description in this
        file.  Judging by their names only: COMPOSITION presumably enables
        composition sequences, USE_ROMAN and USE_OLDJIS presumably select
        alternative JIS character sets, and LEVEL_4 / FULL_SUPPORT
        presumably select a conformance level -- confirm against their
        users before relying on this.  */
 442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 443
     /* Bit 0x4000 is left unused by the disabled definition below.  */
 444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 445
 446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 447
 448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 449
 450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 451
     /* Note that bits 0x40000 and 0x80000 are not assigned here.  */
 452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 453
 454 /* A character to be produced on output if encoding of the original
 455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
 458 /* UTF-8 section.  Accessor for CODING->spec.utf_8_bom.  */
 459 #define CODING_UTF_8_BOM(coding)        \
 460   ((coding)->spec.utf_8_bom)
 461
 462 /* UTF-16 section.  Accessors for the bom, endian and surrogate
        members of CODING->spec.utf_16.  */
 463 #define CODING_UTF_16_BOM(coding)       \
 464   ((coding)->spec.utf_16.bom)
 465
 466 #define CODING_UTF_16_ENDIAN(coding)    \
 467   ((coding)->spec.utf_16.endian)
 468
 469 #define CODING_UTF_16_SURROGATE(coding) \
 470   ((coding)->spec.utf_16.surrogate)
 471
 472
 473 /* CCL section.  Each macro reads one slot (coding_attr_ccl_decoder,
        coding_attr_ccl_encoder, coding_attr_ccl_valids) of CODING's
        attribute vector; CODING_CCL_VALIDS additionally applies SDATA to
        the slot, so that slot is expected to hold a Lisp string.  */
 474 #define CODING_CCL_DECODER(coding)      \
 475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 476 #define CODING_CCL_ENCODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 478 #define CODING_CCL_VALIDS(coding)                                          \
 479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
 481 /* Index for each coding category in `coding_categories'.  The value
        of each enumerator up to coding_category_raw_text is also used as
        the bit position of the matching CATEGORY_MASK_XXX flag, so the
        order here fixes the layout of those masks.  */
 482
 483 enum coding_category
 484   {
 485     coding_category_iso_7,
 486     coding_category_iso_7_tight,
 487     coding_category_iso_8_1,
 488     coding_category_iso_8_2,
 489     coding_category_iso_7_else,
 490     coding_category_iso_8_else,
 491     coding_category_utf_8_auto,
 492     coding_category_utf_8_nosig,
 493     coding_category_utf_8_sig,
 494     coding_category_utf_16_auto,
 495     coding_category_utf_16_be,
 496     coding_category_utf_16_le,
 497     coding_category_utf_16_be_nosig,
 498     coding_category_utf_16_le_nosig,
 499     coding_category_charset,
 500     coding_category_sjis,
 501     coding_category_big5,
 502     coding_category_ccl,
 503     coding_category_emacs_mule,
 504     /* All above are targets of code detection.  */
 505     coding_category_raw_text,
 506     coding_category_undecided,
         /* Number of categories; used as the size of the
            `coding_priorities' and `coding_categories' arrays.  */
 507     coding_category_max
 508   };
 509
 510 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        1 shifted left by the corresponding `enum coding_category'
        value.  */
 511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 531
 532 /* This value is returned if detect_coding_mask () finds nothing other
 533    than ASCII characters.  It is the union of every mask defined above
        except CATEGORY_MASK_RAW_TEXT.  */
 534 #define CATEGORY_MASK_ANY               \
 535   (CATEGORY_MASK_ISO_7                  \
 536    | CATEGORY_MASK_ISO_7_TIGHT          \
 537    | CATEGORY_MASK_ISO_8_1              \
 538    | CATEGORY_MASK_ISO_8_2              \
 539    | CATEGORY_MASK_ISO_7_ELSE           \
 540    | CATEGORY_MASK_ISO_8_ELSE           \
 541    | CATEGORY_MASK_UTF_8_AUTO           \
 542    | CATEGORY_MASK_UTF_8_NOSIG          \
 543    | CATEGORY_MASK_UTF_8_SIG            \
 544    | CATEGORY_MASK_UTF_16_AUTO          \
 545    | CATEGORY_MASK_UTF_16_BE            \
 546    | CATEGORY_MASK_UTF_16_LE            \
 547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 549    | CATEGORY_MASK_CHARSET              \
 550    | CATEGORY_MASK_SJIS                 \
 551    | CATEGORY_MASK_BIG5                 \
 552    | CATEGORY_MASK_CCL                  \
 553    | CATEGORY_MASK_EMACS_MULE)
 554
 555
     /* Unions of the ISO masks: the two 7-bit categories, the two 8-bit
        categories, and the two "else" categories.  */
 556 #define CATEGORY_MASK_ISO_7BIT \
 557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 558
 559 #define CATEGORY_MASK_ISO_8BIT \
 560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 561
 562 #define CATEGORY_MASK_ISO_ELSE \
 563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 564
     /* Every ISO category except ISO_8_1 and ISO_8_2.
        NOTE(review): presumably the categories that may contain escape
        sequences -- confirm against the detector that uses it.  */
 565 #define CATEGORY_MASK_ISO_ESCAPE        \
 566   (CATEGORY_MASK_ISO_7                  \
 567    | CATEGORY_MASK_ISO_7_TIGHT          \
 568    | CATEGORY_MASK_ISO_7_ELSE           \
 569    | CATEGORY_MASK_ISO_8_ELSE)
 570
     /* All six ISO categories.  */
 571 #define CATEGORY_MASK_ISO       \
 572   (  CATEGORY_MASK_ISO_7BIT     \
 573      | CATEGORY_MASK_ISO_8BIT   \
 574      | CATEGORY_MASK_ISO_ELSE)
 575
     /* All five UTF-16 categories.  */
 576 #define CATEGORY_MASK_UTF_16            \
 577   (CATEGORY_MASK_UTF_16_AUTO            \
 578    | CATEGORY_MASK_UTF_16_BE            \
 579    | CATEGORY_MASK_UTF_16_LE            \
 580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 582
     /* All three UTF-8 categories.  */
 583 #define CATEGORY_MASK_UTF_8     \
 584   (CATEGORY_MASK_UTF_8_AUTO     \
 585    | CATEGORY_MASK_UTF_8_NOSIG  \
 586    | CATEGORY_MASK_UTF_8_SIG)
 587
 588 /* Table of coding categories (Lisp symbols).  This variable is for
 589    internal use only.  */
 590 static Lisp_Object Vcoding_category_table;
 591
 592 /* Table of coding-categories ordered by priority.  It has one slot
        per category (coding_category_max elements).
        NOTE(review): presumably element 0 is the most preferred
        category -- confirm against the code that fills it.  */
 593 static enum coding_category coding_priorities[coding_category_max];
 594
 595 /* Nth element is a coding context for the coding system bound to the
 596    Nth coding category.  */
 597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Map the tri-state FLAG to an int: nil gives -1, t gives 1, else 0.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return ! NILP (flag) ? EQ (flag, Qt) : -1;
 605 }
 606
 607 /* Decide whether a flag counts as set.  An ENCODED_FLAG of 1 forces
 608    yes, -1 forces no, and 0 defers to the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return var ? 0 <= encoded_flag : 0 < encoded_flag;
 614 }
 615
     /* Store the attribute vector of CODING (looked up from CODING->id
        with CODING_ID_ATTRS) into ATTRS, then store the result of
        CODING_ATTR_CHARSET_LIST applied to it into CHARSET_LIST.  Both
        ATTRS and CHARSET_LIST must be assignable lvalues.  */
 616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 617   do {                                                  \
 618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 620   } while (0)
 621
     /* Run CHECK_NATNUM on the car of the cons X, then write the checked
        value back into the car.  */
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object head = XCAR (x);
 626   CHECK_NATNUM (head);
 627   XSETCAR (x, head);
 628 }
 629
     /* Run CHECK_NATNUM on the cdr of the cons X, then write the checked
        value back into the cdr.  */
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tail = XCDR (x);
 634   CHECK_NATNUM (tail);
 635   XSETCDR (x, tail);
 636 }
 637
 638 /* Can CODING's destination be enlarged (a buffer or a string)?  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return BUFFERP (coding->dst_object) || STRINGP (coding->dst_object);
 644 }
 645
 646
 647 /* Safely get one byte from the source text pointed by SRC which ends
 648    at SRC_END, set C to that byte, and advance SRC.  If there are not
 649    enough bytes in the source, jump to 'no_more_source'; when
 650    SRC_BASE < SRC at that point, CODING_RESULT_INSUFFICIENT_SRC is
 651    recorded in CODING first.
 652
 653    If MULTIBYTEP and the byte has its high bit set, there are two
        cases.  When the byte is 0xC0 or 0xC1, C becomes
        ((C & 1) << 6) | <next byte>, and SRC advances past that next
        byte, which is read without comparing SRC against SRC_END.
        Otherwise the whole multibyte character at SRC is read with
        string_char, C is set to the negative value of the character
        code, and CODING_RESULT_INVALID_SRC is recorded in CODING.

        In every non-jumping case CONSUMED_CHARS is incremented by one.

        The caller should declare and set these variables appropriately
        in advance:
             src, src_end, src_base, multibytep, consumed_chars, coding
        and must provide the label 'no_more_source'.  */
 654
 655 #define ONE_MORE_BYTE(c)                                \
 656   do {                                                  \
 657     if (src == src_end)                                 \
 658       {                                                 \
 659         if (src_base < src)                             \
 660           record_conversion_result                      \
 661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 662         goto no_more_source;                            \
 663       }                                                 \
 664     c = *src++;                                         \
 665     if (multibytep && (c & 0x80))                       \
 666       {                                                 \
 667         if ((c & 0xFE) == 0xC0)                         \
 668           c = ((c & 1) << 6) | *src++;                  \
 669         else                                            \
 670           {                                             \
 671             src--;                                      \
 672             c = - string_char (src, &src, NULL);        \
 673             record_conversion_result                    \
 674               (coding, CODING_RESULT_INVALID_SRC);      \
 675           }                                             \
 676       }                                                 \
 677     consumed_chars++;                                   \
 678   } while (0)
 679
 680 /* Safely get two bytes from the source text pointed by SRC which ends
 681    at SRC_END, and set C1 and C2 to those bytes while skipping the
 682    heading multibyte characters.  If there are not enough bytes in the
 683    source, it jumps to 'no_more_source' (no conversion result is
 684    recorded).  If MULTIBYTEP, a byte 0xC0 or 0xC1 is combined with the
 685    byte after it as ((B & 1) << 6) | <next byte>.  Any other multibyte
 686    character found while reading C1 is skipped by advancing SRC by
 687    BYTES_BY_CHAR_HEAD, and C1 is read again; one found while reading
 688    C2 sets C2 to -1 without skipping the rest of that character.
        Unlike ONE_MORE_BYTE, this macro does not touch CONSUMED_CHARS.
        The caller should declare and set these variables appropriately
        in advance:
             src, src_end, multibytep
        It is intended that this macro is used in detect_coding_utf_16.  */
 689
 690 #define TWO_MORE_BYTES(c1, c2)                          \
 691   do {                                                  \
 692     do {                                                \
 693       if (src == src_end)                               \
 694         goto no_more_source;                            \
 695       c1 = *src++;                                      \
 696       if (multibytep && (c1 & 0x80))                    \
 697         {                                               \
 698           if ((c1 & 0xFE) == 0xC0)                      \
 699             c1 = ((c1 & 1) << 6) | *src++;              \
 700           else                                          \
 701             {                                           \
 702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 703               c1 = -1;                                  \
 704             }                                           \
 705         }                                               \
 706     } while (c1 < 0);                                   \
 707     if (src == src_end)                                 \
 708       goto no_more_source;                              \
 709     c2 = *src++;                                        \
 710     if (multibytep && (c2 & 0x80))                      \
 711       {                                                 \
 712         if ((c2 & 0xFE) == 0xC0)                        \
 713           c2 = ((c2 & 1) << 6) | *src++;                \
 714         else                                            \
 715           c2 = -1;                                      \
 716       }                                                 \
 717   } while (0)
 718
 719
 720 /* Store a byte C in the place pointed by DST and increment DST to the
 721    next free point, and increment PRODUCED_CHARS.  The caller should
 722    assure that C is 0..127, and declare and set the variables `dst'
 723    and `produced_chars' appropriately in advance.  No check is made
        that DST has room for the byte.
 724 */
 725
 726
 727 #define EMIT_ONE_ASCII_BYTE(c)  \
 728   do {                          \
 729     produced_chars++;           \
 730     *dst++ = (c);               \
 731   } while (0)
 732
 733
 734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.
        PRODUCED_CHARS is incremented by two.  */
 735
 736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 737   do {                                  \
 738     produced_chars += 2;                \
 739     *dst++ = (c1), *dst++ = (c2);       \
 740   } while (0)
 741
 742
 743 /* Store a byte C in the place pointed by DST and increment DST to the
 744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 745    store in an appropriate multibyte form: a value of 0x80 or more is
 746    first converted with BYTE8_TO_CHAR, and the result is written with
 747    CHAR_STRING_ADVANCE, which also advances DST.  The caller should
        declare and set the variables `dst', `multibytep' and
        `produced_chars' appropriately in advance.  */
 748
 749 #define EMIT_ONE_BYTE(c)                \
 750   do {                                  \
 751     produced_chars++;                   \
 752     if (multibytep)                     \
 753       {                                 \
 754         unsigned ch = (c);              \
 755         if (ch >= 0x80)                 \
 756           ch = BYTE8_TO_CHAR (ch);      \
 757         CHAR_STRING_ADVANCE (ch, dst);  \
 758       }                                 \
 759     else                                \
 760       *dst++ = (c);                     \
 761   } while (0)
 762
 763
 764 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  PRODUCED_CHARS
        is incremented by two, and each byte is converted independently
        when MULTIBYTEP.  */
 765
 766 #define EMIT_TWO_BYTES(c1, c2)          \
 767   do {                                  \
 768     produced_chars += 2;                \
 769     if (multibytep)                     \
 770       {                                 \
 771         unsigned ch;                    \
 772                                         \
 773         ch = (c1);                      \
 774         if (ch >= 0x80)                 \
 775           ch = BYTE8_TO_CHAR (ch);      \
 776         CHAR_STRING_ADVANCE (ch, dst);  \
 777         ch = (c2);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781       }                                 \
 782     else                                \
 783       {                                 \
 784         *dst++ = (c1);                  \
 785         *dst++ = (c2);                  \
 786       }                                 \
 787   } while (0)
 788
 789
 790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 791   do {                                  \
 792     EMIT_ONE_BYTE (c1);                 \
 793     EMIT_TWO_BYTES (c2, c3);            \
 794   } while (0)
 795
 796
 797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 798   do {                                          \
 799     EMIT_TWO_BYTES (c1, c2);                    \
 800     EMIT_TWO_BYTES (c3, c4);                    \
 801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* If there are at least BYTES length of room at dst, allocate memory
 896    for coding->destination and update dst and dst_end.  We don't have
 897    to take care of coding->source which will be relocated.  It is
 898    handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
/* An annotation is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kinds of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1091
1092 /* Maximum length of the header of annotation data.  */
1093 #define MAX_ANNOTATION_LENGTH 5
1094
/* Append a three-element annotation header at the charbuf cursor BUF
   and advance BUF past it.  The elements are -LEN, MASK and NCHARS,
   in that order.  Also set coding->annotated, so a variable `coding'
   must be in scope at the point of use.

   The expansion has no trailing semicolon: an invocation followed by
   `;' is then exactly one statement and can serve as the body of an
   unbraced if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)

/* Append a composition annotation of five elements at BUF: the
   header for NCHARS characters with CODING_ANNOTATE_COMPOSITION_MASK,
   followed by NBYTES and METHOD.  BUF is advanced past all five.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)


/* Append a charset annotation of four elements at BUF: the header
   for NCHARS characters with CODING_ANNOTATE_CHARSET_MASK, followed
   by the charset ID.  BUF is advanced past all four.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Bit flags for coding->eol_seen.  Each flag is a distinct bit, so
   several may be combined with `|'; EOL_SEEN_NONE has no bit set.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
/* Classification of a single octet C by its high bits.  */

/* 0xxxxxxx: a complete one-octet character.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet of a longer sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: leads a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leads a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: leads a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: leads a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 byte order mark, in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
1147 /* Unlike the other detect_coding_XXX, this function counts the number
1148    of characters and checks the EOL format.  */
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   ptrdiff_t nchars = coding->head_ascii;
1160   int eol_seen = coding->eol_seen;
1161
1162   detect_info->checked |= CATEGORY_MASK_UTF_8;
1163   /* A coding system of this category is always ASCII compatible.  */
1164   src += nchars;
1165
1166   if (src == coding->source     /* BOM should be at the head.  */
1167       && src + 3 < src_end      /* BOM is 3-byte long.  */
1168       && src[0] == UTF_8_BOM_1
1169       && src[1] == UTF_8_BOM_2
1170       && src[2] == UTF_8_BOM_3)
1171     {
1172       bom_found = 1;
1173       src += 3;
1174       nchars++;
1175     }
1176
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4;
1180
1181       src_base = src;
1182       ONE_MORE_BYTE (c);
1183       if (c < 0 || UTF_8_1_OCTET_P (c))
1184         {
1185           nchars++;
1186           if (c == '\r')
1187             {
1188               if (src < src_end && *src == '\n')
1189                 {
1190                   eol_seen |= EOL_SEEN_CRLF;
1191                   src++;
1192                   nchars++;
1193                 }
1194               else
1195                 eol_seen |= EOL_SEEN_CR;
1196             }
1197           else if (c == '\n')
1198             eol_seen |= EOL_SEEN_LF;
1199           continue;
1200         }
1201       ONE_MORE_BYTE (c1);
1202       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1203         break;
1204       if (UTF_8_2_OCTET_LEADING_P (c))
1205         {
1206           nchars++;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c2);
1210       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1211         break;
1212       if (UTF_8_3_OCTET_LEADING_P (c))
1213         {
1214           nchars++;
1215           continue;
1216         }
1217       ONE_MORE_BYTE (c3);
1218       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1219         break;
1220       if (UTF_8_4_OCTET_LEADING_P (c))
1221         {
1222           nchars++;
1223           continue;
1224         }
1225       ONE_MORE_BYTE (c4);
1226       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227         break;
1228       if (UTF_8_5_OCTET_LEADING_P (c)
1229           /* If we ever need to increase MAX_CHAR, the below may need
1230              to be reviewed.  */
1231           && c < MAX_MULTIBYTE_LEADING_CODE)
1232         {
1233           nchars++;
1234           continue;
1235         }
1236       break;
1237     }
1238   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1239   return 0;
1240
1241  no_more_source:
1242   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1243     {
1244       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1245       return 0;
1246     }
1247   if (bom_found)
1248     {
1249       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1250       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1251     }
1252   else
1253     {
1254       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1255       if (nchars < src_end - coding->source)
1256         /* The found characters are less than source bytes, which
1257            means that we found a valid non-ASCII characters.  */
1258         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1259     }
1260   coding->detected_utf8_bytes = src_base - coding->source;
1261   coding->detected_utf8_chars = nchars;
1262   return 1;
1263 }
1264
1265
1266 static void
1267 decode_coding_utf_8 (struct coding_system *coding)
1268 {
1269   const unsigned char *src = coding->source + coding->consumed;
1270   const unsigned char *src_end = coding->source + coding->src_bytes;
1271   const unsigned char *src_base;
1272   int *charbuf = coding->charbuf + coding->charbuf_used;
1273   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1274   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1275   bool multibytep = coding->src_multibyte;
1276   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1277   bool eol_dos
1278     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1279   int byte_after_cr = -1;
1280
1281   if (bom != utf_without_bom)
1282     {
1283       int c1, c2, c3;
1284
1285       src_base = src;
1286       ONE_MORE_BYTE (c1);
1287       if (! UTF_8_3_OCTET_LEADING_P (c1))
1288         src = src_base;
1289       else
1290         {
1291           ONE_MORE_BYTE (c2);
1292           if (! UTF_8_EXTRA_OCTET_P (c2))
1293             src = src_base;
1294           else
1295             {
1296               ONE_MORE_BYTE (c3);
1297               if (! UTF_8_EXTRA_OCTET_P (c3))
1298                 src = src_base;
1299               else
1300                 {
1301                   if ((c1 != UTF_8_BOM_1)
1302                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1303                     src = src_base;
1304                   else
1305                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1306                 }
1307             }
1308         }
1309     }
1310   CODING_UTF_8_BOM (coding) = utf_without_bom;
1311
1312   while (1)
1313     {
1314       int c, c1, c2, c3, c4, c5;
1315
1316       src_base = src;
1317       consumed_chars_base = consumed_chars;
1318
1319       if (charbuf >= charbuf_end)
1320         {
1321           if (byte_after_cr >= 0)
1322             src_base--;
1323           break;
1324         }
1325
1326       /* In the simple case, rapidly handle ordinary characters */
1327       if (multibytep && ! eol_dos
1328           && charbuf < charbuf_end - 6 && src < src_end - 6)
1329         {
1330           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1331             {
1332               c1 = *src;
1333               if (c1 & 0x80)
1334                 break;
1335               src++;
1336               consumed_chars++;
1337               *charbuf++ = c1;
1338
1339               c1 = *src;
1340               if (c1 & 0x80)
1341                 break;
1342               src++;
1343               consumed_chars++;
1344               *charbuf++ = c1;
1345
1346               c1 = *src;
1347               if (c1 & 0x80)
1348                 break;
1349               src++;
1350               consumed_chars++;
1351               *charbuf++ = c1;
1352
1353               c1 = *src;
1354               if (c1 & 0x80)
1355                 break;
1356               src++;
1357               consumed_chars++;
1358               *charbuf++ = c1;
1359             }
1360           /* If we handled at least one character, restart the main loop.  */
1361           if (src != src_base)
1362             continue;
1363         }
1364
1365       if (byte_after_cr >= 0)
1366         c1 = byte_after_cr, byte_after_cr = -1;
1367       else
1368         ONE_MORE_BYTE (c1);
1369       if (c1 < 0)
1370         {
1371           c = - c1;
1372         }
1373       else if (UTF_8_1_OCTET_P (c1))
1374         {
1375           if (eol_dos && c1 == '\r')
1376             ONE_MORE_BYTE (byte_after_cr);
1377           c = c1;
1378         }
1379       else
1380         {
1381           ONE_MORE_BYTE (c2);
1382           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1383             goto invalid_code;
1384           if (UTF_8_2_OCTET_LEADING_P (c1))
1385             {
1386               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1387               /* Reject overlong sequences here and below.  Encoders
1388                  producing them are incorrect, they can be misleading,
1389                  and they mess up read/write invariance.  */
1390               if (c < 128)
1391                 goto invalid_code;
1392             }
1393           else
1394             {
1395               ONE_MORE_BYTE (c3);
1396               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1397                 goto invalid_code;
1398               if (UTF_8_3_OCTET_LEADING_P (c1))
1399                 {
1400                   c = (((c1 & 0xF) << 12)
1401                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1402                   if (c < 0x800
1403                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1404                     goto invalid_code;
1405                 }
1406               else
1407                 {
1408                   ONE_MORE_BYTE (c4);
1409                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1410                     goto invalid_code;
1411                   if (UTF_8_4_OCTET_LEADING_P (c1))
1412                     {
1413                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1414                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1415                     if (c < 0x10000)
1416                       goto invalid_code;
1417                     }
1418                   else
1419                     {
1420                       ONE_MORE_BYTE (c5);
1421                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1422                         goto invalid_code;
1423                       if (UTF_8_5_OCTET_LEADING_P (c1))
1424                         {
1425                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1426                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1427                                | (c5 & 0x3F));
1428                           if ((c > MAX_CHAR) || (c < 0x200000))
1429                             goto invalid_code;
1430                         }
1431                       else
1432                         goto invalid_code;
1433                     }
1434                 }
1435             }
1436         }
1437
1438       *charbuf++ = c;
1439       continue;
1440
1441     invalid_code:
1442       src = src_base;
1443       consumed_chars = consumed_chars_base;
1444       ONE_MORE_BYTE (c);
1445       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1446     }
1447
1448  no_more_source:
1449   coding->consumed_char += consumed_chars_base;
1450   coding->consumed = src_base - coding->source;
1451   coding->charbuf_used = charbuf - coding->charbuf;
1452 }
1453
1454
1455 bool
1456 encode_coding_utf_8 (struct coding_system *coding)
1457 {
1458   bool multibytep = coding->dst_multibyte;
1459   int *charbuf = coding->charbuf;
1460   int *charbuf_end = charbuf + coding->charbuf_used;
1461   unsigned char *dst = coding->destination + coding->produced;
1462   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1463   ptrdiff_t produced_chars = 0;
1464   int c;
1465
1466   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1467     {
1468       ASSURE_DESTINATION (3);
1469       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1470       CODING_UTF_8_BOM (coding) = utf_without_bom;
1471     }
1472
1473   if (multibytep)
1474     {
1475       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1476
1477       while (charbuf < charbuf_end)
1478         {
1479           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1480
1481           ASSURE_DESTINATION (safe_room);
1482           c = *charbuf++;
1483           if (CHAR_BYTE8_P (c))
1484             {
1485               c = CHAR_TO_BYTE8 (c);
1486               EMIT_ONE_BYTE (c);
1487             }
1488           else
1489             {
1490               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1491               for (p = str; p < pend; p++)
1492                 EMIT_ONE_BYTE (*p);
1493             }
1494         }
1495     }
1496   else
1497     {
1498       int safe_room = MAX_MULTIBYTE_LENGTH;
1499
1500       while (charbuf < charbuf_end)
1501         {
1502           ASSURE_DESTINATION (safe_room);
1503           c = *charbuf++;
1504           if (CHAR_BYTE8_P (c))
1505             *dst++ = CHAR_TO_BYTE8 (c);
1506           else
1507             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1508         }
1509       produced_chars = dst - (coding->destination + coding->produced);
1510     }
1511   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1512   coding->produced_char += produced_chars;
1513   coding->produced = dst - coding->destination;
1514   return 0;
1515 }
1516
1517
1518 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1519    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1520
1521 static bool
1522 detect_coding_utf_16 (struct coding_system *coding,
1523                       struct coding_detection_info *detect_info)
1524 {
1525   const unsigned char *src = coding->source;
1526   const unsigned char *src_end = coding->source + coding->src_bytes;
1527   bool multibytep = coding->src_multibyte;
1528   int c1, c2;
1529
1530   detect_info->checked |= CATEGORY_MASK_UTF_16;
1531   if (coding->mode & CODING_MODE_LAST_BLOCK
1532       && (coding->src_chars & 1))
1533     {
1534       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1535       return 0;
1536     }
1537
1538   TWO_MORE_BYTES (c1, c2);
1539   if ((c1 == 0xFF) && (c2 == 0xFE))
1540     {
1541       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1542                              | CATEGORY_MASK_UTF_16_AUTO);
1543       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1544                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1545                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1546     }
1547   else if ((c1 == 0xFE) && (c2 == 0xFF))
1548     {
1549       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1550                              | CATEGORY_MASK_UTF_16_AUTO);
1551       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1552                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1553                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1554     }
1555   else if (c2 < 0)
1556     {
1557       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1558       return 0;
1559     }
1560   else
1561     {
1562       /* We check the dispersion of Eth and Oth bytes where E is even and
1563          O is odd.  If both are high, we assume binary data.*/
1564       unsigned char e[256], o[256];
1565       unsigned e_num = 1, o_num = 1;
1566
1567       memset (e, 0, 256);
1568       memset (o, 0, 256);
1569       e[c1] = 1;
1570       o[c2] = 1;
1571
1572       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1573                                 |CATEGORY_MASK_UTF_16_BE
1574                                 | CATEGORY_MASK_UTF_16_LE);
1575
1576       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1577              != CATEGORY_MASK_UTF_16)
1578         {
1579           TWO_MORE_BYTES (c1, c2);
1580           if (c2 < 0)
1581             break;
1582           if (! e[c1])
1583             {
1584               e[c1] = 1;
1585               e_num++;
1586               if (e_num >= 128)
1587                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1588             }
1589           if (! o[c2])
1590             {
1591               o[c2] = 1;
1592               o_num++;
1593               if (o_num >= 128)
1594                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1595             }
1596         }
1597       return 0;
1598     }
1599
1600  no_more_source:
1601   return 1;
1602 }
1603
1604 static void
1605 decode_coding_utf_16 (struct coding_system *coding)
1606 {
1607   const unsigned char *src = coding->source + coding->consumed;
1608   const unsigned char *src_end = coding->source + coding->src_bytes;
1609   const unsigned char *src_base;
1610   int *charbuf = coding->charbuf + coding->charbuf_used;
1611   /* We may produces at most 3 chars in one loop.  */
1612   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1613   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1614   bool multibytep = coding->src_multibyte;
1615   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1616   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1617   int surrogate = CODING_UTF_16_SURROGATE (coding);
1618   bool eol_dos
1619     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1620   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1621
1622   if (bom == utf_with_bom)
1623     {
1624       int c, c1, c2;
1625
1626       src_base = src;
1627       ONE_MORE_BYTE (c1);
1628       ONE_MORE_BYTE (c2);
1629       c = (c1 << 8) | c2;
1630
1631       if (endian == utf_16_big_endian
1632           ? c != 0xFEFF : c != 0xFFFE)
1633         {
1634           /* The first two bytes are not BOM.  Treat them as bytes
1635              for a normal character.  */
1636           src = src_base;
1637         }
1638       CODING_UTF_16_BOM (coding) = utf_without_bom;
1639     }
1640   else if (bom == utf_detect_bom)
1641     {
1642       /* We have already tried to detect BOM and failed in
1643          detect_coding.  */
1644       CODING_UTF_16_BOM (coding) = utf_without_bom;
1645     }
1646
1647   while (1)
1648     {
1649       int c, c1, c2;
1650
1651       src_base = src;
1652       consumed_chars_base = consumed_chars;
1653
1654       if (charbuf >= charbuf_end)
1655         {
1656           if (byte_after_cr1 >= 0)
1657             src_base -= 2;
1658           break;
1659         }
1660
1661       if (byte_after_cr1 >= 0)
1662         c1 = byte_after_cr1, byte_after_cr1 = -1;
1663       else
1664         ONE_MORE_BYTE (c1);
1665       if (c1 < 0)
1666         {
1667           *charbuf++ = -c1;
1668           continue;
1669         }
1670       if (byte_after_cr2 >= 0)
1671         c2 = byte_after_cr2, byte_after_cr2 = -1;
1672       else
1673         ONE_MORE_BYTE (c2);
1674       if (c2 < 0)
1675         {
1676           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1677           *charbuf++ = -c2;
1678           continue;
1679         }
1680       c = (endian == utf_16_big_endian
1681            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1682
1683       if (surrogate)
1684         {
1685           if (! UTF_16_LOW_SURROGATE_P (c))
1686             {
1687               if (endian == utf_16_big_endian)
1688                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1689               else
1690                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1691               *charbuf++ = c1;
1692               *charbuf++ = c2;
1693               if (UTF_16_HIGH_SURROGATE_P (c))
1694                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1695               else
1696                 *charbuf++ = c;
1697             }
1698           else
1699             {
1700               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1701               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1702               *charbuf++ = 0x10000 + c;
1703             }
1704         }
1705       else
1706         {
1707           if (UTF_16_HIGH_SURROGATE_P (c))
1708             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1709           else
1710             {
1711               if (eol_dos && c == '\r')
1712                 {
1713                   ONE_MORE_BYTE (byte_after_cr1);
1714                   ONE_MORE_BYTE (byte_after_cr2);
1715                 }
1716               *charbuf++ = c;
1717             }
1718         }
1719     }
1720
1721  no_more_source:
1722   coding->consumed_char += consumed_chars_base;
1723   coding->consumed = src_base - coding->source;
1724   coding->charbuf_used = charbuf - coding->charbuf;
1725 }
1726
1727 static bool
1728 encode_coding_utf_16 (struct coding_system *coding)
1729 {
1730   bool multibytep = coding->dst_multibyte;
1731   int *charbuf = coding->charbuf;
1732   int *charbuf_end = charbuf + coding->charbuf_used;
1733   unsigned char *dst = coding->destination + coding->produced;
1734   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1735   int safe_room = 8;
1736   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1737   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1738   ptrdiff_t produced_chars = 0;
1739   int c;
1740
1741   if (bom != utf_without_bom)
1742     {
1743       ASSURE_DESTINATION (safe_room);
1744       if (big_endian)
1745         EMIT_TWO_BYTES (0xFE, 0xFF);
1746       else
1747         EMIT_TWO_BYTES (0xFF, 0xFE);
1748       CODING_UTF_16_BOM (coding) = utf_without_bom;
1749     }
1750
1751   while (charbuf < charbuf_end)
1752     {
1753       ASSURE_DESTINATION (safe_room);
1754       c = *charbuf++;
1755       if (c > MAX_UNICODE_CHAR)
1756         c = coding->default_char;
1757
1758       if (c < 0x10000)
1759         {
1760           if (big_endian)
1761             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1762           else
1763             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1764         }
1765       else
1766         {
1767           int c1, c2;
1768
1769           c -= 0x10000;
1770           c1 = (c >> 10) + 0xD800;
1771           c2 = (c & 0x3FF) + 0xDC00;
1772           if (big_endian)
1773             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1774           else
1775             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1776         }
1777     }
1778   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1779   coding->produced = dst - coding->destination;
1780   coding->produced_char += produced_chars;
1781   return 0;
1782 }
1783
1784 \f
1785 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1786
1787 /* Emacs' internal format for representation of multiple character
1788    sets is a kind of multi-byte encoding, i.e. characters are
1789    represented by variable-length sequences of one-byte codes.
1790
1791    ASCII characters and control characters (e.g. `tab', `newline') are
1792    represented by one-byte sequences which are their ASCII codes, in
1793    the range 0x00 through 0x7F.
1794
1795    8-bit characters of the range 0x80..0x9F are represented by
1796    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1797    code + 0x20).
1798
1799    8-bit characters of the range 0xA0..0xFF are represented by
1800    one-byte sequences which are their 8-bit code.
1801
1802    The other characters are represented by a sequence of `base
1803    leading-code', optional `extended leading-code', and one or two
1804    `position-code's.  The length of the sequence is determined by the
1805    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1806    whereas extended leading-code and position-code take the range 0xA0
1807    through 0xFF.  See `charset.h' for more details about leading-code
1808    and position-code.
1809
1810    --- CODE RANGE of Emacs' internal format ---
1811    character set        range
1812    -------------        -----
1813    ascii                0x00..0x7F
1814    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1816    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1817    ---------------------------------------------
1818
1819    As this is the internal character representation, the format is
1820    usually not used externally (i.e. in a file or in a data sent to a
1821    process).  But, it is possible to have a text externally in this
1822    format (i.e. by encoding by the coding system `emacs-mule').
1823
1824    In that case, a sequence of one-byte codes has a slightly different
1825    form.
1826
1827    At first, all characters in eight-bit-control are represented by
1828    one-byte sequences which are their 8-bit code.
1829
1830    Next, character composition data are represented by the byte
1831    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1832    where,
        METHOD is 0xF2 plus one of the composition methods (enum
        composition_method),
1835
1836         BYTES is 0xA0 plus a byte length of this composition data,
1837
1838         CHARS is 0xA0 plus a number of characters composed by this
1839         data,
1840
1841         COMPONENTs are characters of multibyte form or composition
1842         rules encoded by two-byte of ASCII codes.
1843
1844    In addition, for backward compatibility, the following formats are
1845    also recognized as composition data on decoding.
1846
1847    0x80 MSEQ ...
1848    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1849
1850    Here,
        MSEQ is a multibyte form, but in one of these special formats:
1852           ASCII: 0xA0 ASCII_CODE+0x80,
1853           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1854         RULE is a one byte code of the range 0xA0..0xF0 that
1855         represents a composition rule.
1856   */
1857
/* Table indexed by the byte that starts an emacs-mule sequence.
   emacs_mule_char switches on the stored value and expects it to be
   1, 2, 3 or 4 (anything else aborts); detect_coding_emacs_mule
   treats the value minus one as the number of bytes that must follow
   the starting byte.  NOTE(review): no initializer is given here;
   presumably the entries are filled in elsewhere -- confirm.  */
char emacs_mule_bytes[256];
1859
1860
1861 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1862    Return true if a text is encoded in 'emacs-mule'.  */
1863
1864 static bool
1865 detect_coding_emacs_mule (struct coding_system *coding,
1866                           struct coding_detection_info *detect_info)
1867 {
1868   const unsigned char *src = coding->source, *src_base;
1869   const unsigned char *src_end = coding->source + coding->src_bytes;
1870   bool multibytep = coding->src_multibyte;
1871   ptrdiff_t consumed_chars = 0;
1872   int c;
1873   int found = 0;
1874
1875   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1876   /* A coding system of this category is always ASCII compatible.  */
1877   src += coding->head_ascii;
1878
1879   while (1)
1880     {
1881       src_base = src;
1882       ONE_MORE_BYTE (c);
1883       if (c < 0)
1884         continue;
1885       if (c == 0x80)
1886         {
1887           /* Perhaps the start of composite character.  We simply skip
1888              it because analyzing it is too heavy for detecting.  But,
1889              at least, we check that the composite character
1890              constitutes of more than 4 bytes.  */
1891           const unsigned char *src_start;
1892
1893         repeat:
1894           src_start = src;
1895           do
1896             {
1897               ONE_MORE_BYTE (c);
1898             }
1899           while (c >= 0xA0);
1900
1901           if (src - src_start <= 4)
1902             break;
1903           found = CATEGORY_MASK_EMACS_MULE;
1904           if (c == 0x80)
1905             goto repeat;
1906         }
1907
1908       if (c < 0x80)
1909         {
1910           if (c < 0x20
1911               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1912             break;
1913         }
1914       else
1915         {
1916           int more_bytes = emacs_mule_bytes[c] - 1;
1917
1918           while (more_bytes > 0)
1919             {
1920               ONE_MORE_BYTE (c);
1921               if (c < 0xA0)
1922                 {
1923                   src--;        /* Unread the last byte.  */
1924                   break;
1925                 }
1926               more_bytes--;
1927             }
1928           if (more_bytes != 0)
1929             break;
1930           found = CATEGORY_MASK_EMACS_MULE;
1931         }
1932     }
1933   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1934   return 0;
1935
1936  no_more_source:
1937   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1938     {
1939       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1940       return 0;
1941     }
1942   detect_info->found |= found;
1943   return 1;
1944 }
1945
1946
1947 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1948    character.  If CMP_STATUS indicates that we must expect MSEQ or
1949    RULE described above, decode it and return the negative value of
1950    the decoded character or rule.  If an invalid byte is found, return
1951    -1.  If SRC is too short, return -2.  */
1952
1953 static int
1954 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1955                  int *nbytes, int *nchars, int *id,
1956                  struct composition_status *cmp_status)
1957 {
1958   const unsigned char *src_end = coding->source + coding->src_bytes;
1959   const unsigned char *src_base = src;
1960   bool multibytep = coding->src_multibyte;
1961   int charset_ID;
1962   unsigned code;
1963   int c;
1964   ptrdiff_t consumed_chars = 0;
1965   bool mseq_found = 0;
1966
1967   ONE_MORE_BYTE (c);
1968   if (c < 0)
1969     {
1970       c = -c;
1971       charset_ID = emacs_mule_charset[0];
1972     }
1973   else
1974     {
1975       if (c >= 0xA0)
1976         {
1977           if (cmp_status->state != COMPOSING_NO
1978               && cmp_status->old_form)
1979             {
1980               if (cmp_status->state == COMPOSING_CHAR)
1981                 {
1982                   if (c == 0xA0)
1983                     {
1984                       ONE_MORE_BYTE (c);
1985                       c -= 0x80;
1986                       if (c < 0)
1987                         goto invalid_code;
1988                     }
1989                   else
1990                     c -= 0x20;
1991                   mseq_found = 1;
1992                 }
1993               else
1994                 {
1995                   *nbytes = src - src_base;
1996                   *nchars = consumed_chars;
1997                   return -c;
1998                 }
1999             }
2000           else
2001             goto invalid_code;
2002         }
2003
2004       switch (emacs_mule_bytes[c])
2005         {
2006         case 2:
2007           if ((charset_ID = emacs_mule_charset[c]) < 0)
2008             goto invalid_code;
2009           ONE_MORE_BYTE (c);
2010           if (c < 0xA0)
2011             goto invalid_code;
2012           code = c & 0x7F;
2013           break;
2014
2015         case 3:
2016           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2017               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2018             {
2019               ONE_MORE_BYTE (c);
2020               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2021                 goto invalid_code;
2022               ONE_MORE_BYTE (c);
2023               if (c < 0xA0)
2024                 goto invalid_code;
2025               code = c & 0x7F;
2026             }
2027           else
2028             {
2029               if ((charset_ID = emacs_mule_charset[c]) < 0)
2030                 goto invalid_code;
2031               ONE_MORE_BYTE (c);
2032               if (c < 0xA0)
2033                 goto invalid_code;
2034               code = (c & 0x7F) << 8;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code |= c & 0x7F;
2039             }
2040           break;
2041
2042         case 4:
2043           ONE_MORE_BYTE (c);
2044           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2045             goto invalid_code;
2046           ONE_MORE_BYTE (c);
2047           if (c < 0xA0)
2048             goto invalid_code;
2049           code = (c & 0x7F) << 8;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code |= c & 0x7F;
2054           break;
2055
2056         case 1:
2057           code = c;
2058           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2059           break;
2060
2061         default:
2062           emacs_abort ();
2063         }
2064       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2065                           CHARSET_FROM_ID (charset_ID), code, c);
2066       if (c < 0)
2067         goto invalid_code;
2068     }
2069   *nbytes = src - src_base;
2070   *nchars = consumed_chars;
2071   if (id)
2072     *id = charset_ID;
2073   return (mseq_found ? -c : c);
2074
2075  no_more_source:
2076   return -2;
2077
2078  invalid_code:
2079   return -1;
2080 }
2081
2082
2083 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2084
/* Handle these composition sequences ('|': the end of header elements,
   BYTES and CHARS >= 0xA0):
2087
2088    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2089    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2090    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2091
   and these old forms:
2093
2094    (4) relative composition: 0x80 | MSEQ ... MSEQ
2095    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2096
2097    When the starter 0x80 and the following header elements are found,
2098    this annotation header is produced.
2099
2100         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2101
2102    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2103    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2104
2105    Then, upon reading the following elements, these codes are produced
2106    until the composition end is found:
2107
2108    (1) CHAR ... CHAR
2109    (2) ALT ... ALT CHAR ... CHAR
2110    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2111    (4) CHAR ... CHAR
2112    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2113
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2116
2117    (1) LENGTH: unchanged, NCHARS: unchanged
2118    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2119    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2120    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2121    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2122
2123    If an error is found while composing, the annotation header is
2124    changed to the original composition header (plus filler -1s) as
2125    below:
2126
2127    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2129
2130    and the sequence [ -2 DECODED-RULE ] is changed to the original
2131    byte sequence as below:
2132         o the original byte sequence is B: [ B -1 ]
2133         o the original byte sequence is B1 B2: [ B1 B2 ]
2134
   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (thus they don't
   increase the size of the compiled object).  */
2139
/* Decode a composition rule represented by C as a component of a
   composition sequence of Emacs 20 style, and set RULE to the
   decoded rule.

   C must be a modifiable variable: 0xA0 is subtracted from it in
   place.  The adjusted value must lie in the range 0..80; otherwise
   control jumps to the label `invalid_code', which the expansion
   site must provide.  The adjusted value is split into its quotient
   and remainder by 9; each of the two is replaced by 10 when it
   equals 4, and the pair is combined with COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (c < 0 || c >= 81)                               \
      goto invalid_code;                                \
    gref = c / 9, nref = c % 9;                         \
    if (gref == 4) gref = 10;                           \
    if (nref == 4) nref = 10;                           \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2156
2157
/* Decode a composition rule represented by C and the following byte
   at SRC as a component of a composition sequence of Emacs 21 style,
   and set RULE to the decoded rule.

   C holds the first byte on entry; the second byte is fetched with
   ONE_MORE_BYTE, which overwrites C.  Each byte minus 0x20 must lie
   in the range 0..80; otherwise control jumps to the label
   `invalid_code', which the expansion site must provide.  The two
   adjusted values are combined with COMPOSITION_ENCODE_RULE, the
   first byte's value first.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (gref < 0 || gref >= 81)                         \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (nref < 0 || nref >= 81)                         \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2175
2176
/* Start of Emacs 21 style format.  On entry the variable `c' of the
   expansion site holds the METHOD byte, already read; the
   composition method is that byte minus 0xF2.  The next two bytes
   are then read: BYTES, which is 0xA0 plus the byte length of this
   composition information, and CHARS, which is 0xA0 plus the number
   of characters composed by this composition.

   Control jumps to the label `invalid_code' if the BYTES value read
   is negative, if the byte length is less than 3, if the method is
   COMPOSITION_RELATIVE and the byte length is not 4, or if the
   number of characters is not positive or is
   MAX_COMPOSITION_COMPONENTS or more.

   Otherwise *CMP_STATUS is set up for a new-style composition: the
   state is COMPOSING_CHAR for COMPOSITION_RELATIVE and
   COMPOSING_COMPONENT_CHAR for the other methods, `length' is
   MAX_ANNOTATION_LENGTH, `nchars' is the number of characters and
   `ncomps' is the byte length minus 4.  Finally the composition
   annotation is added to `charbuf' with ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2208
2209
/* Start of Emacs 20 style format for relative composition.  Mark
   *CMP_STATUS as an old-form composition with method
   COMPOSITION_RELATIVE in state COMPOSING_CHAR, set `length' to
   MAX_ANNOTATION_LENGTH and clear `nchars' and `ncomps'.  Then add
   a composition annotation to `charbuf' with ADD_COMPOSITION_DATA,
   passing 0 for both the number of characters and the number of
   bytes.  Uses the variables `cmp_status' and `charbuf' of the
   expansion site.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2221
2222
/* Start of Emacs 20 style format for rule-base composition.  Mark
   *CMP_STATUS as an old-form composition with method
   COMPOSITION_WITH_RULE in state COMPOSING_CHAR, set `length' to
   MAX_ANNOTATION_LENGTH and clear `nchars' and `ncomps'.  Then add
   a composition annotation to `charbuf' with ADD_COMPOSITION_DATA,
   passing 0 for both the number of characters and the number of
   bytes.  Uses the variables `cmp_status' and `charbuf' of the
   expansion site.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2234
2235
/* Parse the byte that follows a composition starter and begin the
   matching kind of composition.  One byte is read into the variable
   `c' of the expansion site and tested in this order:

     - `c' is negative: jump to `invalid_code';
     - `c' - 0xF2 lies in the range COMPOSITION_RELATIVE ..
       COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style composition;
     - `c' is less than 0xA0: jump to `invalid_code';
     - `c' is less than 0xC0: Emacs 20 style relative composition;
       `src' is then moved back to where it was before the read so
       that the byte is read again as a composition component;
     - `c' is 0xFF: Emacs 20 style rule-base composition;
     - anything else: jump to `invalid_code'.

   The expansion site must provide the label `invalid_code' and the
   variables used by the macros invoked here.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c < 0xA0)                                  \
      goto invalid_code;                                \
    else if (c < 0xC0)                                  \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2259
/* Finish the composition described by *CMP_STATUS.  The annotation
   header is taken to start CMP_STATUS->length elements before
   `charbuf'.  For an old-form composition, store CMP_STATUS->nchars
   in the third element of the header.  Otherwise, if the method is
   greater than COMPOSITION_RELATIVE, store in the first element of
   the header its third element minus CMP_STATUS->length.  In every
   case set the state to COMPOSING_NO.  Uses the variables
   `cmp_status' and `charbuf' of the expansion site.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int idx = - cmp_status->length;                             \
                                                                \
    if (cmp_status->old_form)                                   \
      charbuf[idx + 2] = cmp_status->nchars;                    \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2270
2271
2272 static int
2273 emacs_mule_finish_composition (int *charbuf,
2274                                struct composition_status *cmp_status)
2275 {
2276   int idx = - cmp_status->length;
2277   int new_chars;
2278
2279   if (cmp_status->old_form && cmp_status->nchars > 0)
2280     {
2281       charbuf[idx + 2] = cmp_status->nchars;
2282       new_chars = 0;
2283       if (cmp_status->method == COMPOSITION_WITH_RULE
2284           && cmp_status->state == COMPOSING_CHAR)
2285         {
2286           /* The last rule was invalid.  */
2287           int rule = charbuf[-1] + 0xA0;
2288
2289           charbuf[-2] = BYTE8_TO_CHAR (rule);
2290           charbuf[-1] = -1;
2291           new_chars = 1;
2292         }
2293     }
2294   else
2295     {
2296       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2297
2298       if (cmp_status->method == COMPOSITION_WITH_RULE)
2299         {
2300           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2301           charbuf[idx++] = -3;
2302           charbuf[idx++] = 0;
2303           new_chars = 1;
2304         }
2305       else
2306         {
2307           int nchars = charbuf[idx + 1] + 0xA0;
2308           int nbytes = charbuf[idx + 2] + 0xA0;
2309
2310           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2311           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2312           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2313           charbuf[idx++] = -1;
2314           new_chars = 4;
2315         }
2316     }
2317   cmp_status->state = COMPOSING_NO;
2318   return new_chars;
2319 }
2320
/* If a composition is in progress (the state in *CMP_STATUS is not
   COMPOSING_NO), close it with emacs_mule_finish_composition and add
   the value it returns to `char_offset'.  Uses the variables
   `cmp_status', `charbuf' and `char_offset' of the expansion site.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
2326
2327
2328 static void
2329 decode_coding_emacs_mule (struct coding_system *coding)
2330 {
2331   const unsigned char *src = coding->source + coding->consumed;
2332   const unsigned char *src_end = coding->source + coding->src_bytes;
2333   const unsigned char *src_base;
2334   int *charbuf = coding->charbuf + coding->charbuf_used;
2335   /* We may produce two annotations (charset and composition) in one
2336      loop and one more charset annotation at the end.  */
2337   int *charbuf_end
2338     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2339       /* We can produce up to 2 characters in a loop.  */
2340       - 1;
2341   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2342   bool multibytep = coding->src_multibyte;
2343   ptrdiff_t char_offset = coding->produced_char;
2344   ptrdiff_t last_offset = char_offset;
2345   int last_id = charset_ascii;
2346   bool eol_dos
2347     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2348   int byte_after_cr = -1;
2349   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2350
2351   if (cmp_status->state != COMPOSING_NO)
2352     {
2353       int i;
2354
2355       if (charbuf_end - charbuf < cmp_status->length)
2356         emacs_abort ();
2357       for (i = 0; i < cmp_status->length; i++)
2358         *charbuf++ = cmp_status->carryover[i];
2359       coding->annotated = 1;
2360     }
2361
2362   while (1)
2363     {
2364       int c;
2365       int id UNINIT;
2366
2367       src_base = src;
2368       consumed_chars_base = consumed_chars;
2369
2370       if (charbuf >= charbuf_end)
2371         {
2372           if (byte_after_cr >= 0)
2373             src_base--;
2374           break;
2375         }
2376
2377       if (byte_after_cr >= 0)
2378         c = byte_after_cr, byte_after_cr = -1;
2379       else
2380         ONE_MORE_BYTE (c);
2381
2382       if (c < 0 || c == 0x80)
2383         {
2384           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2385           if (c < 0)
2386             {
2387               *charbuf++ = -c;
2388               char_offset++;
2389             }
2390           else
2391             DECODE_EMACS_MULE_COMPOSITION_START ();
2392           continue;
2393         }
2394
2395       if (c < 0x80)
2396         {
2397           if (eol_dos && c == '\r')
2398             ONE_MORE_BYTE (byte_after_cr);
2399           id = charset_ascii;
2400           if (cmp_status->state != COMPOSING_NO)
2401             {
2402               if (cmp_status->old_form)
2403                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2404               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2405                 cmp_status->ncomps--;
2406             }
2407         }
2408       else
2409         {
2410           int nchars UNINIT, nbytes UNINIT;
2411           /* emacs_mule_char can load a charset map from a file, which
2412              allocates a large structure and might cause buffer text
2413              to be relocated as result.  Thus, we need to remember the
2414              original pointer to buffer text, and fix up all related
2415              pointers after the call.  */
2416           const unsigned char *orig = coding->source;
2417           ptrdiff_t offset;
2418
2419           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2420                                cmp_status);
2421           offset = coding->source - orig;
2422           if (offset)
2423             {
2424               src += offset;
2425               src_base += offset;
2426               src_end += offset;
2427             }
2428           if (c < 0)
2429             {
2430               if (c == -1)
2431                 goto invalid_code;
2432               if (c == -2)
2433                 break;
2434             }
2435           src = src_base + nbytes;
2436           consumed_chars = consumed_chars_base + nchars;
2437           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2438             cmp_status->ncomps -= nchars;
2439         }
2440
2441       /* Now if C >= 0, we found a normally encoded character, if C <
2442          0, we found an old-style composition component character or
2443          rule.  */
2444
2445       if (cmp_status->state == COMPOSING_NO)
2446         {
2447           if (last_id != id)
2448             {
2449               if (last_id != charset_ascii)
2450                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2451                                   last_id);
2452               last_id = id;
2453               last_offset = char_offset;
2454             }
2455           *charbuf++ = c;
2456           char_offset++;
2457         }
2458       else if (cmp_status->state == COMPOSING_CHAR)
2459         {
2460           if (cmp_status->old_form)
2461             {
2462               if (c >= 0)
2463                 {
2464                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2465                   *charbuf++ = c;
2466                   char_offset++;
2467                 }
2468               else
2469                 {
2470                   *charbuf++ = -c;
2471                   cmp_status->nchars++;
2472                   cmp_status->length++;
2473                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2474                     EMACS_MULE_COMPOSITION_END ();
2475                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2476                     cmp_status->state = COMPOSING_RULE;
2477                 }
2478             }
2479           else
2480             {
2481               *charbuf++ = c;
2482               cmp_status->length++;
2483               cmp_status->nchars--;
2484               if (cmp_status->nchars == 0)
2485                 EMACS_MULE_COMPOSITION_END ();
2486             }
2487         }
2488       else if (cmp_status->state == COMPOSING_RULE)
2489         {
2490           int rule;
2491
2492           if (c >= 0)
2493             {
2494               EMACS_MULE_COMPOSITION_END ();
2495               *charbuf++ = c;
2496               char_offset++;
2497             }
2498           else
2499             {
2500               c = -c;
2501               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2502               if (rule < 0)
2503                 goto invalid_code;
2504               *charbuf++ = -2;
2505               *charbuf++ = rule;
2506               cmp_status->length += 2;
2507               cmp_status->state = COMPOSING_CHAR;
2508             }
2509         }
2510       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2511         {
2512           *charbuf++ = c;
2513           cmp_status->length++;
2514           if (cmp_status->ncomps == 0)
2515             cmp_status->state = COMPOSING_CHAR;
2516           else if (cmp_status->ncomps > 0)
2517             {
2518               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2519                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2520             }
2521           else
2522             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2523         }
2524       else                      /* COMPOSING_COMPONENT_RULE */
2525         {
2526           int rule;
2527
2528           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2529           if (rule < 0)
2530             goto invalid_code;
2531           *charbuf++ = -2;
2532           *charbuf++ = rule;
2533           cmp_status->length += 2;
2534           cmp_status->ncomps--;
2535           if (cmp_status->ncomps > 0)
2536             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2537           else
2538             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2539         }
2540       continue;
2541
2542     invalid_code:
2543       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2544       src = src_base;
2545       consumed_chars = consumed_chars_base;
2546       ONE_MORE_BYTE (c);
2547       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2548       char_offset++;
2549     }
2550
2551  no_more_source:
2552   if (cmp_status->state != COMPOSING_NO)
2553     {
2554       if (coding->mode & CODING_MODE_LAST_BLOCK)
2555         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2556       else
2557         {
2558           int i;
2559
2560           charbuf -= cmp_status->length;
2561           for (i = 0; i < cmp_status->length; i++)
2562             cmp_status->carryover[i] = charbuf[i];
2563         }
2564     }
2565   if (last_id != charset_ascii)
2566     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2567   coding->consumed_char += consumed_chars_base;
2568   coding->consumed = src_base - coding->source;
2569   coding->charbuf_used = charbuf - coding->charbuf;
2570 }
2571
2572
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that the emacs-mule format uses for the charset whose emacs-mule id
   is ID.

   CODES[0] is always set.  For ID < 0xA0 the id itself is the only
   leading code and CODES[1] is set to 0; for larger ids CODES[0] is
   one of 0x9A, 0x9B, 0x9C or 0x9D (chosen by the range ID falls in)
   and CODES[1] is ID.

   The expansion is a single statement WITHOUT a trailing semicolon,
   so `EMACS_MULE_LEADING_CODES (id, codes);' can be used as the body
   of an unbraced if/else.  ID and CODES are evaluated more than once;
   do not pass expressions with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2586
2587
/* Encode the characters in CODING->charbuf into the emacs-mule format
   (the old Emacs internal format) and store the resulting bytes in
   CODING->destination, starting at offset CODING->produced.

   The CODING->charbuf_used elements of the buffer are processed in
   order:
   - A negative element starts an annotation.  A charset annotation
     selects a preferred charset for the characters that follow (it
     is dropped unless that charset is in the coding system's charset
     list); a composition annotation is skipped ("not yet
     implemented"); any other annotation kind aborts.
   - An ASCII character is emitted as a single byte.
   - A character for which CHAR_BYTE8_P is true is converted with
     CHAR_TO_BYTE8 and emitted.
   - Any other character is looked up in a charset (the preferred one
     if it can encode the character, otherwise one found through the
     charset list; CODING->default_char is substituted when no charset
     is found) and emitted as one or two leading codes followed by
     one position byte for a charset of dimension 1, otherwise two,
     with the 0x80 bit OR-ed into each.

   As a side effect, the charset list stored in the coding system's
   attributes is replaced by Vemacs_mule_charset_list when it is a
   different object.

   On completion, record CODING_RESULT_SUCCESS, add the local
   character count to CODING->produced_char, set CODING->produced to
   the number of bytes now in the destination, and return false.
   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   outside this part of the file; they may contain additional exits
   (e.g. when the destination is full) -- confirm there.  */
2588 static bool
2589 encode_coding_emacs_mule (struct coding_system *coding)
2590 {
       /* MULTIBYTEP and PRODUCED_CHARS are never referenced directly in
          the statements below; presumably the EMIT_* macros read and
          update them by name -- confirm against their definitions.  */
2591   bool multibytep = coding->dst_multibyte;
2592   int *charbuf = coding->charbuf;
2593   int *charbuf_end = charbuf + coding->charbuf_used;
2594   unsigned char *dst = coding->destination + coding->produced;
2595   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each element is
          handled.  NOTE(review): presumably an upper bound on the bytes
          one character can produce -- confirm.  */
2596   int safe_room = 8;
2597   ptrdiff_t produced_chars = 0;
2598   Lisp_Object attrs, charset_list;
2599   int c;
       /* Charset id requested by the latest charset annotation, or -1
          when there is no usable preference.  */
2600   int preferred_charset_id = -1;
2601
2602   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode with the emacs-mule charset list; store it back
          into the attributes if they held a different list.  */
2603   if (! EQ (charset_list, Vemacs_mule_charset_list))
2604     {
2605       charset_list = Vemacs_mule_charset_list;
2606       ASET (attrs, coding_attr_charset_list, charset_list);
2607     }
2608
2609   while (charbuf < charbuf_end)
2610     {
2611       ASSURE_DESTINATION (safe_room);
2612       c = *charbuf++;
2613
2614       if (c < 0)
2615         {
2616           /* Handle an annotation.  */
2617           switch (*charbuf)
2618             {
2619             case CODING_ANNOTATE_COMPOSITION_MASK:
2620               /* Not yet implemented.  */
2621               break;
2622             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Remember the requested charset, but only if it is a
                      member of CHARSET_LIST.
                      NOTE(review): verify the index 3 against the layout
                      of a charset annotation (written by code outside
                      this part of the file).  CHARBUF has already been
                      advanced past the length element here, so
                      CHARBUF[0] is the annotation mask tested by the
                      switch.  */
2623               preferred_charset_id = charbuf[3];
2624               if (preferred_charset_id >= 0
2625                   && NILP (Fmemq (make_number (preferred_charset_id),
2626                                   charset_list)))
2627                 preferred_charset_id = -1;
2628               break;
2629             default:
2630               emacs_abort ();
2631             }
               /* Skip the remaining elements of the annotation.  -C is
                  presumably the total annotation length, including the
                  length element that was already consumed above.  */
2632           charbuf += -c - 1;
2633           continue;
2634         }
2635
2636       if (ASCII_CHAR_P (c))
2637         EMIT_ONE_ASCII_BYTE (c);
2638       else if (CHAR_BYTE8_P (c))
2639         {
2640           c = CHAR_TO_BYTE8 (c);
2641           EMIT_ONE_BYTE (c);
2642         }
2643       else
2644         {
2645           struct charset *charset;
2646           unsigned code;
2647           int dimension;
2648           int emacs_mule_id;
2649           unsigned char leading_codes[2];
2650
               /* Pick the charset (and the code point of C in it).  The
                  preferred charset wins if it can encode C; otherwise
                  fall back to a lookup through CHARSET_LIST.  */
2651           if (preferred_charset_id >= 0)
2652             {
2653               bool result;
2654
2655               charset = CHARSET_FROM_ID (preferred_charset_id);
2656               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2657               if (result)
2658                 code = ENCODE_CHAR (charset, c);
2659               else
2660                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2661                                      &code, charset);
2662             }
2663           else
2664             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2665                                  &code, charset);
               /* No charset can encode C: substitute the coding system's
                  default character.
                  NOTE(review): CHARSET is not checked again after the
                  second lookup below; presumably the default character
                  is always encodable -- confirm.  */
2666           if (! charset)
2667             {
2668               c = coding->default_char;
2669               if (ASCII_CHAR_P (c))
2670                 {
2671                   EMIT_ONE_ASCII_BYTE (c);
2672                   continue;
2673                 }
2674               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2675                                    &code, charset);
2676             }
               /* Emit the leading code(s) of the charset, then the
                  position code with the 0x80 bit set in each byte.  */
2677           dimension = CHARSET_DIMENSION (charset);
2678           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2679           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2680           EMIT_ONE_BYTE (leading_codes[0]);
2681           if (leading_codes[1])
2682             EMIT_ONE_BYTE (leading_codes[1]);
2683           if (dimension == 1)
2684             EMIT_ONE_BYTE (code | 0x80);
2685           else
2686             {
2687               code |= 0x8080;
2688               EMIT_ONE_BYTE (code >> 8);
2689               EMIT_ONE_BYTE (code & 0xFF);
2690             }
2691         }
2692     }
2693   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2694   coding->produced_char += produced_chars;
2695   coding->produced = dst - coding->destination;
2696   return 0;
2697 }
2698
2699 \f
2700 /*** 7. ISO2022 handlers ***/
2701
2702 /* The following note describes the coding system ISO2022 briefly.
2703    Since the intention of this note is to help understand the
2704    functions in this file, some parts are NOT ACCURATE or are OVERLY
2705    SIMPLIFIED.  For thorough understanding, please refer to the
2706    original document of ISO2022.  This is equivalent to the standard
2707    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2708
2709    ISO2022 provides many mechanisms to encode several character sets
2710    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2711    is encoded using bytes less than 128.  This may make the encoded
2712    text a little bit longer, but the text passes more easily through
2713    several types of gateway, some of which strip off the MSB (Most
2714    Significant Bit).
2715
2716    There are two kinds of character sets: control character sets and
2717    graphic character sets.  The former contain control characters such
2718    as `newline' and `escape' to provide control functions (control
2719    functions are also provided by escape sequences).  The latter
2720    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2721    two control character sets and many graphic character sets.
2722
2723    Graphic character sets are classified into one of the following
2724    four classes, according to the number of bytes (DIMENSION) and
2725    number of characters in one dimension (CHARS) of the set:
2726    - DIMENSION1_CHARS94
2727    - DIMENSION1_CHARS96
2728    - DIMENSION2_CHARS94
2729    - DIMENSION2_CHARS96
2730
2731    In addition, each character set is assigned an identification tag,
2732    unique for each set, called the "final character" (denoted as <F>
2733    hereafter).  The <F> of each character set is decided by ECMA(*)
2734    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2735    (0x30..0x3F are for private use only).
2736
2737    Note (*): ECMA = European Computer Manufacturers Association
2738
2739    Here are examples of graphic character sets [NAME(<F>)]:
2740         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2741         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2742         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2743         o DIMENSION2_CHARS96 -- none for the moment
2744
2745    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2746         C0 [0x00..0x1F] -- control character plane 0
2747         GL [0x20..0x7F] -- graphic character plane 0
2748         C1 [0x80..0x9F] -- control character plane 1
2749         GR [0xA0..0xFF] -- graphic character plane 1
2750
2751    A control character set is directly designated and invoked to C0 or
2752    C1 by an escape sequence.  The most common case is that:
2753    - ISO646's  control character set is designated/invoked to C0, and
2754    - ISO6429's control character set is designated/invoked to C1,
2755    and usually these designations/invocations are omitted in encoded
2756    text.  In a 7-bit environment, only C0 can be used, and a control
2757    character for C1 is encoded by an appropriate escape sequence to
2758    fit into the environment.  All control characters for C1 are
2759    defined to have corresponding escape sequences.
2760
2761    A graphic character set is at first designated to one of four
2762    graphic registers (G0 through G3), then these graphic registers are
2763    invoked to GL or GR.  These designations and invocations can be
2764    done independently.  The most common case is that G0 is invoked to
2765    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2766    these invocations and designations are omitted in encoded text.
2767    In a 7-bit environment, only GL can be used.
2768
2769    When a graphic character set of CHARS94 is invoked to GL, codes
2770    0x20 and 0x7F of the GL area work as control characters SPACE and
2771    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2772    be used.
2773
2774    There are two ways of invocation: locking-shift and single-shift.
2775    With locking-shift, the invocation lasts until the next different
2776    invocation, whereas with single-shift, the invocation affects the
2777    following character only and doesn't affect the locking-shift
2778    state.  Invocations are done by the following control characters or
2779    escape sequences:
2780
2781    ----------------------------------------------------------------------
2782    abbrev  function                  cntrl escape seq   description
2783    ----------------------------------------------------------------------
2784    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2785    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2786    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2787    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2788    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2789    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2790    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2791    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2792    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2793    ----------------------------------------------------------------------
2794    (*) These are not used by any known coding system.
2795
2796    Control characters for these functions are defined by macros
2797    ISO_CODE_XXX in `coding.h'.
2798
2799    Designations are done by the following escape sequences:
2800    ----------------------------------------------------------------------
2801    escape sequence      description
2802    ----------------------------------------------------------------------
2803    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2804    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2805    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2806    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2807    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2808    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2809    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2810    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2811    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2812    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2813    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2814    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2815    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2816    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2817    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2818    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2819    ----------------------------------------------------------------------
2820
2821    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2822    of dimension 1, chars 94, and final character <F>, etc...
2823
2824    Note (*): Although these designations are not allowed in ISO2022,
2825    Emacs accepts them on decoding, and produces them on encoding
2826    CHARS96 character sets in a coding system which is characterized as
2827    7-bit environment, non-locking-shift, and non-single-shift.
2828
2829    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2830    '(' must be omitted.  We refer to this as "short-form" hereafter.
2831
2832    Now you may notice that there are a lot of ways of encoding the
2833    same multilingual text in ISO2022.  Actually, there exist many
2834    coding systems such as Compound Text (used in X11's inter client
2835    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2836    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2837    localized platforms), and all of these are variants of ISO2022.
2838
2839    In addition to the above, Emacs handles two more kinds of escape
2840    sequences: ISO6429's direction specification and Emacs' private
2841    sequence for specifying character composition.
2842
2843    ISO6429's direction specification takes the following form:
2844         o CSI ']'      -- end of the current direction
2845         o CSI '0' ']'  -- end of the current direction
2846         o CSI '1' ']'  -- start of left-to-right text
2847         o CSI '2' ']'  -- start of right-to-left text
2848    The control character CSI (0x9B: control sequence introducer) is
2849    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2850
2851    Character composition specification takes the following form:
2852         o ESC '0' -- start relative composition
2853         o ESC '1' -- end composition
2854         o ESC '2' -- start rule-base composition (*)
2855         o ESC '3' -- start relative composition with alternate chars  (**)
2856         o ESC '4' -- start rule-base composition with alternate chars  (**)
2857   Since these are not standard escape sequences of any ISO standard,
2858   the use of them with these meanings is restricted to Emacs only.
2859
2860   (*) This form is used only in Emacs 20.7 and older versions,
2861   but newer versions can safely decode it.
2862   (**) This form is used only in Emacs 21.1 and newer versions,
2863   and older versions can't decode it.
2864
2865   Here's a list of example usages of these composition escape
2866   sequences (categorized by `enum composition_method').
2867
2868   COMPOSITION_RELATIVE:
2869         ESC 0 CHAR [ CHAR ] ESC 1
2870   COMPOSITION_WITH_RULE:
2871         ESC 2 CHAR [ RULE CHAR ] ESC 1
2872   COMPOSITION_WITH_ALTCHARS:
2873         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2874   COMPOSITION_WITH_RULE_ALTCHARS:
2875         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2876
/* Table with one element per possible byte value (256 entries), each
   an `enum iso_code_class_type'.  NOTE(review): presumably it gives
   the ISO-2022 code class of each byte and is filled in elsewhere in
   this file; neither the initialization nor any use is visible in
   this part of the file -- confirm.  */
2877 static enum iso_code_class_type iso_code_class[256];
2878
/* Return true if the charset whose id is ID is safe for CODING, i.e.
   ID is in the range 0..CODING->max_charset_id and the entry for ID
   in CODING->safe_charsets is not 255 (the value with which
   setup_iso_safe_charsets pre-fills the table).

   A negative ID -- such as the result of a charset table lookup that
   found nothing -- is rejected instead of being used as an array
   index.  ID is evaluated more than once; do not pass an expression
   with side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  (0 <= (id)                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2882
2883 static void
2884 setup_iso_safe_charsets (Lisp_Object attrs)
2885 {
2886   Lisp_Object charset_list, safe_charsets;
2887   Lisp_Object request;
2888   Lisp_Object reg_usage;
2889   Lisp_Object tail;
2890   EMACS_INT reg94, reg96;
2891   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2892   int max_charset_id;
2893
2894   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2895   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2896       && ! EQ (charset_list, Viso_2022_charset_list))
2897     {
2898       charset_list = Viso_2022_charset_list;
2899       ASET (attrs, coding_attr_charset_list, charset_list);
2900       ASET (attrs, coding_attr_safe_charsets, Qnil);
2901     }
2902
2903   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2904     return;
2905
2906   max_charset_id = 0;
2907   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2908     {
2909       int id = XINT (XCAR (tail));
2910       if (max_charset_id < id)
2911         max_charset_id = id;
2912     }
2913
2914   safe_charsets = make_uninit_string (max_charset_id + 1);
2915   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2916   request = AREF (attrs, coding_attr_iso_request);
2917   reg_usage = AREF (attrs, coding_attr_iso_usage);
2918   reg94 = XINT (XCAR (reg_usage));
2919   reg96 = XINT (XCDR (reg_usage));
2920
2921   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2922     {
2923       Lisp_Object id;
2924       Lisp_Object reg;
2925       struct charset *charset;
2926
2927       id = XCAR (tail);
2928       charset = CHARSET_FROM_ID (XINT (id));
2929       reg = Fcdr (Fassq (id, request));
2930       if (! NILP (reg))
2931         SSET (safe_charsets, XINT (id), XINT (reg));
2932       else if (charset->iso_chars_96)
2933         {
2934           if (reg96 < 4)
2935             SSET (safe_charsets, XINT (id), reg96);
2936         }
2937       else
2938         {
2939           if (reg94 < 4)
2940             SSET (safe_charsets, XINT (id), reg94);
2941         }
2942     }
2943   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2944 }
2945
2946
2947 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2948    Return true if a text is encoded in one of ISO-2022 based coding
2949    systems.

        In detail: the bytes of CODING->source that follow the leading
        CODING->head_ascii bytes are scanned, and the CATEGORY_MASK_ISO_*
        bits of the categories that the bytes support or contradict are
        collected in the local masks FOUND and REJECTED.
        DETECT_INFO->checked always gets CATEGORY_MASK_ISO.

        - If every ISO category ends up rejected, the scan stops,
          CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and the
          function returns false.
        - Otherwise the scan ends at the `no_more_source' label
          (presumably reached from ONE_MORE_BYTE when the source is
          exhausted; that macro is defined outside this part of the
          file).  REJECTED is added to DETECT_INFO->rejected, the bits
          of FOUND that were not rejected are added to
          DETECT_INFO->found, and the function returns true.  */
2950
2951 static bool
2952 detect_coding_iso_2022 (struct coding_system *coding,
2953                         struct coding_detection_info *detect_info)
2954 {
2955   const unsigned char *src = coding->source, *src_base = src;
2956   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* MULTIBYTEP and CONSUMED_CHARS are not referenced directly
          below; presumably ONE_MORE_BYTE uses them by name -- confirm
          against its definition.  */
2957   bool multibytep = coding->src_multibyte;
       /* True while the preceding code was a single shift (SS2/SS3).  */
2958   bool single_shifting = 0;
2959   int id;
2960   int c, c1;
2961   ptrdiff_t consumed_chars = 0;
2962   int i;
       /* Masks of the ISO categories contradicted / supported so far.  */
2963   int rejected = 0;
2964   int found = 0;
       /* -1 outside a composition sequence; otherwise the number of
          characters counted since the composition started.  */
2965   int composition_count = -1;
2966
2967   detect_info->checked |= CATEGORY_MASK_ISO;
2968
       /* For each ISO category, refresh the safe-charsets table when
          needed and cache its data pointer and last valid index in the
          category's coding system, where SAFE_CHARSET_P reads them.
          Categories whose id is negative are skipped.  */
2969   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2970     {
2971       struct coding_system *this = &(coding_categories[i]);
2972       Lisp_Object attrs, val;
2973
2974       if (this->id < 0)
2975         continue;
2976       attrs = CODING_ID_ATTRS (this->id);
2977       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2978           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2979         setup_iso_safe_charsets (attrs);
2980       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2981       this->max_charset_id = SCHARS (val) - 1;
2982       this->safe_charsets = SDATA (val);
2983     }
2984
2985   /* A coding system of this category is always ASCII compatible.  */
2986   src += coding->head_ascii;
2987
       /* Scan until every ISO category has been rejected.  */
2988   while (rejected != CATEGORY_MASK_ISO)
2989     {
2990       src_base = src;
2991       ONE_MORE_BYTE (c);
2992       switch (c)
2993         {
2994         case ISO_CODE_ESC:
               /* Escape sequences give no evidence when their detection
                  is inhibited.  */
2995           if (inhibit_iso_escape_detection)
2996             break;
2997           single_shifting = 0;
2998           ONE_MORE_BYTE (c);
2999           if (c == 'N' || c == 'O')
3000             {
3001               /* ESC <Fe> for SS2 or SS3.  */
3002               single_shifting = 1;
3003               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3004             }
3005           else if (c == '1')
3006             {
3007               /* End of composition.  */
3008               if (composition_count < 0
3009                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3010                 /* Invalid */
3011                 break;
3012               composition_count = -1;
3013               found |= CATEGORY_MASK_ISO;
3014             }
3015           else if (c >= '0' && c <= '4')
3016             {
3017               /* ESC <Fp> ('0', '2', '3' or '4') starting a composition;
                      '1', the end of a composition, was handled above.  */
3018               composition_count = 0;
3019             }
3020           else
3021             {
                   /* In the table lookups below, the index `c >= ','' is 0
                      for the intermediates '(' ')' '*' '+' (CHARS94) and 1
                      for ',' '-' '.' '/' (CHARS96).  */
3022               if (c >= '(' && c <= '/')
3023                 {
3024                   /* Designation sequence for a charset of dimension 1.  */
3025                   ONE_MORE_BYTE (c1);
3026                   if (c1 < ' ' || c1 >= 0x80
3027                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3028                     {
3029                       /* Invalid designation sequence.  Just ignore.  */
3030                       if (c1 >= 0x80)
3031                         rejected |= (CATEGORY_MASK_ISO_7BIT
3032                                      | CATEGORY_MASK_ISO_7_ELSE);
3033                       break;
3034                     }
3035                 }
3036               else if (c == '$')
3037                 {
3038                   /* Designation sequence for a charset of dimension 2.  */
3039                   ONE_MORE_BYTE (c);
3040                   if (c >= '@' && c <= 'B')
3041                     /* Designation for JISX0208.1978, GB2312, or JISX0208.
                            NOTE(review): unlike the other lookups, the result
                            is not checked for being negative here.  */
3042                     id = iso_charset_table[1][0][c];
3043                   else if (c >= '(' && c <= '/')
3044                     {
3045                       ONE_MORE_BYTE (c1);
3046                       if (c1 < ' ' || c1 >= 0x80
3047                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3048                         {
3049                           /* Invalid designation sequence.  Just ignore.  */
3050                           if (c1 >= 0x80)
3051                             rejected |= (CATEGORY_MASK_ISO_7BIT
3052                                          | CATEGORY_MASK_ISO_7_ELSE);
3053                           break;
3054                         }
3055                     }
3056                   else
3057                     {
3058                       /* Invalid designation sequence.  Just ignore it.  */
3059                       if (c >= 0x80)
3060                         rejected |= (CATEGORY_MASK_ISO_7BIT
3061                                      | CATEGORY_MASK_ISO_7_ELSE);
3062                       break;
3063                     }
3064                 }
3065               else
3066                 {
3067                   /* Invalid escape sequence.  Just ignore it.  */
3068                   if (c >= 0x80)
3069                     rejected |= (CATEGORY_MASK_ISO_7BIT
3070                                  | CATEGORY_MASK_ISO_7_ELSE);
3071                   break;
3072                 }
3073
3074               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories checked below is marked
                      found if the designated charset is safe for it and
                      rejected otherwise.  */
3075               rejected |= CATEGORY_MASK_ISO_8BIT;
3076               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3077                                   id))
3078                 found |= CATEGORY_MASK_ISO_7;
3079               else
3080                 rejected |= CATEGORY_MASK_ISO_7;
3081               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3082                                   id))
3083                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3084               else
3085                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3086               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3087                                   id))
3088                 found |= CATEGORY_MASK_ISO_7_ELSE;
3089               else
3090                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3091               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3092                                   id))
3093                 found |= CATEGORY_MASK_ISO_8_ELSE;
3094               else
3095                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3096             }
3097           break;
3098
3099         case ISO_CODE_SO:
3100         case ISO_CODE_SI:
3101           /* Locking shift out/in.  */
3102           if (inhibit_iso_escape_detection)
3103             break;
3104           single_shifting = 0;
3105           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3106           break;
3107
3108         case ISO_CODE_CSI:
3109           /* Control sequence introducer.  */
3110           single_shifting = 0;
3111           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3112           found |= CATEGORY_MASK_ISO_8_ELSE;
3113           goto check_extra_latin;
3114
3115         case ISO_CODE_SS2:
3116         case ISO_CODE_SS3:
3117           /* Single shift.   */
3118           if (inhibit_iso_escape_detection)
3119             break;
3120           single_shifting = 0;
3121           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
               /* The byte counts as a single shift only for an 8-bit
                  category whose coding system has the single-shift flag;
                  if neither has it, the byte is treated like any other
                  code that must be in the extra Latin table.  */
3122           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3123               & CODING_ISO_FLAG_SINGLE_SHIFT)
3124             {
3125               found |= CATEGORY_MASK_ISO_8_1;
3126               single_shifting = 1;
3127             }
3128           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3129               & CODING_ISO_FLAG_SINGLE_SHIFT)
3130             {
3131               found |= CATEGORY_MASK_ISO_8_2;
3132               single_shifting = 1;
3133             }
3134           if (single_shifting)
3135             break;
3136           goto check_extra_latin;
3137
3138         default:
               /* NOTE(review): a negative C is simply skipped; presumably
                  ONE_MORE_BYTE yields one for input that is not a plain
                  source byte -- confirm against its definition.  */
3139           if (c < 0)
3140             continue;
3141           if (c < 0x80)
3142             {
3143               if (composition_count >= 0)
3144                 composition_count++;
3145               single_shifting = 0;
3146               break;
3147             }
3148           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3149           if (c >= 0xA0)
3150             {
3151               found |= CATEGORY_MASK_ISO_8_1;
3152               /* Check the length of succeeding codes of the range
3153                  0xA0..0xFF.  If the byte length is even, we include
3154                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3155                  only when we are not single shifting.  */
3156               if (! single_shifting
3157                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3158                 {
                       /* LEN counts the run of bytes >= 0xA0, starting with
                          the one just read; the byte that ends the run is
                          pushed back by resetting SRC.  */
3159                   ptrdiff_t len = 1;
3160                   while (src < src_end)
3161                     {
3162                       src_base = src;
3163                       ONE_MORE_BYTE (c);
3164                       if (c < 0xA0)
3165                         {
3166                           src = src_base;
3167                           break;
3168                         }
3169                       len++;
3170                     }
3171
                       /* An odd-length run that ended before the end of the
                          source rejects the ISO_8_2 category; a run cut off
                          by the end of the source does not.  */
3172                   if (len & 1 && src < src_end)
3173                     {
3174                       rejected |= CATEGORY_MASK_ISO_8_2;
3175                       if (composition_count >= 0)
3176                         composition_count += len;
3177                     }
3178                   else
3179                     {
3180                       found |= CATEGORY_MASK_ISO_8_2;
3181                       if (composition_count >= 0)
3182                         composition_count += len / 2;
3183                     }
3184                 }
3185               break;
3186             }
               /* Reached with C in the range 0x80..0x9F, either by falling
                  through from above or from the CSI and SS2/SS3 cases.
                  Such a byte is tolerated only if it has a non-nil entry
                  in Vlatin_extra_code_table; otherwise every ISO category
                  is rejected, which ends the scan.  */
3187         check_extra_latin:
3188           if (! VECTORP (Vlatin_extra_code_table)
3189               || NILP (AREF (Vlatin_extra_code_table, c)))
3190             {
3191               rejected = CATEGORY_MASK_ISO;
3192               break;
3193             }
3194           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3195               & CODING_ISO_FLAG_LATIN_EXTRA)
3196             found |= CATEGORY_MASK_ISO_8_1;
3197           else
3198             rejected |= CATEGORY_MASK_ISO_8_1;
3199           rejected |= CATEGORY_MASK_ISO_8_2;
3200           break;
3201         }
3202     }
       /* Every ISO category was rejected before the source ran out.  */
3203   detect_info->rejected |= CATEGORY_MASK_ISO;
3204   return 0;
3205
3206  no_more_source:
       /* Report what was learned; a category that was both found and
          rejected counts as rejected only.  */
3207   detect_info->rejected |= rejected;
3208   detect_info->found |= (found & ~rejected);
3209   return 1;
3210 }
3211
3212
3213 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3214    escape sequence should be kept.

        REG is the graphic register being designated; DIM, CHARS_96 and
        FINAL are handed to ISO_CHARSET_TABLE to look up the charset ID.
        CODING is not a parameter: the macro uses the variable `coding'
        in scope at the expansion site.  CHARS_96 must be assignable, and
        REG, CHARS_96 and FINAL may be evaluated more than once.

        If FINAL is outside '0'..127, the lookup yields a negative ID, or
        SAFE_CHARSET_P rejects the charset, the designation of REG is set
        to -2, CHARS_96 is set to -1, and nothing else is done.

        Otherwise the charset ID becomes the designation of REG, after
        replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.  */
3215 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3216   do {                                                                  \
3217     int id, prev;                                                       \
3218                                                                         \
3219     if (final < '0' || final >= 128                                     \
3220         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3221         || !SAFE_CHARSET_P (coding, id))                                \
3222       {                                                                 \
3223         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3224         chars_96 = -1;                                                  \
             /* Leave the do-while, ending the macro.  */                    \
3225         break;                                                          \
3226       }                                                                 \
3227     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3228     if (id == charset_jisx0201_roman)                                   \
3229       {                                                                 \
3230         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3231           id = charset_ascii;                                           \
3232       }                                                                 \
3233     else if (id == charset_jisx0208_1978)                               \
3234       {                                                                 \
3235         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3236           id = charset_jisx0208;                                        \
3237       }                                                                 \
3238     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3239     /* If there was an invalid designation to REG previously, and this  \
3240        designation is ASCII to REG, we should keep this designation     \
3241        sequence.  */                                                    \
3242     if (prev == -2 && id == charset_ascii)                              \
3243       chars_96 = -1;                                                    \
3244   } while (0)
3245
3246
3247 /* Handle these composition sequences (ALT: alternate char):
3248
3249    (1) relative composition: ESC 0 CHAR ... ESC 1
3250    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3251    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3252    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3253
3254    When the start sequence (ESC 0/2/3/4) is found, this annotation
3255    header is produced.
3256
3257         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3258
3259    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3260    produced until the end sequence (ESC 1) is found:
3261
3262    (1) CHAR ... CHAR
3263    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3264    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3265    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3266
3267    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3268    annotation header are updated as below:
3269
3270    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3271    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3272    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3273    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3274
3275    If an error is found while composing, the annotation header is
3276    changed to:
3277
3278         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3279
3280    and the sequence [ -2 DECODED-RULE ] is changed to the original
3281    byte sequence as below:
3282         o the original byte sequence is B: [ B -1 ]
3283         o the original byte sequence is B1 B2: [ B1 B2 ]
3284    and the sequence [ -1 -1 ] is changed to the original byte
3285    sequence:
3286         [ ESC '0' ]
3287 */
3288
3289 /* Decode a composition rule C1 and maybe one more byte from the
3290    source, and set RULE to the encoded composition rule.  If the rule
3291    is invalid, goto invalid_code.

        C1 is not a parameter: it is the variable `c1' in scope at the
        expansion site and holds the first byte of the rule.  RULE must
        be an assignable signed integer; it also serves as scratch
        storage while decoding.  The label `invalid_code' must exist in
        the enclosing function.

        Old format (C1 - 32 is in 0..80): the single byte encodes
        GREF * 9 + NREF, and a GREF or NREF equal to 4 is changed to 10
        before the pair is passed to COMPOSITION_ENCODE_RULE.

        New format (C1 - 32 is 81 or more): the first argument of
        COMPOSITION_ENCODE_RULE is C1 - 32 - 81 and the second is the
        next source byte (fetched with ONE_MORE_BYTE) minus 32.  The pair
        is checked with COMPOSITION_ENCODE_RULE_VALID first, and 0x100 is
        added to the encoded value to tell it from the old format.  */
3292
3293 #define DECODE_COMPOSITION_RULE(rule)                                   \
3294   do {                                                                  \
3295     rule = c1 - 32;                                                     \
3296     if (rule < 0)                                                       \
3297       goto invalid_code;                                                \
3298     if (rule < 81)              /* old format (before ver.21) */        \
3299       {                                                                 \
3300         int gref = (rule) / 9;                                          \
3301         int nref = (rule) % 9;                                          \
3302         if (gref == 4) gref = 10;                                       \
3303         if (nref == 4) nref = 10;                                       \
3304         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3305       }                                                                 \
3306     else                        /* new format (after ver.21) */         \
3307       {                                                                 \
3308         int b;                                                          \
3309                                                                         \
3310         ONE_MORE_BYTE (b);                                              \
3311         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3312           goto invalid_code;                                            \
3313         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3314         rule += 0x100;   /* Distinguish it from the old format.  */     \
3315       }                                                                 \
3316   } while (0)
3317
     /* Turn the decoded composition rule RULE back into byte form and
        store it in the two cells charbuf[idx] and charbuf[idx + 1],
        adding the number of bytes produced to new_chars.  `charbuf',
        `idx' and `new_chars' are not parameters: they are the variables
        of those names at the expansion site (see finish_composition).

        GREF is (RULE % 0x100) / 12 and NREF is (RULE % 0x100) % 12.
        If RULE is less than 0x100 (old format), a GREF or NREF of 10 is
        changed back to 4 and the cells become [ 32 + GREF * 9 + NREF, -1 ];
        new_chars grows by one.  Otherwise (new format) the cells become
        [ 32 + 81 + GREF, 32 + NREF ] and new_chars grows by two.

        RULE is expanded without parentheses and evaluated three times,
        always before either cell is written, so RULE may be
        charbuf[idx + 1] itself.  */
3318 #define ENCODE_COMPOSITION_RULE(rule)                           \
3319   do {                                                          \
3320     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3321                                                                 \
3322     if (rule < 0x100)           /* old format */                \
3323       {                                                         \
3324         if (gref == 10) gref = 4;                               \
3325         if (nref == 10) nref = 4;                               \
3326         charbuf[idx] = 32 + gref * 9 + nref;                    \
3327         charbuf[idx + 1] = -1;                                  \
3328         new_chars++;                                            \
3329       }                                                         \
3330     else                                /* new format */        \
3331       {                                                         \
3332         charbuf[idx] = 32 + 81 + gref;                          \
3333         charbuf[idx + 1] = 32 + nref;                           \
3334         new_chars += 2;                                         \
3335       }                                                         \
3336   } while (0)
3337
3338 /* Finish the current composition as invalid.

        CHARBUF must point just past the elements stored for the
        composition; the former annotation header begins
        CMP_STATUS->length elements before it.  The five header cells
        are overwritten with ESC, the character that followed ESC in the
        start sequence ('0', '2', '3' or '4', chosen from
        CMP_STATUS->method), -2, 0 and -1.  When the method is
        COMPOSITION_WITH_RULE or greater, the rest is scanned: a cell
        holding -2 and the rule after it are replaced by the byte form
        of the rule, and a cell holding -1 and the cell after it are
        replaced by [ ESC '0' ].  CMP_STATUS->state is reset to
        COMPOSING_NO.

        Return CMP_STATUS->nchars, plus what ENCODE_COMPOSITION_RULE adds
        for each restored rule, plus two for each restored [ ESC '0' ].  */
3339
3340 static int
3341 finish_composition (int *charbuf, struct composition_status *cmp_status)
3342 {
3343   int idx = - cmp_status->length;
3344   int new_chars;
3345
       /* Restore ESC and the byte that followed it in the start sequence.  */
       charbuf[idx] = ISO_CODE_ESC;
       idx++;
       switch (cmp_status->method)
         {
         case COMPOSITION_RELATIVE:
           charbuf[idx] = '0';
           break;
         case COMPOSITION_WITH_RULE:
           charbuf[idx] = '2';
           break;
         case COMPOSITION_WITH_ALTCHARS:
           charbuf[idx] = '3';
           break;
         default:                /* COMPOSITION_WITH_RULE_ALTCHARS */
           charbuf[idx] = '4';
           break;
         }
       idx++;

       /* The remaining three cells of the former annotation header.  */
       charbuf[idx] = -2;
       charbuf[idx + 1] = 0;
       charbuf[idx + 2] = -1;
       idx += 3;

3356   new_chars = cmp_status->nchars;
3357   if (cmp_status->method >= COMPOSITION_WITH_RULE)
         while (idx < 0)
           {
             if (charbuf[idx] == -2)
               {
                 /* The macro refers to `charbuf', `idx' and `new_chars'
                    by name and rewrites the two cells at IDX.  */
3364             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
                 idx += 2;
               }
             else if (charbuf[idx] == -1)
               {
                 /* End-of-components marker: put back ESC '0'.  */
                 charbuf[idx] = ISO_CODE_ESC;
                 charbuf[idx + 1] = '0';
3371             new_chars += 2;
                 idx += 2;
               }
             else
               idx++;
           }
3374   cmp_status->state = COMPOSING_NO;
3375   return new_chars;
3376 }
3377
3378 /* If characters are under composition, finish the composition.

        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition (which ends the composition as invalid) and
        add its return value to char_offset.  `cmp_status', `charbuf' and
        `char_offset' are the variables of those names at the expansion
        site.  */
3379 #define MAYBE_FINISH_COMPOSITION()                              \
3380   do {                                                          \
3381     if (cmp_status->state != COMPOSING_NO)                      \
3382       char_offset += finish_composition (charbuf, cmp_status);  \
3383   } while (0)
3384
3385 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3386
3387    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3388    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3389    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3390    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3391
3392    Produce this annotation sequence now:
3393
3394    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        (ADD_COMPOSITION_DATA is given two zeros followed by METHOD, and
        the recorded length is MAX_ANNOTATION_LENGTH -- presumably 5, the
        number of header cells finish_composition overwrites; confirm
        against the definition.)

        C1 is the byte that followed ESC.  One case is handled
        differently: C1 == '0' while the state is COMPOSING_COMPONENT_CHAR
        under COMPOSITION_WITH_ALTCHARS, or COMPOSING_COMPONENT_RULE under
        COMPOSITION_WITH_RULE_ALTCHARS.  There the ESC 0 only separates
        the components from the composed chars, so [ -1 -1 ] is stored,
        the length grows by two and the state becomes COMPOSING_CHAR.

        In every other case a composition still in progress is first
        finished as invalid, then the method and state are set from C1
        (COMPOSING_CHAR for '0' and '2', COMPOSING_COMPONENT_CHAR
        otherwise), the header is emitted, the counters are cleared and
        coding->annotated is set.

        Uses `charbuf', `cmp_status', `char_offset' and `coding' from the
        expansion site.  C1 is evaluated several times.
3395 */
3396
3397 #define DECODE_COMPOSITION_START(c1)                                       \
3398   do {                                                                     \
3399     if (c1 == '0'                                                          \
3400         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3401              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3402             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3403                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3404       {                                                                    \
3405         *charbuf++ = -1;                                                   \
3406         *charbuf++= -1;                                                    \
3407         cmp_status->state = COMPOSING_CHAR;                                \
3408         cmp_status->length += 2;                                           \
3409       }                                                                    \
3410     else                                                                   \
3411       {                                                                    \
3412         MAYBE_FINISH_COMPOSITION ();                                       \
3413         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3414                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3415                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3416                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3417         cmp_status->state                                                  \
3418           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3419         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3420         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3421         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3422         coding->annotated = 1;                                             \
3423       }                                                                    \
3424   } while (0)
3425
3426
3427 /* Handle composition end sequence ESC 1.

        The sequence is rejected -- the composition is finished as
        invalid and control goes to `invalid_code' -- when no composed
        char has been stored, or when the state does not fit the method:
        under COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR
        (STORE_COMPOSITION_CHAR advances the state after each char under
        that method), under every other method it must be COMPOSING_CHAR.

        Otherwise the annotation header, which starts cmp_status->length
        elements before charbuf, is completed: its first cell (the
        negated length) is decreased by ncomps + 2 for
        COMPOSITION_WITH_ALTCHARS or by ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS, and its third cell receives
        nchars.  char_offset is advanced by nchars and the state becomes
        COMPOSING_NO.

        Uses `charbuf', `cmp_status' and `char_offset' from the expansion
        site.  */
3428
3429 #define DECODE_COMPOSITION_END()                                        \
3430   do {                                                                  \
3431     if (cmp_status->nchars == 0                                         \
3432         || ((cmp_status->state == COMPOSING_CHAR)                       \
3433             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3434       {                                                                 \
3435         MAYBE_FINISH_COMPOSITION ();                                    \
3436         goto invalid_code;                                              \
3437       }                                                                 \
3438     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3439       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3440     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3441       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3442     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3443     char_offset += cmp_status->nchars;                                  \
3444     cmp_status->state = COMPOSING_NO;                                   \
3445   } while (0)
3446
3447 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The rule is stored as the pair [ -2 RULE ], so charbuf advances
        by two and cmp_status->length grows by two.  The state is then
        decremented, undoing the increment STORE_COMPOSITION_CHAR applied
        after the preceding char (presumably a *_RULE state goes back to
        the matching *_CHAR state; the enum order is not visible here).
        `charbuf' and `cmp_status' come from the expansion site.  */
3448
3449 #define STORE_COMPOSITION_RULE(rule)    \
3450   do {                                  \
3451     *charbuf++ = -2;                    \
3452     *charbuf++ = rule;                  \
3453     cmp_status->length += 2;            \
3454     cmp_status->state--;                \
3455   } while (0)
3456
3457 /* Store a composed char or a component char C in charbuf, and update
3458    cmp_status.

        C is appended to charbuf and cmp_status->length grows by one.
        It counts as a composed char (nchars) when the state is
        COMPOSING_CHAR and as a component (ncomps) in any other state.
        Under COMPOSITION_WITH_RULE, and under
        COMPOSITION_WITH_RULE_ALTCHARS while in COMPOSING_COMPONENT_CHAR,
        the state is then incremented so that a rule is expected next;
        STORE_COMPOSITION_RULE decrements it again.  `charbuf' and
        `cmp_status' come from the expansion site.  */
3459
3460 #define STORE_COMPOSITION_CHAR(c)                                       \
3461   do {                                                                  \
3462     *charbuf++ = (c);                                                   \
3463     cmp_status->length++;                                               \
3464     if (cmp_status->state == COMPOSING_CHAR)                            \
3465       cmp_status->nchars++;                                             \
3466     else                                                                \
3467       cmp_status->ncomps++;                                             \
3468     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3469         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3470             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3471       cmp_status->state++;                                              \
3472   } while (0)
3473
3474
3475 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3476
3477 static void
3478 decode_coding_iso_2022 (struct coding_system *coding)
3479 {
3480   const unsigned char *src = coding->source + coding->consumed;
3481   const unsigned char *src_end = coding->source + coding->src_bytes;
3482   const unsigned char *src_base;
3483   int *charbuf = coding->charbuf + coding->charbuf_used;
3484   /* We may produce two annotations (charset and composition) in one
3485      loop and one more charset annotation at the end.  */
3486   int *charbuf_end
3487     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3488   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3489   bool multibytep = coding->src_multibyte;
3490   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3491   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3492   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3493   int charset_id_2, charset_id_3;
3494   struct charset *charset;
3495   int c;
3496   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3497   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3498   ptrdiff_t char_offset = coding->produced_char;
3499   ptrdiff_t last_offset = char_offset;
3500   int last_id = charset_ascii;
3501   bool eol_dos
3502     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3503   int byte_after_cr = -1;
3504   int i;
3505
3506   setup_iso_safe_charsets (attrs);
3507   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3508
3509   if (cmp_status->state != COMPOSING_NO)
3510     {
3511       if (charbuf_end - charbuf < cmp_status->length)
3512         emacs_abort ();
3513       for (i = 0; i < cmp_status->length; i++)
3514         *charbuf++ = cmp_status->carryover[i];
3515       coding->annotated = 1;
3516     }
3517
3518   while (1)
3519     {
3520       int c1, c2, c3;
3521
3522       src_base = src;
3523       consumed_chars_base = consumed_chars;
3524
3525       if (charbuf >= charbuf_end)
3526         {
3527           if (byte_after_cr >= 0)
3528             src_base--;
3529           break;
3530         }
3531
3532       if (byte_after_cr >= 0)
3533         c1 = byte_after_cr, byte_after_cr = -1;
3534       else
3535         ONE_MORE_BYTE (c1);
3536       if (c1 < 0)
3537         goto invalid_code;
3538
3539       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3540         {
3541           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3542           char_offset++;
3543           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3544           continue;
3545         }
3546
3547       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3548         {
3549           if (c1 == ISO_CODE_ESC)
3550             {
3551               if (src + 1 >= src_end)
3552                 goto no_more_source;
3553               *charbuf++ = ISO_CODE_ESC;
3554               char_offset++;
3555               if (src[0] == '%' && src[1] == '@')
3556                 {
3557                   src += 2;
3558                   consumed_chars += 2;
3559                   char_offset += 2;
3560                   /* We are sure charbuf can contain two more chars. */
3561                   *charbuf++ = '%';
3562                   *charbuf++ = '@';
3563                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3564                 }
3565             }
3566           else
3567             {
3568               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3569               char_offset++;
3570             }
3571           continue;
3572         }
3573
3574       if ((cmp_status->state == COMPOSING_RULE
3575            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3576           && c1 != ISO_CODE_ESC)
3577         {
3578           int rule;
3579
3580           DECODE_COMPOSITION_RULE (rule);
3581           STORE_COMPOSITION_RULE (rule);
3582           continue;
3583         }
3584
3585       /* We produce at most one character.  */
3586       switch (iso_code_class [c1])
3587         {
3588         case ISO_0x20_or_0x7F:
3589           if (charset_id_0 < 0
3590               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3591             /* This is SPACE or DEL.  */
3592             charset = CHARSET_FROM_ID (charset_ascii);
3593           else
3594             charset = CHARSET_FROM_ID (charset_id_0);
3595           break;
3596
3597         case ISO_graphic_plane_0:
3598           if (charset_id_0 < 0)
3599             charset = CHARSET_FROM_ID (charset_ascii);
3600           else
3601             charset = CHARSET_FROM_ID (charset_id_0);
3602           break;
3603
3604         case ISO_0xA0_or_0xFF:
3605           if (charset_id_1 < 0
3606               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3607               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3608             goto invalid_code;
3609           /* This is a graphic character, we fall down ... */
3610           FALLTHROUGH;
3611         case ISO_graphic_plane_1:
3612           if (charset_id_1 < 0)
3613             goto invalid_code;
3614           charset = CHARSET_FROM_ID (charset_id_1);
3615           break;
3616
3617         case ISO_control_0:
3618           if (eol_dos && c1 == '\r')
3619             ONE_MORE_BYTE (byte_after_cr);
3620           MAYBE_FINISH_COMPOSITION ();
3621           charset = CHARSET_FROM_ID (charset_ascii);
3622           break;
3623
3624         case ISO_control_1:
3625           goto invalid_code;
3626
3627         case ISO_shift_out:
3628           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3629               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3630             goto invalid_code;
3631           CODING_ISO_INVOCATION (coding, 0) = 1;
3632           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633           continue;
3634
3635         case ISO_shift_in:
3636           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3637             goto invalid_code;
3638           CODING_ISO_INVOCATION (coding, 0) = 0;
3639           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3640           continue;
3641
3642         case ISO_single_shift_2_7:
3643           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3644             goto invalid_code;
3645           FALLTHROUGH;
3646         case ISO_single_shift_2:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3648             goto invalid_code;
3649           /* SS2 is handled as an escape sequence of ESC 'N' */
3650           c1 = 'N';
3651           goto label_escape_sequence;
3652
3653         case ISO_single_shift_3:
3654           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3655             goto invalid_code;
3656           /* SS3 is handled as an escape sequence of ESC 'O' */
3657           c1 = 'O';
3658           goto label_escape_sequence;
3659
3660         case ISO_control_sequence_introducer:
3661           /* CSI is handled as an escape sequence of ESC '[' ...  */
3662           c1 = '[';
3663           goto label_escape_sequence;
3664
3665         case ISO_escape:
3666           ONE_MORE_BYTE (c1);
3667         label_escape_sequence:
3668           /* Escape sequences handled here are invocation,
3669              designation, direction specification, and character
3670              composition specification.  */
3671           switch (c1)
3672             {
3673             case '&':           /* revision of following character set */
3674               ONE_MORE_BYTE (c1);
3675               if (!(c1 >= '@' && c1 <= '~'))
3676                 goto invalid_code;
3677               ONE_MORE_BYTE (c1);
3678               if (c1 != ISO_CODE_ESC)
3679                 goto invalid_code;
3680               ONE_MORE_BYTE (c1);
3681               goto label_escape_sequence;
3682
3683             case '$':           /* designation of 2-byte character set */
3684               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3685                 goto invalid_code;
3686               {
3687                 int reg, chars96;
3688
3689                 ONE_MORE_BYTE (c1);
3690                 if (c1 >= '@' && c1 <= 'B')
3691                   {     /* designation of JISX0208.1978, GB2312.1980,
3692                            or JISX0208.1980 */
3693                     reg = 0, chars96 = 0;
3694                   }
3695                 else if (c1 >= 0x28 && c1 <= 0x2B)
3696                   { /* designation of DIMENSION2_CHARS94 character set */
3697                     reg = c1 - 0x28, chars96 = 0;
3698                     ONE_MORE_BYTE (c1);
3699                   }
3700                 else if (c1 >= 0x2C && c1 <= 0x2F)
3701                   { /* designation of DIMENSION2_CHARS96 character set */
3702                     reg = c1 - 0x2C, chars96 = 1;
3703                     ONE_MORE_BYTE (c1);
3704                   }
3705                 else
3706                   goto invalid_code;
3707                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3708                 /* We must update these variables now.  */
3709                 if (reg == 0)
3710                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3711                 else if (reg == 1)
3712                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3713                 if (chars96 < 0)
3714                   goto invalid_code;
3715               }
3716               continue;
3717
3718             case 'n':           /* invocation of locking-shift-2 */
3719               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3720                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3721                 goto invalid_code;
3722               CODING_ISO_INVOCATION (coding, 0) = 2;
3723               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3724               continue;
3725
3726             case 'o':           /* invocation of locking-shift-3 */
3727               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3728                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3729                 goto invalid_code;
3730               CODING_ISO_INVOCATION (coding, 0) = 3;
3731               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3732               continue;
3733
3734             case 'N':           /* invocation of single-shift-2 */
3735               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3736                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3737                 goto invalid_code;
3738               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3739               if (charset_id_2 < 0)
3740                 charset = CHARSET_FROM_ID (charset_ascii);
3741               else
3742                 charset = CHARSET_FROM_ID (charset_id_2);
3743               ONE_MORE_BYTE (c1);
3744               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3745                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3746                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3747                           ? c1 >= 0x80 : c1 < 0x80)))
3748                 goto invalid_code;
3749               break;
3750
3751             case 'O':           /* invocation of single-shift-3 */
3752               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3753                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3754                 goto invalid_code;
3755               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3756               if (charset_id_3 < 0)
3757                 charset = CHARSET_FROM_ID (charset_ascii);
3758               else
3759                 charset = CHARSET_FROM_ID (charset_id_3);
3760               ONE_MORE_BYTE (c1);
3761               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3762                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3763                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3764                           ? c1 >= 0x80 : c1 < 0x80)))
3765                 goto invalid_code;
3766               break;
3767
3768             case '0': case '2': case '3': case '4': /* start composition */
3769               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3770                 goto invalid_code;
3771               if (last_id != charset_ascii)
3772                 {
3773                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3774                   last_id = charset_ascii;
3775                   last_offset = char_offset;
3776                 }
3777               DECODE_COMPOSITION_START (c1);
3778               continue;
3779
3780             case '1':           /* end composition */
3781               if (cmp_status->state == COMPOSING_NO)
3782                 goto invalid_code;
3783               DECODE_COMPOSITION_END ();
3784               continue;
3785
3786             case '[':           /* specification of direction */
3787               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3788                 goto invalid_code;
3789               /* For the moment, nested direction is not supported.
3790                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3791                  left-to-right, and nonzero means right-to-left.  */
3792               ONE_MORE_BYTE (c1);
3793               switch (c1)
3794                 {
3795                 case ']':       /* end of the current direction */
3796                   coding->mode &= ~CODING_MODE_DIRECTION;
3797                   break;
3798
3799                 case '0':       /* end of the current direction */
3800                 case '1':       /* start of left-to-right direction */
3801                   ONE_MORE_BYTE (c1);
3802                   if (c1 == ']')
3803                     coding->mode &= ~CODING_MODE_DIRECTION;
3804                   else
3805                     goto invalid_code;
3806                   break;
3807
3808                 case '2':       /* start of right-to-left direction */
3809                   ONE_MORE_BYTE (c1);
3810                   if (c1 == ']')
3811                     coding->mode |= CODING_MODE_DIRECTION;
3812                   else
3813                     goto invalid_code;
3814                   break;
3815
3816                 default:
3817                   goto invalid_code;
3818                 }
3819               continue;
3820
3821             case '%':
3822               ONE_MORE_BYTE (c1);
3823               if (c1 == '/')
3824                 {
3825                   /* CTEXT extended segment:
3826                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3827                      We keep these bytes as is for the moment.
3828                      They may be decoded by post-read-conversion.  */
3829                   int dim, M, L;
3830                   int size;
3831
3832                   ONE_MORE_BYTE (dim);
3833                   if (dim < '0' || dim > '4')
3834                     goto invalid_code;
3835                   ONE_MORE_BYTE (M);
3836                   if (M < 128)
3837                     goto invalid_code;
3838                   ONE_MORE_BYTE (L);
3839                   if (L < 128)
3840                     goto invalid_code;
3841                   size = ((M - 128) * 128) + (L - 128);
3842                   if (charbuf + 6 > charbuf_end)
3843                     goto break_loop;
3844                   *charbuf++ = ISO_CODE_ESC;
3845                   *charbuf++ = '%';
3846                   *charbuf++ = '/';
3847                   *charbuf++ = dim;
3848                   *charbuf++ = BYTE8_TO_CHAR (M);
3849                   *charbuf++ = BYTE8_TO_CHAR (L);
3850                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3851                 }
3852               else if (c1 == 'G')
3853                 {
3854                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3855                      ESC % G --UTF-8-BYTES-- ESC % @
3856                      We keep these bytes as is for the moment.
3857                      They may be decoded by post-read-conversion.  */
3858                   if (charbuf + 3 > charbuf_end)
3859                     goto break_loop;
3860                   *charbuf++ = ISO_CODE_ESC;
3861                   *charbuf++ = '%';
3862                   *charbuf++ = 'G';
3863                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3864                 }
3865               else
3866                 goto invalid_code;
3867               continue;
3868               break;
3869
3870             default:
3871               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3872                 goto invalid_code;
3873               {
3874                 int reg, chars96;
3875
3876                 if (c1 >= 0x28 && c1 <= 0x2B)
3877                   { /* designation of DIMENSION1_CHARS94 character set */
3878                     reg = c1 - 0x28, chars96 = 0;
3879                     ONE_MORE_BYTE (c1);
3880                   }
3881                 else if (c1 >= 0x2C && c1 <= 0x2F)
3882                   { /* designation of DIMENSION1_CHARS96 character set */
3883                     reg = c1 - 0x2C, chars96 = 1;
3884                     ONE_MORE_BYTE (c1);
3885                   }
3886                 else
3887                   goto invalid_code;
3888                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3889                 /* We must update these variables now.  */
3890                 if (reg == 0)
3891                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3892                 else if (reg == 1)
3893                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3894                 if (chars96 < 0)
3895                   goto invalid_code;
3896               }
3897               continue;
3898             }
3899           break;
3900
3901         default:
3902           emacs_abort ();
3903         }
3904
3905       if (cmp_status->state == COMPOSING_NO
3906           && charset->id != charset_ascii
3907           && last_id != charset->id)
3908         {
3909           if (last_id != charset_ascii)
3910             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3911           last_id = charset->id;
3912           last_offset = char_offset;
3913         }
3914
3915       /* Now we know CHARSET and 1st position code C1 of a character.
3916          Produce a decoded character while getting 2nd and 3rd
3917          position codes C2, C3 if necessary.  */
3918       if (CHARSET_DIMENSION (charset) > 1)
3919         {
3920           ONE_MORE_BYTE (c2);
3921           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3922               || ((c1 & 0x80) != (c2 & 0x80)))
3923             /* C2 is not in a valid range.  */
3924             goto invalid_code;
3925           if (CHARSET_DIMENSION (charset) == 2)
3926             c1 = (c1 << 8) | c2;
3927           else
3928             {
3929               ONE_MORE_BYTE (c3);
3930               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3931                   || ((c1 & 0x80) != (c3 & 0x80)))
3932                 /* C3 is not in a valid range.  */
3933                 goto invalid_code;
3934               c1 = (c1 << 16) | (c2 << 8) | c2;
3935             }
3936         }
3937       c1 &= 0x7F7F7F;
3938       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3939       if (c < 0)
3940         {
3941           MAYBE_FINISH_COMPOSITION ();
3942           for (; src_base < src; src_base++, char_offset++)
3943             {
3944               if (ASCII_CHAR_P (*src_base))
3945                 *charbuf++ = *src_base;
3946               else
3947                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3948             }
3949         }
3950       else if (cmp_status->state == COMPOSING_NO)
3951         {
3952           *charbuf++ = c;
3953           char_offset++;
3954         }
3955       else if ((cmp_status->state == COMPOSING_CHAR
3956                 ? cmp_status->nchars
3957                 : cmp_status->ncomps)
3958                >= MAX_COMPOSITION_COMPONENTS)
3959         {
3960           /* Too long composition.  */
3961           MAYBE_FINISH_COMPOSITION ();
3962           *charbuf++ = c;
3963           char_offset++;
3964         }
3965       else
3966         STORE_COMPOSITION_CHAR (c);
3967       continue;
3968
3969     invalid_code:
3970       MAYBE_FINISH_COMPOSITION ();
3971       src = src_base;
3972       consumed_chars = consumed_chars_base;
3973       ONE_MORE_BYTE (c);
3974       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3975       char_offset++;
3976       /* Reset the invocation and designation status to the safest
3977          one; i.e. designate ASCII to the graphic register 0, and
3978          invoke that register to the graphic plane 0.  This typically
3979          helps the case that a designation sequence for ASCII "ESC (
3980          B" is somehow broken (e.g. broken by a newline).  */
3981       CODING_ISO_INVOCATION (coding, 0) = 0;
3982       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3983       charset_id_0 = charset_ascii;
3984       continue;
3985
3986     break_loop:
3987       break;
3988     }
3989
3990  no_more_source:
3991   if (cmp_status->state != COMPOSING_NO)
3992     {
3993       if (coding->mode & CODING_MODE_LAST_BLOCK)
3994         MAYBE_FINISH_COMPOSITION ();
3995       else
3996         {
3997           charbuf -= cmp_status->length;
3998           for (i = 0; i < cmp_status->length; i++)
3999             cmp_status->carryover[i] = charbuf[i];
4000         }
4001     }
4002   else if (last_id != charset_ascii)
4003     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4004   coding->consumed_char += consumed_chars_base;
4005   coding->consumed = src_base - coding->source;
4006   coding->charbuf_used = charbuf - coding->charbuf;
4007 }
4008
4009
4010 /* ISO2022 encoding stuff.  */
4011
/*
   Saying just "ISO2022" is not enough to specify an encoding; more
   details are needed.  In Emacs, each coding system of the ISO2022
   family has the following specifications:
4016         1. Initial designation to G0 thru G3.
4017         2. Allows short-form designation?
4018         3. ASCII should be designated to G0 before control characters?
4019         4. ASCII should be designated to G0 at end of line?
4020         5. 7-bit environment or 8-bit environment?
4021         6. Use locking-shift?
4022         7. Use Single-shift?
4023    And the following two are only for Japanese:
4024         8. Use ASCII in place of JIS0201-1976-Roman?
4025         9. Use JISX0208-1983 in place of JISX0208-1978?
4026    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4027    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4028    details.
4029 */
4030
/* Emit, at DST, the escape sequence that designates CHARSET to
   graphic register REG, advancing DST, and record the designation in
   CODING.

   The sequence is an optional revision announcement "ESC & <rev>"
   (only when CODING has CODING_ISO_FLAG_REVISION and CHARSET has a
   non-negative revision), then ESC, the intermediate byte(s), and the
   <final-char> of CHARSET.  For a multibyte 94-character set going to
   register 0 whose <final-char> is '@', 'A', or 'B', the short form
   without the register intermediate is produced unless CODING has
   CODING_ISO_FLAG_LONG_FORM.

   The arguments are evaluated more than once.  The EMIT_* macros used
   here rely on variables of the expansion site (DST among them).  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    /* Intermediate bytes, indexed by graphic register.  */             \
    const char *desig_inter                                             \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int desig_rev = -1;                                                 \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      desig_rev = CHARSET_ISO_REVISION (charset);                       \
    if (desig_rev >= 0)                                                 \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_rev);                                \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    /* Everything but the short form names the register.  */            \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || CHARSET_ISO_CHARS_96 (charset)                               \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || desig_final < '@' || desig_final > 'B')                      \
      EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                           \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4078
4079
/* Single-shift functions SS2 and SS3.  Each macro emits the shift
   code -- the single control byte normally, or the two-byte escape
   sequence (ESC N / ESC O) when CODING has CODING_ISO_FLAG_SEVEN_BITS
   -- and then marks CODING as being in single-shift state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4102
4103
/* The following four macros produce codes (control character or
   escape sequence) for the ISO2022 locking-shift functions: shift-in
   (SI), shift-out (SO), locking-shift-2 (LS2, "ESC n"), and
   locking-shift-3 (LS3, "ESC o").  Each of them also records in
   CODING which graphic register (0 to 3 respectively) is now invoked
   to graphic plane 0.  */

/* Invoke graphic register 0 to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Invoke graphic register 1 to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Invoke graphic register 2 to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Invoke graphic register 3 to graphic plane 0.  The final byte must
   be 'o': "ESC n" is LS2, and emitting it here would leave the
   recorded invocation (register 3) out of step with what a decoder
   of the byte stream invokes (register 2).  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4134
4135
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN and CHARSET is ASCII, it is replaced by
   the JISX0201-Roman charset.  C1 may be any expression; it is
   parenthesized wherever it is used, but it may be evaluated more
   than once.  The macro uses the variables CODING, DST, and
   PRODUCED_CHARS of the expansion site.

   The body loops: if CHARSET is not yet invoked to either graphic
   plane, encode_invocation_designation is called and the tests are
   repeated; every emitting branch leaves the loop with `break'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift applies to this one character only.  */       \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: emit with the high bit clear.  */ \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: emit with the high bit set.  */  \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4178
4179
/* Two-byte counterpart of ENCODE_ISO_CHARACTER_DIMENSION1: produce
   codes for a character of CHARSET whose position-codes are C1 and
   C2, preceded by whatever designation and invocation codes are
   still needed.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS and CHARSET is JISX0208, it is replaced
   by JISX0208-1978.  The macro uses the variables CODING, DST, and
   PRODUCED_CHARS of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
    /* 7: emit with high bits clear; 8: with high bits set;             \
       0: CHARSET cannot be emitted yet.  */                            \
    int iso_form;                                                       \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      iso_form = ((CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_SEVEN_BITS) ? 7 : 8);              \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      iso_form = 7;                                                     \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      iso_form = 8;                                                     \
    else                                                                \
      iso_form = 0;                                                     \
                                                                        \
    if (iso_form != 0)                                                  \
      {                                                                 \
        if (iso_form == 7)                                              \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        /* A pending single shift is used up by this character; when    \
           none was pending this stores the value already there.  */    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either graphic plane.  Designate       \
       and/or invoke it, then go around again to emit the bytes.  */    \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4222
4223
/* Encode character C of CHARSET.  CODING_ENCODE_CHAR yields the code
   of C in CHARSET; that code is then handed to the one-byte or
   two-byte producer according to the dimension of CHARSET.  For the
   two-byte case the code is split into the bits above the low byte
   (first position-code) and the low byte (second position-code).  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4234
4235
4236 /* Produce designation and invocation codes at a place pointed by DST
4237    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4238    Return new DST.  */
4239
4240 static unsigned char *
4241 encode_invocation_designation (struct charset *charset,
4242                                struct coding_system *coding,
4243                                unsigned char *dst, ptrdiff_t *p_nchars)
4244 {
4245   bool multibytep = coding->dst_multibyte;
4246   ptrdiff_t produced_chars = *p_nchars;
4247   int reg;                      /* graphic register number */
4248   int id = CHARSET_ID (charset);
4249
4250   /* At first, check designations.  */
4251   for (reg = 0; reg < 4; reg++)
4252     if (id == CODING_ISO_DESIGNATION (coding, reg))
4253       break;
4254
4255   if (reg >= 4)
4256     {
4257       /* CHARSET is not yet designated to any graphic registers.  */
4258       /* At first check the requested designation.  */
4259       reg = CODING_ISO_REQUEST (coding, id);
4260       if (reg < 0)
4261         /* Since CHARSET requests no special designation, designate it
4262            to graphic register 0.  */
4263         reg = 0;
4264
4265       ENCODE_DESIGNATION (charset, reg, coding);
4266     }
4267
4268   if (CODING_ISO_INVOCATION (coding, 0) != reg
4269       && CODING_ISO_INVOCATION (coding, 1) != reg)
4270     {
4271       /* Since the graphic register REG is not invoked to any graphic
4272          planes, invoke it to graphic plane 0.  */
4273       switch (reg)
4274         {
4275         case 0:                 /* graphic register 0 */
4276           ENCODE_SHIFT_IN;
4277           break;
4278
4279         case 1:                 /* graphic register 1 */
4280           ENCODE_SHIFT_OUT;
4281           break;
4282
4283         case 2:                 /* graphic register 2 */
4284           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4285             ENCODE_SINGLE_SHIFT_2;
4286           else
4287             ENCODE_LOCKING_SHIFT_2;
4288           break;
4289
4290         case 3:                 /* graphic register 3 */
4291           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4292             ENCODE_SINGLE_SHIFT_3;
4293           else
4294             ENCODE_LOCKING_SHIFT_3;
4295           break;
4296
4297         default:
4298           break;
4299         }
4300     }
4301
4302   *p_nchars = produced_chars;
4303   return dst;
4304 }
4305
4306
/* Bring the graphic planes and registers back to their initial state:
   emit a shift-in unless graphic register 0 is the one invoked to
   graphic plane 0, then re-designate every register whose initial
   charset is specified (non-negative) and differs from the charset
   currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int gr;                                                             \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (gr = 0; gr < 4; gr++)                                          \
      {                                                                 \
        struct charset *initial_cs;                                     \
                                                                        \
        if (CODING_ISO_INITIAL (coding, gr) < 0)                        \
          continue;                                                     \
        if (CODING_ISO_DESIGNATION (coding, gr)                         \
            == CODING_ISO_INITIAL (coding, gr))                         \
          continue;                                                     \
        initial_cs = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr)); \
        ENCODE_DESIGNATION (initial_cs, gr, coding);                    \
      }                                                                 \
  } while (0)
4325
4326
4327 /* Produce designation sequences of charsets in the line started from
4328    CHARBUF to a place pointed by DST, and return the number of
4329    produced bytes.  DST should not directly point a buffer text area
4330    which may be relocated by char_charset call.
4331
4332    If the current block ends before any end-of-line, we may fail to
4333    find all the necessary designations.  */
4334
4335 static ptrdiff_t
4336 encode_designation_at_bol (struct coding_system *coding,
4337                            int *charbuf, int *charbuf_end,
4338                            unsigned char *dst)
4339 {
4340   unsigned char *orig = dst;
4341   struct charset *charset;
4342   /* Table of charsets to be designated to each graphic register.  */
4343   int r[4];
4344   int c, found = 0, reg;
4345   ptrdiff_t produced_chars = 0;
4346   bool multibytep = coding->dst_multibyte;
4347   Lisp_Object attrs;
4348   Lisp_Object charset_list;
4349
4350   attrs = CODING_ID_ATTRS (coding->id);
4351   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4352   if (EQ (charset_list, Qiso_2022))
4353     charset_list = Viso_2022_charset_list;
4354
4355   for (reg = 0; reg < 4; reg++)
4356     r[reg] = -1;
4357
4358   while (charbuf < charbuf_end && found < 4)
4359     {
4360       int id;
4361
4362       c = *charbuf++;
4363       if (c == '\n')
4364         break;
4365       charset = char_charset (c, charset_list, NULL);
4366       id = CHARSET_ID (charset);
4367       reg = CODING_ISO_REQUEST (coding, id);
4368       if (reg >= 0 && r[reg] < 0)
4369         {
4370           found++;
4371           r[reg] = id;
4372         }
4373     }
4374
4375   if (found)
4376     {
4377       for (reg = 0; reg < 4; reg++)
4378         if (r[reg] >= 0
4379             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4380           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4381     }
4382
4383   return dst - orig;
4384 }
4385
4386 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4387
4388 static bool
4389 encode_coding_iso_2022 (struct coding_system *coding)
4390 {
4391   bool multibytep = coding->dst_multibyte;
4392   int *charbuf = coding->charbuf;
4393   int *charbuf_end = charbuf + coding->charbuf_used;
4394   unsigned char *dst = coding->destination + coding->produced;
4395   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4396   int safe_room = 16;
4397   bool bol_designation
4398     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4399        && CODING_ISO_BOL (coding));
4400   ptrdiff_t produced_chars = 0;
4401   Lisp_Object attrs, eol_type, charset_list;
4402   bool ascii_compatible;
4403   int c;
4404   int preferred_charset_id = -1;
4405
4406   CODING_GET_INFO (coding, attrs, charset_list);
4407   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4408   if (VECTORP (eol_type))
4409     eol_type = Qunix;
4410
4411   setup_iso_safe_charsets (attrs);
4412   /* Charset list may have been changed.  */
4413   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4414   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4415
4416   ascii_compatible
4417     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4418        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4419                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4420
4421   while (charbuf < charbuf_end)
4422     {
4423       ASSURE_DESTINATION (safe_room);
4424
4425       if (bol_designation)
4426         {
4427           /* We have to produce designation sequences if any now.  */
4428           unsigned char desig_buf[16];
4429           ptrdiff_t nbytes;
4430           ptrdiff_t offset;
4431
4432           charset_map_loaded = 0;
4433           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4434                                               desig_buf);
4435           if (charset_map_loaded
4436               && (offset = coding_change_destination (coding)))
4437             {
4438               dst += offset;
4439               dst_end += offset;
4440             }
4441           memcpy (dst, desig_buf, nbytes);
4442           dst += nbytes;
4443           /* We are sure that designation sequences are all ASCII bytes.  */
4444           produced_chars += nbytes;
4445           bol_designation = 0;
4446           ASSURE_DESTINATION (safe_room);
4447         }
4448
4449       c = *charbuf++;
4450
4451       if (c < 0)
4452         {
4453           /* Handle an annotation.  */
4454           switch (*charbuf)
4455             {
4456             case CODING_ANNOTATE_COMPOSITION_MASK:
4457               /* Not yet implemented.  */
4458               break;
4459             case CODING_ANNOTATE_CHARSET_MASK:
4460               preferred_charset_id = charbuf[2];
4461               if (preferred_charset_id >= 0
4462                   && NILP (Fmemq (make_number (preferred_charset_id),
4463                                   charset_list)))
4464                 preferred_charset_id = -1;
4465               break;
4466             default:
4467               emacs_abort ();
4468             }
4469           charbuf += -c - 1;
4470           continue;
4471         }
4472
4473       /* Now encode the character C.  */
4474       if (c < 0x20 || c == 0x7F)
4475         {
4476           if (c == '\n'
4477               || (c == '\r' && EQ (eol_type, Qmac)))
4478             {
4479               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4480                 ENCODE_RESET_PLANE_AND_REGISTER ();
4481               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4482                 {
4483                   int i;
4484
4485                   for (i = 0; i < 4; i++)
4486                     CODING_ISO_DESIGNATION (coding, i)
4487                       = CODING_ISO_INITIAL (coding, i);
4488                 }
4489               bol_designation = ((CODING_ISO_FLAGS (coding)
4490                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4491                                  != 0);
4492             }
4493           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4494             ENCODE_RESET_PLANE_AND_REGISTER ();
4495           EMIT_ONE_ASCII_BYTE (c);
4496         }
4497       else if (ASCII_CHAR_P (c))
4498         {
4499           if (ascii_compatible)
4500             EMIT_ONE_ASCII_BYTE (c);
4501           else
4502             {
4503               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4504               ENCODE_ISO_CHARACTER (charset, c);
4505             }
4506         }
4507       else if (CHAR_BYTE8_P (c))
4508         {
4509           c = CHAR_TO_BYTE8 (c);
4510           EMIT_ONE_BYTE (c);
4511         }
4512       else
4513         {
4514           struct charset *charset;
4515
4516           if (preferred_charset_id >= 0)
4517             {
4518               bool result;
4519
4520               charset = CHARSET_FROM_ID (preferred_charset_id);
4521               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4522               if (! result)
4523                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4524                                      NULL, charset);
4525             }
4526           else
4527             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4528                                  NULL, charset);
4529           if (!charset)
4530             {
4531               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4532                 {
4533                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4534                   charset = CHARSET_FROM_ID (charset_ascii);
4535                 }
4536               else
4537                 {
4538                   c = coding->default_char;
4539                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4540                                        charset_list, NULL, charset);
4541                 }
4542             }
4543           ENCODE_ISO_CHARACTER (charset, c);
4544         }
4545     }
4546
4547   if (coding->mode & CODING_MODE_LAST_BLOCK
4548       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4549     {
4550       ASSURE_DESTINATION (safe_room);
4551       ENCODE_RESET_PLANE_AND_REGISTER ();
4552     }
4553   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4554   CODING_ISO_BOL (coding) = bol_designation;
4555   coding->produced_char += produced_chars;
4556   coding->produced = dst - coding->destination;
4557   return 0;
4558 }
4559
4560 \f
4561 /*** 8. SJIS and BIG5 handlers ***/
4562
4563 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4564    quite widely.  So, for the moment, Emacs supports them directly in
4565    C code.  But, in the future, they may be supported only by CCL.  */
4566
4567 /* SJIS is a coding system encoding three character sets: ASCII, the
4568    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
4569    encoded as is.  A character of charset katakana-jisx0201 is encoded
4570    as "position-code + 0x80".  A character of charset japanese-jisx0208
4571    is encoded in two bytes, with its two position-codes divided and
4572    shifted so that they fit in the ranges below.
4573
4574    --- CODE RANGE of SJIS ---
4575    (character set)      (range)
4576    ASCII                0x00 .. 0x7F
4577    KATAKANA-JISX0201    0xA0 .. 0xDF
4578    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4579             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4580    -------------------------------
4581
4582 */
4583
4584 /* BIG5 is a coding system encoding two character sets: ASCII and
4585    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4586    character set and is encoded in two bytes.
4587
4588    --- CODE RANGE of BIG5 ---
4589    (character set)      (range)
4590    ASCII                0x00 .. 0x7F
4591    Big5 (1st byte)      0xA1 .. 0xFE
4592         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4593    --------------------------
4594
4595   */
4596
4597 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4598    Return true if a text is encoded in SJIS.  */
4599
4600 static bool
4601 detect_coding_sjis (struct coding_system *coding,
4602                     struct coding_detection_info *detect_info)
4603 {
4604   const unsigned char *src = coding->source, *src_base;
4605   const unsigned char *src_end = coding->source + coding->src_bytes;
4606   bool multibytep = coding->src_multibyte;
4607   ptrdiff_t consumed_chars = 0;
4608   int found = 0;
4609   int c;
4610   Lisp_Object attrs, charset_list;
4611   int max_first_byte_of_2_byte_code;
4612
4613   CODING_GET_INFO (coding, attrs, charset_list);
4614   max_first_byte_of_2_byte_code
4615     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4616
4617   detect_info->checked |= CATEGORY_MASK_SJIS;
4618   /* A coding system of this category is always ASCII compatible.  */
4619   src += coding->head_ascii;
4620
4621   while (1)
4622     {
4623       src_base = src;
4624       ONE_MORE_BYTE (c);
4625       if (c < 0x80)
4626         continue;
4627       if ((c >= 0x81 && c <= 0x9F)
4628           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4629         {
4630           ONE_MORE_BYTE (c);
4631           if (c < 0x40 || c == 0x7F || c > 0xFC)
4632             break;
4633           found = CATEGORY_MASK_SJIS;
4634         }
4635       else if (c >= 0xA0 && c < 0xE0)
4636         found = CATEGORY_MASK_SJIS;
4637       else
4638         break;
4639     }
4640   detect_info->rejected |= CATEGORY_MASK_SJIS;
4641   return 0;
4642
4643  no_more_source:
4644   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4645     {
4646       detect_info->rejected |= CATEGORY_MASK_SJIS;
4647       return 0;
4648     }
4649   detect_info->found |= found;
4650   return 1;
4651 }
4652
4653 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4654    Return true if a text is encoded in BIG5.  */
4655
4656 static bool
4657 detect_coding_big5 (struct coding_system *coding,
4658                     struct coding_detection_info *detect_info)
4659 {
4660   const unsigned char *src = coding->source, *src_base;
4661   const unsigned char *src_end = coding->source + coding->src_bytes;
4662   bool multibytep = coding->src_multibyte;
4663   ptrdiff_t consumed_chars = 0;
4664   int found = 0;
4665   int c;
4666
4667   detect_info->checked |= CATEGORY_MASK_BIG5;
4668   /* A coding system of this category is always ASCII compatible.  */
4669   src += coding->head_ascii;
4670
4671   while (1)
4672     {
4673       src_base = src;
4674       ONE_MORE_BYTE (c);
4675       if (c < 0x80)
4676         continue;
4677       if (c >= 0xA1)
4678         {
4679           ONE_MORE_BYTE (c);
4680           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4681             return 0;
4682           found = CATEGORY_MASK_BIG5;
4683         }
4684       else
4685         break;
4686     }
4687   detect_info->rejected |= CATEGORY_MASK_BIG5;
4688   return 0;
4689
4690  no_more_source:
4691   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4692     {
4693       detect_info->rejected |= CATEGORY_MASK_BIG5;
4694       return 0;
4695     }
4696   detect_info->found |= found;
4697   return 1;
4698 }
4699
4700 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4701
4702 static void
4703 decode_coding_sjis (struct coding_system *coding)
4704 {
4705   const unsigned char *src = coding->source + coding->consumed;
4706   const unsigned char *src_end = coding->source + coding->src_bytes;
4707   const unsigned char *src_base;
4708   int *charbuf = coding->charbuf + coding->charbuf_used;
4709   /* We may produce one charset annotation in one loop and one more at
4710      the end.  */
4711   int *charbuf_end
4712     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4713   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4714   bool multibytep = coding->src_multibyte;
4715   struct charset *charset_roman, *charset_kanji, *charset_kana;
4716   struct charset *charset_kanji2;
4717   Lisp_Object attrs, charset_list, val;
4718   ptrdiff_t char_offset = coding->produced_char;
4719   ptrdiff_t last_offset = char_offset;
4720   int last_id = charset_ascii;
4721   bool eol_dos
4722     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4723   int byte_after_cr = -1;
4724
4725   CODING_GET_INFO (coding, attrs, charset_list);
4726
4727   val = charset_list;
4728   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4729   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4730   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4731   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4732
4733   while (1)
4734     {
4735       int c, c1;
4736       struct charset *charset;
4737
4738       src_base = src;
4739       consumed_chars_base = consumed_chars;
4740
4741       if (charbuf >= charbuf_end)
4742         {
4743           if (byte_after_cr >= 0)
4744             src_base--;
4745           break;
4746         }
4747
4748       if (byte_after_cr >= 0)
4749         c = byte_after_cr, byte_after_cr = -1;
4750       else
4751         ONE_MORE_BYTE (c);
4752       if (c < 0)
4753         goto invalid_code;
4754       if (c < 0x80)
4755         {
4756           if (eol_dos && c == '\r')
4757             ONE_MORE_BYTE (byte_after_cr);
4758           charset = charset_roman;
4759         }
4760       else if (c == 0x80 || c == 0xA0)
4761         goto invalid_code;
4762       else if (c >= 0xA1 && c <= 0xDF)
4763         {
4764           /* SJIS -> JISX0201-Kana */
4765           c &= 0x7F;
4766           charset = charset_kana;
4767         }
4768       else if (c <= 0xEF)
4769         {
4770           /* SJIS -> JISX0208 */
4771           ONE_MORE_BYTE (c1);
4772           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4773             goto invalid_code;
4774           c = (c << 8) | c1;
4775           SJIS_TO_JIS (c);
4776           charset = charset_kanji;
4777         }
4778       else if (c <= 0xFC && charset_kanji2)
4779         {
4780           /* SJIS -> JISX0213-2 */
4781           ONE_MORE_BYTE (c1);
4782           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4783             goto invalid_code;
4784           c = (c << 8) | c1;
4785           SJIS_TO_JIS2 (c);
4786           charset = charset_kanji2;
4787         }
4788       else
4789         goto invalid_code;
4790       if (charset->id != charset_ascii
4791           && last_id != charset->id)
4792         {
4793           if (last_id != charset_ascii)
4794             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4795           last_id = charset->id;
4796           last_offset = char_offset;
4797         }
4798       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4799       *charbuf++ = c;
4800       char_offset++;
4801       continue;
4802
4803     invalid_code:
4804       src = src_base;
4805       consumed_chars = consumed_chars_base;
4806       ONE_MORE_BYTE (c);
4807       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4808       char_offset++;
4809     }
4810
4811  no_more_source:
4812   if (last_id != charset_ascii)
4813     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4814   coding->consumed_char += consumed_chars_base;
4815   coding->consumed = src_base - coding->source;
4816   coding->charbuf_used = charbuf - coding->charbuf;
4817 }
4818
4819 static void
4820 decode_coding_big5 (struct coding_system *coding)
4821 {
4822   const unsigned char *src = coding->source + coding->consumed;
4823   const unsigned char *src_end = coding->source + coding->src_bytes;
4824   const unsigned char *src_base;
4825   int *charbuf = coding->charbuf + coding->charbuf_used;
4826   /* We may produce one charset annotation in one loop and one more at
4827      the end.  */
4828   int *charbuf_end
4829     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4830   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4831   bool multibytep = coding->src_multibyte;
4832   struct charset *charset_roman, *charset_big5;
4833   Lisp_Object attrs, charset_list, val;
4834   ptrdiff_t char_offset = coding->produced_char;
4835   ptrdiff_t last_offset = char_offset;
4836   int last_id = charset_ascii;
4837   bool eol_dos
4838     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4839   int byte_after_cr = -1;
4840
4841   CODING_GET_INFO (coding, attrs, charset_list);
4842   val = charset_list;
4843   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4844   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4845
4846   while (1)
4847     {
4848       int c, c1;
4849       struct charset *charset;
4850
4851       src_base = src;
4852       consumed_chars_base = consumed_chars;
4853
4854       if (charbuf >= charbuf_end)
4855         {
4856           if (byte_after_cr >= 0)
4857             src_base--;
4858           break;
4859         }
4860
4861       if (byte_after_cr >= 0)
4862         c = byte_after_cr, byte_after_cr = -1;
4863       else
4864         ONE_MORE_BYTE (c);
4865
4866       if (c < 0)
4867         goto invalid_code;
4868       if (c < 0x80)
4869         {
4870           if (eol_dos && c == '\r')
4871             ONE_MORE_BYTE (byte_after_cr);
4872           charset = charset_roman;
4873         }
4874       else
4875         {
4876           /* BIG5 -> Big5 */
4877           if (c < 0xA1 || c > 0xFE)
4878             goto invalid_code;
4879           ONE_MORE_BYTE (c1);
4880           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4881             goto invalid_code;
4882           c = c << 8 | c1;
4883           charset = charset_big5;
4884         }
4885       if (charset->id != charset_ascii
4886           && last_id != charset->id)
4887         {
4888           if (last_id != charset_ascii)
4889             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4890           last_id = charset->id;
4891           last_offset = char_offset;
4892         }
4893       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4894       *charbuf++ = c;
4895       char_offset++;
4896       continue;
4897
4898     invalid_code:
4899       src = src_base;
4900       consumed_chars = consumed_chars_base;
4901       ONE_MORE_BYTE (c);
4902       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4903       char_offset++;
4904     }
4905
4906  no_more_source:
4907   if (last_id != charset_ascii)
4908     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4909   coding->consumed_char += consumed_chars_base;
4910   coding->consumed = src_base - coding->source;
4911   coding->charbuf_used = charbuf - coding->charbuf;
4912 }
4913
4914 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4915    This function can encode charsets `ascii', `katakana-jisx0201',
4916    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4917    are sure that all these charsets are registered as official charset
4918    (i.e. do not have extended leading-codes).  Characters of other
4919    charsets are produced without any encoding.  */
4920
4921 static bool
4922 encode_coding_sjis (struct coding_system *coding)
4923 {
4924   bool multibytep = coding->dst_multibyte;
4925   int *charbuf = coding->charbuf;
4926   int *charbuf_end = charbuf + coding->charbuf_used;
4927   unsigned char *dst = coding->destination + coding->produced;
4928   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4929   int safe_room = 4;
4930   ptrdiff_t produced_chars = 0;
4931   Lisp_Object attrs, charset_list, val;
4932   bool ascii_compatible;
4933   struct charset *charset_kanji, *charset_kana;
4934   struct charset *charset_kanji2;
4935   int c;
4936
4937   CODING_GET_INFO (coding, attrs, charset_list);
4938   val = XCDR (charset_list);
4939   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4940   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4941   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4942
4943   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4944
4945   while (charbuf < charbuf_end)
4946     {
4947       ASSURE_DESTINATION (safe_room);
4948       c = *charbuf++;
4949       /* Now encode the character C.  */
4950       if (ASCII_CHAR_P (c) && ascii_compatible)
4951         EMIT_ONE_ASCII_BYTE (c);
4952       else if (CHAR_BYTE8_P (c))
4953         {
4954           c = CHAR_TO_BYTE8 (c);
4955           EMIT_ONE_BYTE (c);
4956         }
4957       else
4958         {
4959           unsigned code;
4960           struct charset *charset;
4961           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4962                                &code, charset);
4963
4964           if (!charset)
4965             {
4966               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4967                 {
4968                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4969                   charset = CHARSET_FROM_ID (charset_ascii);
4970                 }
4971               else
4972                 {
4973                   c = coding->default_char;
4974                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4975                                        charset_list, &code, charset);
4976                 }
4977             }
4978           if (code == CHARSET_INVALID_CODE (charset))
4979             emacs_abort ();
4980           if (charset == charset_kanji)
4981             {
4982               int c1, c2;
4983               JIS_TO_SJIS (code);
4984               c1 = code >> 8, c2 = code & 0xFF;
4985               EMIT_TWO_BYTES (c1, c2);
4986             }
4987           else if (charset == charset_kana)
4988             EMIT_ONE_BYTE (code | 0x80);
4989           else if (charset_kanji2 && charset == charset_kanji2)
4990             {
4991               int c1, c2;
4992
4993               c1 = code >> 8;
4994               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4995                   || c1 == 0x28
4996                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4997                 {
4998                   JIS_TO_SJIS2 (code);
4999                   c1 = code >> 8, c2 = code & 0xFF;
5000                   EMIT_TWO_BYTES (c1, c2);
5001                 }
5002               else
5003                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5004             }
5005           else
5006             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5007         }
5008     }
5009   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5010   coding->produced_char += produced_chars;
5011   coding->produced = dst - coding->destination;
5012   return 0;
5013 }
5014
5015 static bool
5016 encode_coding_big5 (struct coding_system *coding)
5017 {
5018   bool multibytep = coding->dst_multibyte;
5019   int *charbuf = coding->charbuf;
5020   int *charbuf_end = charbuf + coding->charbuf_used;
5021   unsigned char *dst = coding->destination + coding->produced;
5022   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5023   int safe_room = 4;
5024   ptrdiff_t produced_chars = 0;
5025   Lisp_Object attrs, charset_list, val;
5026   bool ascii_compatible;
5027   struct charset *charset_big5;
5028   int c;
5029
5030   CODING_GET_INFO (coding, attrs, charset_list);
5031   val = XCDR (charset_list);
5032   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5033   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5034
5035   while (charbuf < charbuf_end)
5036     {
5037       ASSURE_DESTINATION (safe_room);
5038       c = *charbuf++;
5039       /* Now encode the character C.  */
5040       if (ASCII_CHAR_P (c) && ascii_compatible)
5041         EMIT_ONE_ASCII_BYTE (c);
5042       else if (CHAR_BYTE8_P (c))
5043         {
5044           c = CHAR_TO_BYTE8 (c);
5045           EMIT_ONE_BYTE (c);
5046         }
5047       else
5048         {
5049           unsigned code;
5050           struct charset *charset;
5051           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5052                                &code, charset);
5053
5054           if (! charset)
5055             {
5056               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5057                 {
5058                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5059                   charset = CHARSET_FROM_ID (charset_ascii);
5060                 }
5061               else
5062                 {
5063                   c = coding->default_char;
5064                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5065                                        charset_list, &code, charset);
5066                 }
5067             }
5068           if (code == CHARSET_INVALID_CODE (charset))
5069             emacs_abort ();
5070           if (charset == charset_big5)
5071             {
5072               int c1, c2;
5073
5074               c1 = code >> 8, c2 = code & 0xFF;
5075               EMIT_TWO_BYTES (c1, c2);
5076             }
5077           else
5078             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5079         }
5080     }
5081   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5082   coding->produced_char += produced_chars;
5083   coding->produced = dst - coding->destination;
5084   return 0;
5085 }
5086
5087 \f
5088 /*** 9. CCL handlers ***/
5089
5090 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5091    Return true if a text is encoded in a coding system of which
5092    encoder/decoder are written in CCL program.  */
5093
5094 static bool
5095 detect_coding_ccl (struct coding_system *coding,
5096                    struct coding_detection_info *detect_info)
5097 {
5098   const unsigned char *src = coding->source, *src_base;
5099   const unsigned char *src_end = coding->source + coding->src_bytes;
5100   bool multibytep = coding->src_multibyte;
5101   ptrdiff_t consumed_chars = 0;
5102   int found = 0;
5103   unsigned char *valids;
5104   ptrdiff_t head_ascii = coding->head_ascii;
5105   Lisp_Object attrs;
5106
5107   detect_info->checked |= CATEGORY_MASK_CCL;
5108
5109   coding = &coding_categories[coding_category_ccl];
5110   valids = CODING_CCL_VALIDS (coding);
5111   attrs = CODING_ID_ATTRS (coding->id);
5112   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5113     src += head_ascii;
5114
5115   while (1)
5116     {
5117       int c;
5118
5119       src_base = src;
5120       ONE_MORE_BYTE (c);
5121       if (c < 0 || ! valids[c])
5122         break;
5123       if ((valids[c] > 1))
5124         found = CATEGORY_MASK_CCL;
5125     }
5126   detect_info->rejected |= CATEGORY_MASK_CCL;
5127   return 0;
5128
5129  no_more_source:
5130   detect_info->found |= found;
5131   return 1;
5132 }
5133
5134 static void
5135 decode_coding_ccl (struct coding_system *coding)
5136 {
5137   const unsigned char *src = coding->source + coding->consumed;
5138   const unsigned char *src_end = coding->source + coding->src_bytes;
5139   int *charbuf = coding->charbuf + coding->charbuf_used;
5140   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5141   ptrdiff_t consumed_chars = 0;
5142   bool multibytep = coding->src_multibyte;
5143   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5144   int source_charbuf[1024];
5145   int source_byteidx[1025];
5146   Lisp_Object attrs, charset_list;
5147
5148   CODING_GET_INFO (coding, attrs, charset_list);
5149
5150   while (1)
5151     {
5152       const unsigned char *p = src;
5153       ptrdiff_t offset;
5154       int i = 0;
5155
5156       if (multibytep)
5157         {
5158           while (i < 1024 && p < src_end)
5159             {
5160               source_byteidx[i] = p - src;
5161               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5162             }
5163           source_byteidx[i] = p - src;
5164         }
5165       else
5166         while (i < 1024 && p < src_end)
5167           source_charbuf[i++] = *p++;
5168
5169       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5170         ccl->last_block = true;
5171       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5172       charset_map_loaded = 0;
5173       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5174                   charset_list);
5175       if (charset_map_loaded
5176           && (offset = coding_change_source (coding)))
5177         {
5178           p += offset;
5179           src += offset;
5180           src_end += offset;
5181         }
5182       charbuf += ccl->produced;
5183       if (multibytep)
5184         src += source_byteidx[ccl->consumed];
5185       else
5186         src += ccl->consumed;
5187       consumed_chars += ccl->consumed;
5188       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5189         break;
5190     }
5191
5192   switch (ccl->status)
5193     {
5194     case CCL_STAT_SUSPEND_BY_SRC:
5195       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5196       break;
5197     case CCL_STAT_SUSPEND_BY_DST:
5198       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5199       break;
5200     case CCL_STAT_QUIT:
5201     case CCL_STAT_INVALID_CMD:
5202       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5203       break;
5204     default:
5205       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5206       break;
5207     }
5208   coding->consumed_char += consumed_chars;
5209   coding->consumed = src - coding->source;
5210   coding->charbuf_used = charbuf - coding->charbuf;
5211 }
5212
5213 static bool
5214 encode_coding_ccl (struct coding_system *coding)
5215 {
5216   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5217   bool multibytep = coding->dst_multibyte;
5218   int *charbuf = coding->charbuf;
5219   int *charbuf_end = charbuf + coding->charbuf_used;
5220   unsigned char *dst = coding->destination + coding->produced;
5221   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5222   int destination_charbuf[1024];
5223   ptrdiff_t produced_chars = 0;
5224   int i;
5225   Lisp_Object attrs, charset_list;
5226
5227   CODING_GET_INFO (coding, attrs, charset_list);
5228   if (coding->consumed_char == coding->src_chars
5229       && coding->mode & CODING_MODE_LAST_BLOCK)
5230     ccl->last_block = true;
5231
5232   do
5233     {
5234       ptrdiff_t offset;
5235
5236       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5237       charset_map_loaded = 0;
5238       ccl_driver (ccl, charbuf, destination_charbuf,
5239                   charbuf_end - charbuf, 1024, charset_list);
5240       if (charset_map_loaded
5241           && (offset = coding_change_destination (coding)))
5242         dst += offset;
5243       if (multibytep)
5244         {
5245           ASSURE_DESTINATION (ccl->produced * 2);
5246           for (i = 0; i < ccl->produced; i++)
5247             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5248         }
5249       else
5250         {
5251           ASSURE_DESTINATION (ccl->produced);
5252           for (i = 0; i < ccl->produced; i++)
5253             *dst++ = destination_charbuf[i] & 0xFF;
5254           produced_chars += ccl->produced;
5255         }
5256       charbuf += ccl->consumed;
5257       if (ccl->status == CCL_STAT_QUIT
5258           || ccl->status == CCL_STAT_INVALID_CMD)
5259         break;
5260     }
5261   while (charbuf < charbuf_end);
5262
5263   switch (ccl->status)
5264     {
5265     case CCL_STAT_SUSPEND_BY_SRC:
5266       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5267       break;
5268     case CCL_STAT_SUSPEND_BY_DST:
5269       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5270       break;
5271     case CCL_STAT_QUIT:
5272     case CCL_STAT_INVALID_CMD:
5273       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5274       break;
5275     default:
5276       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5277       break;
5278     }
5279
5280   coding->produced_char += produced_chars;
5281   coding->produced = dst - coding->destination;
5282   return 0;
5283 }
5284
5285 \f
5286 /*** 10, 11. no-conversion handlers ***/
5287
5288 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5289
5290 static void
5291 decode_coding_raw_text (struct coding_system *coding)
5292 {
5293   bool eol_dos
5294     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5295
5296   coding->chars_at_source = 1;
5297   coding->consumed_char = coding->src_chars;
5298   coding->consumed = coding->src_bytes;
5299   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5300     {
5301       coding->consumed_char--;
5302       coding->consumed--;
5303       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5304     }
5305   else
5306     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5307 }
5308
5309 static bool
5310 encode_coding_raw_text (struct coding_system *coding)
5311 {
5312   bool multibytep = coding->dst_multibyte;
5313   int *charbuf = coding->charbuf;
5314   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5315   unsigned char *dst = coding->destination + coding->produced;
5316   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5317   ptrdiff_t produced_chars = 0;
5318   int c;
5319
5320   if (multibytep)
5321     {
5322       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5323
5324       if (coding->src_multibyte)
5325         while (charbuf < charbuf_end)
5326           {
5327             ASSURE_DESTINATION (safe_room);
5328             c = *charbuf++;
5329             if (ASCII_CHAR_P (c))
5330               EMIT_ONE_ASCII_BYTE (c);
5331             else if (CHAR_BYTE8_P (c))
5332               {
5333                 c = CHAR_TO_BYTE8 (c);
5334                 EMIT_ONE_BYTE (c);
5335               }
5336             else
5337               {
5338                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5339
5340                 CHAR_STRING_ADVANCE (c, p1);
5341                 do
5342                   {
5343                     EMIT_ONE_BYTE (*p0);
5344                     p0++;
5345                   }
5346                 while (p0 < p1);
5347               }
5348           }
5349       else
5350         while (charbuf < charbuf_end)
5351           {
5352             ASSURE_DESTINATION (safe_room);
5353             c = *charbuf++;
5354             EMIT_ONE_BYTE (c);
5355           }
5356     }
5357   else
5358     {
5359       if (coding->src_multibyte)
5360         {
5361           int safe_room = MAX_MULTIBYTE_LENGTH;
5362
5363           while (charbuf < charbuf_end)
5364             {
5365               ASSURE_DESTINATION (safe_room);
5366               c = *charbuf++;
5367               if (ASCII_CHAR_P (c))
5368                 *dst++ = c;
5369               else if (CHAR_BYTE8_P (c))
5370                 *dst++ = CHAR_TO_BYTE8 (c);
5371               else
5372                 CHAR_STRING_ADVANCE (c, dst);
5373             }
5374         }
5375       else
5376         {
5377           ASSURE_DESTINATION (charbuf_end - charbuf);
5378           while (charbuf < charbuf_end && dst < dst_end)
5379             *dst++ = *charbuf++;
5380         }
5381       produced_chars = dst - (coding->destination + coding->produced);
5382     }
5383   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5384   coding->produced_char += produced_chars;
5385   coding->produced = dst - coding->destination;
5386   return 0;
5387 }
5388
5389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5390    Return true if a text is encoded in a charset-based coding system.  */
5391
5392 static bool
5393 detect_coding_charset (struct coding_system *coding,
5394                        struct coding_detection_info *detect_info)
5395 {
5396   const unsigned char *src = coding->source, *src_base;
5397   const unsigned char *src_end = coding->source + coding->src_bytes;
5398   bool multibytep = coding->src_multibyte;
5399   ptrdiff_t consumed_chars = 0;
5400   Lisp_Object attrs, valids, name;
5401   int found = 0;
5402   ptrdiff_t head_ascii = coding->head_ascii;
5403   bool check_latin_extra = 0;
5404
5405   detect_info->checked |= CATEGORY_MASK_CHARSET;
5406
5407   coding = &coding_categories[coding_category_charset];
5408   attrs = CODING_ID_ATTRS (coding->id);
5409   valids = AREF (attrs, coding_attr_charset_valids);
5410   name = CODING_ID_NAME (coding->id);
5411   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5412                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5413       || strncmp (SSDATA (SYMBOL_NAME (name)),
5414                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5415     check_latin_extra = 1;
5416
5417   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5418     src += head_ascii;
5419
5420   while (1)
5421     {
5422       int c;
5423       Lisp_Object val;
5424       struct charset *charset;
5425       int dim, idx;
5426
5427       src_base = src;
5428       ONE_MORE_BYTE (c);
5429       if (c < 0)
5430         continue;
5431       val = AREF (valids, c);
5432       if (NILP (val))
5433         break;
5434       if (c >= 0x80)
5435         {
5436           if (c < 0xA0
5437               && check_latin_extra
5438               && (!VECTORP (Vlatin_extra_code_table)
5439                   || NILP (AREF (Vlatin_extra_code_table, c))))
5440             break;
5441           found = CATEGORY_MASK_CHARSET;
5442         }
5443       if (INTEGERP (val))
5444         {
5445           charset = CHARSET_FROM_ID (XFASTINT (val));
5446           dim = CHARSET_DIMENSION (charset);
5447           for (idx = 1; idx < dim; idx++)
5448             {
5449               if (src == src_end)
5450                 goto too_short;
5451               ONE_MORE_BYTE (c);
5452               if (c < charset->code_space[(dim - 1 - idx) * 4]
5453                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5454                 break;
5455             }
5456           if (idx < dim)
5457             break;
5458         }
5459       else
5460         {
5461           idx = 1;
5462           for (; CONSP (val); val = XCDR (val))
5463             {
5464               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5465               dim = CHARSET_DIMENSION (charset);
5466               while (idx < dim)
5467                 {
5468                   if (src == src_end)
5469                     goto too_short;
5470                   ONE_MORE_BYTE (c);
5471                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5472                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5473                     break;
5474                   idx++;
5475                 }
5476               if (idx == dim)
5477                 {
5478                   val = Qnil;
5479                   break;
5480                 }
5481             }
5482           if (CONSP (val))
5483             break;
5484         }
5485     }
5486  too_short:
5487   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5488   return 0;
5489
5490  no_more_source:
5491   detect_info->found |= found;
5492   return 1;
5493 }
5494
5495 static void
5496 decode_coding_charset (struct coding_system *coding)
5497 {
5498   const unsigned char *src = coding->source + coding->consumed;
5499   const unsigned char *src_end = coding->source + coding->src_bytes;
5500   const unsigned char *src_base;
5501   int *charbuf = coding->charbuf + coding->charbuf_used;
5502   /* We may produce one charset annotation in one loop and one more at
5503      the end.  */
5504   int *charbuf_end
5505     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5506   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5507   bool multibytep = coding->src_multibyte;
5508   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5509   Lisp_Object valids;
5510   ptrdiff_t char_offset = coding->produced_char;
5511   ptrdiff_t last_offset = char_offset;
5512   int last_id = charset_ascii;
5513   bool eol_dos
5514     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5515   int byte_after_cr = -1;
5516
5517   valids = AREF (attrs, coding_attr_charset_valids);
5518
5519   while (1)
5520     {
5521       int c;
5522       Lisp_Object val;
5523       struct charset *charset;
5524       int dim;
5525       int len = 1;
5526       unsigned code;
5527
5528       src_base = src;
5529       consumed_chars_base = consumed_chars;
5530
5531       if (charbuf >= charbuf_end)
5532         {
5533           if (byte_after_cr >= 0)
5534             src_base--;
5535           break;
5536         }
5537
5538       if (byte_after_cr >= 0)
5539         {
5540           c = byte_after_cr;
5541           byte_after_cr = -1;
5542         }
5543       else
5544         {
5545           ONE_MORE_BYTE (c);
5546           if (eol_dos && c == '\r')
5547             ONE_MORE_BYTE (byte_after_cr);
5548         }
5549       if (c < 0)
5550         goto invalid_code;
5551       code = c;
5552
5553       val = AREF (valids, c);
5554       if (! INTEGERP (val) && ! CONSP (val))
5555         goto invalid_code;
5556       if (INTEGERP (val))
5557         {
5558           charset = CHARSET_FROM_ID (XFASTINT (val));
5559           dim = CHARSET_DIMENSION (charset);
5560           while (len < dim)
5561             {
5562               ONE_MORE_BYTE (c);
5563               code = (code << 8) | c;
5564               len++;
5565             }
5566           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5567                               charset, code, c);
5568         }
5569       else
5570         {
5571           /* VAL is a list of charset IDs.  It is assured that the
5572              list is sorted by charset dimensions (smaller one
5573              comes first).  */
5574           while (CONSP (val))
5575             {
5576               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5577               dim = CHARSET_DIMENSION (charset);
5578               while (len < dim)
5579                 {
5580                   ONE_MORE_BYTE (c);
5581                   code = (code << 8) | c;
5582                   len++;
5583                 }
5584               CODING_DECODE_CHAR (coding, src, src_base,
5585                                   src_end, charset, code, c);
5586               if (c >= 0)
5587                 break;
5588               val = XCDR (val);
5589             }
5590         }
5591       if (c < 0)
5592         goto invalid_code;
5593       if (charset->id != charset_ascii
5594           && last_id != charset->id)
5595         {
5596           if (last_id != charset_ascii)
5597             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5598           last_id = charset->id;
5599           last_offset = char_offset;
5600         }
5601
5602       *charbuf++ = c;
5603       char_offset++;
5604       continue;
5605
5606     invalid_code:
5607       src = src_base;
5608       consumed_chars = consumed_chars_base;
5609       ONE_MORE_BYTE (c);
5610       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5611       char_offset++;
5612     }
5613
5614  no_more_source:
5615   if (last_id != charset_ascii)
5616     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5617   coding->consumed_char += consumed_chars_base;
5618   coding->consumed = src_base - coding->source;
5619   coding->charbuf_used = charbuf - coding->charbuf;
5620 }
5621
5622 static bool
5623 encode_coding_charset (struct coding_system *coding)
5624 {
5625   bool multibytep = coding->dst_multibyte;
5626   int *charbuf = coding->charbuf;
5627   int *charbuf_end = charbuf + coding->charbuf_used;
5628   unsigned char *dst = coding->destination + coding->produced;
5629   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5630   int safe_room = MAX_MULTIBYTE_LENGTH;
5631   ptrdiff_t produced_chars = 0;
5632   Lisp_Object attrs, charset_list;
5633   bool ascii_compatible;
5634   int c;
5635
5636   CODING_GET_INFO (coding, attrs, charset_list);
5637   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5638
5639   while (charbuf < charbuf_end)
5640     {
5641       struct charset *charset;
5642       unsigned code;
5643
5644       ASSURE_DESTINATION (safe_room);
5645       c = *charbuf++;
5646       if (ascii_compatible && ASCII_CHAR_P (c))
5647         EMIT_ONE_ASCII_BYTE (c);
5648       else if (CHAR_BYTE8_P (c))
5649         {
5650           c = CHAR_TO_BYTE8 (c);
5651           EMIT_ONE_BYTE (c);
5652         }
5653       else
5654         {
5655           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5656                                &code, charset);
5657
5658           if (charset)
5659             {
5660               if (CHARSET_DIMENSION (charset) == 1)
5661                 EMIT_ONE_BYTE (code);
5662               else if (CHARSET_DIMENSION (charset) == 2)
5663                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5664               else if (CHARSET_DIMENSION (charset) == 3)
5665                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5666               else
5667                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5668                                  (code >> 8) & 0xFF, code & 0xFF);
5669             }
5670           else
5671             {
5672               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5673                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5674               else
5675                 c = coding->default_char;
5676               EMIT_ONE_BYTE (c);
5677             }
5678         }
5679     }
5680
5681   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5682   coding->produced_char += produced_chars;
5683   coding->produced = dst - coding->destination;
5684   return 0;
5685 }
5686
5687 \f
/*** 10. C library functions ***/
5689
5690 /* Setup coding context CODING from information about CODING_SYSTEM.
5691    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5692    CODING_SYSTEM is invalid, signal an error.  */
5693
5694 void
5695 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5696 {
5697   Lisp_Object attrs;
5698   Lisp_Object eol_type;
5699   Lisp_Object coding_type;
5700   Lisp_Object val;
5701
5702   if (NILP (coding_system))
5703     coding_system = Qundecided;
5704
5705   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5706
5707   attrs = CODING_ID_ATTRS (coding->id);
5708   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5709
5710   coding->mode = 0;
5711   if (VECTORP (eol_type))
5712     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5713                             | CODING_REQUIRE_DETECTION_MASK);
5714   else if (! EQ (eol_type, Qunix))
5715     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5716                             | CODING_REQUIRE_ENCODING_MASK);
5717   else
5718     coding->common_flags = 0;
5719   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5720     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5721   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5722     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5723   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5724     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5725
5726   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5727   coding->max_charset_id = SCHARS (val) - 1;
5728   coding->safe_charsets = SDATA (val);
5729   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5730   coding->carryover_bytes = 0;
5731   coding->raw_destination = 0;
5732
5733   coding_type = CODING_ATTR_TYPE (attrs);
5734   if (EQ (coding_type, Qundecided))
5735     {
5736       coding->detector = NULL;
5737       coding->decoder = decode_coding_raw_text;
5738       coding->encoder = encode_coding_raw_text;
5739       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5740       coding->spec.undecided.inhibit_nbd
5741         = (encode_inhibit_flag
5742            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5743       coding->spec.undecided.inhibit_ied
5744         = (encode_inhibit_flag
5745            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5746       coding->spec.undecided.prefer_utf_8
5747         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5748     }
5749   else if (EQ (coding_type, Qiso_2022))
5750     {
5751       int i;
5752       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5753
5754       /* Invoke graphic register 0 to plane 0.  */
5755       CODING_ISO_INVOCATION (coding, 0) = 0;
5756       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5757       CODING_ISO_INVOCATION (coding, 1)
5758         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5759       /* Setup the initial status of designation.  */
5760       for (i = 0; i < 4; i++)
5761         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5762       /* Not single shifting initially.  */
5763       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5764       /* Beginning of buffer should also be regarded as bol. */
5765       CODING_ISO_BOL (coding) = 1;
5766       coding->detector = detect_coding_iso_2022;
5767       coding->decoder = decode_coding_iso_2022;
5768       coding->encoder = encode_coding_iso_2022;
5769       if (flags & CODING_ISO_FLAG_SAFE)
5770         coding->mode |= CODING_MODE_SAFE_ENCODING;
5771       coding->common_flags
5772         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5773             | CODING_REQUIRE_FLUSHING_MASK);
5774       if (flags & CODING_ISO_FLAG_COMPOSITION)
5775         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5776       if (flags & CODING_ISO_FLAG_DESIGNATION)
5777         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5778       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5779         {
5780           setup_iso_safe_charsets (attrs);
5781           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5782           coding->max_charset_id = SCHARS (val) - 1;
5783           coding->safe_charsets = SDATA (val);
5784         }
5785       CODING_ISO_FLAGS (coding) = flags;
5786       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5787       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5788       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5789       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5790     }
5791   else if (EQ (coding_type, Qcharset))
5792     {
5793       coding->detector = detect_coding_charset;
5794       coding->decoder = decode_coding_charset;
5795       coding->encoder = encode_coding_charset;
5796       coding->common_flags
5797         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5798     }
5799   else if (EQ (coding_type, Qutf_8))
5800     {
5801       val = AREF (attrs, coding_attr_utf_bom);
5802       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5803                                    : EQ (val, Qt) ? utf_with_bom
5804                                    : utf_without_bom);
5805       coding->detector = detect_coding_utf_8;
5806       coding->decoder = decode_coding_utf_8;
5807       coding->encoder = encode_coding_utf_8;
5808       coding->common_flags
5809         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5810       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5811         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5812     }
5813   else if (EQ (coding_type, Qutf_16))
5814     {
5815       val = AREF (attrs, coding_attr_utf_bom);
5816       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5817                                     : EQ (val, Qt) ? utf_with_bom
5818                                     : utf_without_bom);
5819       val = AREF (attrs, coding_attr_utf_16_endian);
5820       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5821                                        : utf_16_little_endian);
5822       CODING_UTF_16_SURROGATE (coding) = 0;
5823       coding->detector = detect_coding_utf_16;
5824       coding->decoder = decode_coding_utf_16;
5825       coding->encoder = encode_coding_utf_16;
5826       coding->common_flags
5827         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5828       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5829         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5830     }
5831   else if (EQ (coding_type, Qccl))
5832     {
5833       coding->detector = detect_coding_ccl;
5834       coding->decoder = decode_coding_ccl;
5835       coding->encoder = encode_coding_ccl;
5836       coding->common_flags
5837         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5838             | CODING_REQUIRE_FLUSHING_MASK);
5839     }
5840   else if (EQ (coding_type, Qemacs_mule))
5841     {
5842       coding->detector = detect_coding_emacs_mule;
5843       coding->decoder = decode_coding_emacs_mule;
5844       coding->encoder = encode_coding_emacs_mule;
5845       coding->common_flags
5846         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5847       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5848           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5849         {
5850           Lisp_Object tail, safe_charsets;
5851           int max_charset_id = 0;
5852
5853           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5854                tail = XCDR (tail))
5855             if (max_charset_id < XFASTINT (XCAR (tail)))
5856               max_charset_id = XFASTINT (XCAR (tail));
5857           safe_charsets = make_uninit_string (max_charset_id + 1);
5858           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5859           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5860                tail = XCDR (tail))
5861             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5862           coding->max_charset_id = max_charset_id;
5863           coding->safe_charsets = SDATA (safe_charsets);
5864         }
5865       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5866       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5867     }
5868   else if (EQ (coding_type, Qshift_jis))
5869     {
5870       coding->detector = detect_coding_sjis;
5871       coding->decoder = decode_coding_sjis;
5872       coding->encoder = encode_coding_sjis;
5873       coding->common_flags
5874         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5875     }
5876   else if (EQ (coding_type, Qbig5))
5877     {
5878       coding->detector = detect_coding_big5;
5879       coding->decoder = decode_coding_big5;
5880       coding->encoder = encode_coding_big5;
5881       coding->common_flags
5882         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5883     }
5884   else                          /* EQ (coding_type, Qraw_text) */
5885     {
5886       coding->detector = NULL;
5887       coding->decoder = decode_coding_raw_text;
5888       coding->encoder = encode_coding_raw_text;
5889       if (! EQ (eol_type, Qunix))
5890         {
5891           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5892           if (! VECTORP (eol_type))
5893             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5894         }
5895
5896     }
5897
5898   return;
5899 }
5900
5901 /* Return a list of charsets supported by CODING.  */
5902
5903 Lisp_Object
5904 coding_charset_list (struct coding_system *coding)
5905 {
5906   Lisp_Object attrs, charset_list;
5907
5908   CODING_GET_INFO (coding, attrs, charset_list);
5909   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5910     {
5911       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5912
5913       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5914         charset_list = Viso_2022_charset_list;
5915     }
5916   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5917     {
5918       charset_list = Vemacs_mule_charset_list;
5919     }
5920   return charset_list;
5921 }
5922
5923
5924 /* Return a list of charsets supported by CODING-SYSTEM.  */
5925
5926 Lisp_Object
5927 coding_system_charset_list (Lisp_Object coding_system)
5928 {
5929   ptrdiff_t id;
5930   Lisp_Object attrs, charset_list;
5931
5932   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5933   attrs = CODING_ID_ATTRS (id);
5934
5935   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5936     {
5937       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5938
5939       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5940         charset_list = Viso_2022_charset_list;
5941       else
5942         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5943     }
5944   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5945     {
5946       charset_list = Vemacs_mule_charset_list;
5947     }
5948   else
5949     {
5950       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5951     }
5952   return charset_list;
5953 }
5954
5955
5956 /* Return raw-text or one of its subsidiaries that has the same
5957    eol_type as CODING-SYSTEM.  */
5958
5959 Lisp_Object
5960 raw_text_coding_system (Lisp_Object coding_system)
5961 {
5962   Lisp_Object spec, attrs;
5963   Lisp_Object eol_type, raw_text_eol_type;
5964
5965   if (NILP (coding_system))
5966     return Qraw_text;
5967   spec = CODING_SYSTEM_SPEC (coding_system);
5968   attrs = AREF (spec, 0);
5969
5970   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5971     return coding_system;
5972
5973   eol_type = AREF (spec, 2);
5974   if (VECTORP (eol_type))
5975     return Qraw_text;
5976   spec = CODING_SYSTEM_SPEC (Qraw_text);
5977   raw_text_eol_type = AREF (spec, 2);
5978   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5979           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5980           : AREF (raw_text_eol_type, 2));
5981 }
5982
5983 /* Return true if CODING corresponds to raw-text coding-system.  */
5984
5985 bool
5986 raw_text_coding_system_p (struct coding_system *coding)
5987 {
5988   return (coding->decoder == decode_coding_raw_text
5989           && coding->encoder == encode_coding_raw_text) ? true : false;
5990 }
5991
5992
5993 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5994    the subsidiary that has the same eol-spec as PARENT (if it is not
5995    nil and specifies end-of-line format) or the system's setting
5996    (system_eol_type).  */
5997
5998 Lisp_Object
5999 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6000 {
6001   Lisp_Object spec, eol_type;
6002
6003   if (NILP (coding_system))
6004     coding_system = Qraw_text;
6005   else
6006     CHECK_CODING_SYSTEM (coding_system);
6007   spec = CODING_SYSTEM_SPEC (coding_system);
6008   eol_type = AREF (spec, 2);
6009   if (VECTORP (eol_type))
6010     {
6011       Lisp_Object parent_eol_type;
6012
6013       if (! NILP (parent))
6014         {
6015           Lisp_Object parent_spec;
6016
6017           CHECK_CODING_SYSTEM (parent);
6018           parent_spec = CODING_SYSTEM_SPEC (parent);
6019           parent_eol_type = AREF (parent_spec, 2);
6020           if (VECTORP (parent_eol_type))
6021             parent_eol_type = system_eol_type;
6022         }
6023       else
6024         parent_eol_type = system_eol_type;
6025       if (EQ (parent_eol_type, Qunix))
6026         coding_system = AREF (eol_type, 0);
6027       else if (EQ (parent_eol_type, Qdos))
6028         coding_system = AREF (eol_type, 1);
6029       else if (EQ (parent_eol_type, Qmac))
6030         coding_system = AREF (eol_type, 2);
6031     }
6032   return coding_system;
6033 }
6034
6035
6036 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6037    decided for writing to a process.  If not, complement them, and
6038    return a new coding system.  */
6039
6040 Lisp_Object
6041 complement_process_encoding_system (Lisp_Object coding_system)
6042 {
6043   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6044   Lisp_Object spec, attrs;
6045   int i;
6046
6047   for (i = 0; i < 3; i++)
6048     {
6049       if (i == 1)
6050         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6051       else if (i == 2)
6052         coding_system = preferred_coding_system ();
6053       spec = CODING_SYSTEM_SPEC (coding_system);
6054       if (NILP (spec))
6055         continue;
6056       attrs = AREF (spec, 0);
6057       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6058         coding_base = CODING_ATTR_BASE_NAME (attrs);
6059       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6060         eol_base = coding_system;
6061       if (! NILP (coding_base) && ! NILP (eol_base))
6062         break;
6063     }
6064
6065   if (i > 0)
6066     /* The original CODING_SYSTEM didn't specify text-conversion or
6067        eol-conversion.  Be sure that we return a fully complemented
6068        coding system.  */
6069     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6070   return coding_system;
6071 }
6072
6073
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it is impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So coding systems are
   first grouped into the following categories:
6079
6080    o coding-category-emacs-mule
6081
6082         The category for a coding system which has the same code range
6083         as Emacs' internal format.  Assigned the coding-system (Lisp
6084         symbol) `emacs-mule' by default.
6085
6086    o coding-category-sjis
6087
6088         The category for a coding system which has the same code range
6089         as SJIS.  Assigned the coding-system (Lisp
6090         symbol) `japanese-shift-jis' by default.
6091
6092    o coding-category-iso-7
6093
6094         The category for a coding system which has the same code range
6095         as ISO2022 of 7-bit environment.  This doesn't use any locking
6096         shift and single shift functions.  This can encode/decode all
6097         charsets.  Assigned the coding-system (Lisp symbol)
6098         `iso-2022-7bit' by default.
6099
6100    o coding-category-iso-7-tight
6101
6102         Same as coding-category-iso-7 except that this can
6103         encode/decode only the specified charsets.
6104
6105    o coding-category-iso-8-1
6106
6107         The category for a coding system which has the same code range
6108         as ISO2022 of 8-bit environment and graphic plane 1 used only
6109         for DIMENSION1 charset.  This doesn't use any locking shift
6110         and single shift functions.  Assigned the coding-system (Lisp
6111         symbol) `iso-latin-1' by default.
6112
6113    o coding-category-iso-8-2
6114
6115         The category for a coding system which has the same code range
6116         as ISO2022 of 8-bit environment and graphic plane 1 used only
6117         for DIMENSION2 charset.  This doesn't use any locking shift
6118         and single shift functions.  Assigned the coding-system (Lisp
6119         symbol) `japanese-iso-8bit' by default.
6120
6121    o coding-category-iso-7-else
6122
6123         The category for a coding system which has the same code range
6124         as ISO2022 of 7-bit environment but uses locking shift or
6125         single shift functions.  Assigned the coding-system (Lisp
6126         symbol) `iso-2022-7bit-lock' by default.
6127
6128    o coding-category-iso-8-else
6129
6130         The category for a coding system which has the same code range
6131         as ISO2022 of 8-bit environment but uses locking shift or
6132         single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `iso-2022-8bit-ss2' by default.
6134
6135    o coding-category-big5
6136
6137         The category for a coding system which has the same code range
6138         as BIG5.  Assigned the coding-system (Lisp symbol)
6139         `cn-big5' by default.
6140
6141    o coding-category-utf-8
6142
6143         The category for a coding system which has the same code range
6144         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6145         symbol) `utf-8' by default.
6146
6147    o coding-category-utf-16-be
6148
6149         The category for a coding system in which a text has an
6150         Unicode signature (cf. Unicode Standard) in the order of BIG
6151         endian at the head.  Assigned the coding-system (Lisp symbol)
6152         `utf-16-be' by default.
6153
6154    o coding-category-utf-16-le
6155
6156         The category for a coding system in which a text has an
6157         Unicode signature (cf. Unicode Standard) in the order of
6158         LITTLE endian at the head.  Assigned the coding-system (Lisp
6159         symbol) `utf-16-le' by default.
6160
6161    o coding-category-ccl
6162
6163         The category for a coding system of which encoder/decoder is
6164         written in CCL programs.  The default value is nil, i.e., no
6165         coding system is assigned.
6166
6167    o coding-category-binary
6168
6169         The category for a coding system not categorized in any of the
6170         above.  Assigned the coding-system (Lisp symbol)
6171         `no-conversion' by default.
6172
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by a user.  What
   Emacs actually does is detect the category of a coding system and
   then use the `coding-system' assigned to that category.  If Emacs
   cannot narrow the choice down to a single category, it selects the
   candidate category of the highest priority.  Priorities of
   categories are also specified by a user in the Lisp variable
   `coding-category-list'.
6180
6181 */
6182
6183 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6184                                            int eol_seen);
6185
6186
6187 /* Return the number of ASCII characters at the head of the source.
6188    By side effects, set coding->head_ascii and update
6189    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6190    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6191    reliable only when all the source bytes are ASCII.  */
6192
6193 static ptrdiff_t
6194 check_ascii (struct coding_system *coding)
6195 {
6196   const unsigned char *src, *end;
6197   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6198   int eol_seen = coding->eol_seen;
6199
6200   coding_set_source (coding);
6201   src = coding->source;
6202   end = src + coding->src_bytes;
6203
6204   if (inhibit_eol_conversion
6205       || SYMBOLP (eol_type))
6206     {
6207       /* We don't have to check EOL format.  */
6208       while (src < end && !( *src & 0x80))
6209         {
6210           if (*src++ == '\n')
6211             eol_seen |= EOL_SEEN_LF;
6212         }
6213     }
6214   else
6215     {
6216       end--;                /* We look ahead one byte for "CR LF".  */
6217       while (src < end)
6218         {
6219           int c = *src;
6220
6221           if (c & 0x80)
6222             break;
6223           src++;
6224           if (c == '\r')
6225             {
6226               if (*src == '\n')
6227                 {
6228                   eol_seen |= EOL_SEEN_CRLF;
6229                   src++;
6230                 }
6231               else
6232                 eol_seen |= EOL_SEEN_CR;
6233             }
6234           else if (c == '\n')
6235             eol_seen |= EOL_SEEN_LF;
6236         }
6237       if (src == end)
6238         {
6239           int c = *src;
6240
6241           /* All bytes but the last one C are ASCII.  */
6242           if (! (c & 0x80))
6243             {
6244               if (c == '\r')
6245                 eol_seen |= EOL_SEEN_CR;
6246               else if (c  == '\n')
6247                 eol_seen |= EOL_SEEN_LF;
6248               src++;
6249             }
6250         }
6251     }
6252   coding->head_ascii = src - coding->source;
6253   coding->eol_seen = eol_seen;
6254   return (coding->head_ascii);
6255 }
6256
6257
6258 /* Return the number of characters at the source if all the bytes are
6259    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6260    effects, update coding->eol_seen.  The value of coding->eol_seen is
6261    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6262    the value is reliable only when all the source bytes are valid
6263    UTF-8.  */
6264
6265 static ptrdiff_t
6266 check_utf_8 (struct coding_system *coding)
6267 {
6268   const unsigned char *src, *end;
6269   int eol_seen;
6270   ptrdiff_t nchars = coding->head_ascii;
6271
6272   if (coding->head_ascii < 0)
6273     check_ascii (coding);
6274   else
6275     coding_set_source (coding);
6276   src = coding->source + coding->head_ascii;
6277   /* We look ahead one byte for CR LF.  */
6278   end = coding->source + coding->src_bytes - 1;
6279   eol_seen = coding->eol_seen;
6280   while (src < end)
6281     {
6282       int c = *src;
6283
6284       if (UTF_8_1_OCTET_P (*src))
6285         {
6286           src++;
6287           if (c < 0x20)
6288             {
6289               if (c == '\r')
6290                 {
6291                   if (*src == '\n')
6292                     {
6293                       eol_seen |= EOL_SEEN_CRLF;
6294                       src++;
6295                       nchars++;
6296                     }
6297                   else
6298                     eol_seen |= EOL_SEEN_CR;
6299                 }
6300               else if (c == '\n')
6301                 eol_seen |= EOL_SEEN_LF;
6302             }
6303         }
6304       else if (UTF_8_2_OCTET_LEADING_P (c))
6305         {
6306           if (c < 0xC2          /* overlong sequence */
6307               || src + 1 >= end
6308               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6309             return -1;
6310           src += 2;
6311         }
6312       else if (UTF_8_3_OCTET_LEADING_P (c))
6313         {
6314           if (src + 2 >= end
6315               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6316                     && UTF_8_EXTRA_OCTET_P (src[2])))
6317             return -1;
6318           c = (((c & 0xF) << 12)
6319                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6320           if (c < 0x800                       /* overlong sequence */
6321               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6322             return -1;
6323           src += 3;
6324         }
6325       else if (UTF_8_4_OCTET_LEADING_P (c))
6326         {
6327           if (src + 3 >= end
6328               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6329                     && UTF_8_EXTRA_OCTET_P (src[2])
6330                     && UTF_8_EXTRA_OCTET_P (src[3])))
6331             return -1;
6332           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6333                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6334           if (c < 0x10000       /* overlong sequence */
6335               || c >= 0x110000) /* non-Unicode character  */
6336             return -1;
6337           src += 4;
6338         }
6339       else
6340         return -1;
6341       nchars++;
6342     }
6343
6344   if (src == end)
6345     {
6346       if (! UTF_8_1_OCTET_P (*src))
6347         return -1;
6348       nchars++;
6349       if (*src == '\r')
6350         eol_seen |= EOL_SEEN_CR;
6351       else if (*src  == '\n')
6352         eol_seen |= EOL_SEEN_LF;
6353     }
6354   coding->eol_seen = eol_seen;
6355   return nchars;
6356 }
6357
6358
6359 /* Return whether STRING is a valid UTF-8 string.  STRING must be a
6360    unibyte string.  */
6361
6362 bool
6363 utf8_string_p (Lisp_Object string)
6364 {
6365   eassert (!STRING_MULTIBYTE (string));
6366   struct coding_system coding;
6367   setup_coding_system (Qutf_8_unix, &coding);
6368   /* We initialize only the fields that check_utf_8 accesses.  */
6369   coding.head_ascii = -1;
6370   coding.src_pos = 0;
6371   coding.src_pos_byte = 0;
6372   coding.src_chars = SCHARS (string);
6373   coding.src_bytes = SBYTES (string);
6374   coding.src_object = string;
6375   coding.eol_seen = EOL_SEEN_NONE;
6376   return check_utf_8 (&coding) != -1;
6377 }
6378
6379
6380 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6381    SOURCE is encoded.  If CATEGORY is one of
6382    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6383    two-byte, else they are encoded by one-byte.
6384
6385    Return one of EOL_SEEN_XXX.  */
6386
6387 #define MAX_EOL_CHECK_COUNT 3
6388
6389 static int
6390 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6391             enum coding_category category)
6392 {
6393   const unsigned char *src = source, *src_end = src + src_bytes;
6394   unsigned char c;
6395   int total  = 0;
6396   int eol_seen = EOL_SEEN_NONE;
6397
6398   if ((1 << category) & CATEGORY_MASK_UTF_16)
6399     {
6400       bool msb = category == (coding_category_utf_16_le
6401                               | coding_category_utf_16_le_nosig);
6402       bool lsb = !msb;
6403
6404       while (src + 1 < src_end)
6405         {
6406           c = src[lsb];
6407           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6408             {
6409               int this_eol;
6410
6411               if (c == '\n')
6412                 this_eol = EOL_SEEN_LF;
6413               else if (src + 3 >= src_end
6414                        || src[msb + 2] != 0
6415                        || src[lsb + 2] != '\n')
6416                 this_eol = EOL_SEEN_CR;
6417               else
6418                 {
6419                   this_eol = EOL_SEEN_CRLF;
6420                   src += 2;
6421                 }
6422
6423               if (eol_seen == EOL_SEEN_NONE)
6424                 /* This is the first end-of-line.  */
6425                 eol_seen = this_eol;
6426               else if (eol_seen != this_eol)
6427                 {
6428                   /* The found type is different from what found before.
6429                      Allow for stray ^M characters in DOS EOL files.  */
6430                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6431                       || (eol_seen == EOL_SEEN_CRLF
6432                           && this_eol == EOL_SEEN_CR))
6433                     eol_seen = EOL_SEEN_CRLF;
6434                   else
6435                     {
6436                       eol_seen = EOL_SEEN_LF;
6437                       break;
6438                     }
6439                 }
6440               if (++total == MAX_EOL_CHECK_COUNT)
6441                 break;
6442             }
6443           src += 2;
6444         }
6445     }
6446   else
6447     while (src < src_end)
6448       {
6449         c = *src++;
6450         if (c == '\n' || c == '\r')
6451           {
6452             int this_eol;
6453
6454             if (c == '\n')
6455               this_eol = EOL_SEEN_LF;
6456             else if (src >= src_end || *src != '\n')
6457               this_eol = EOL_SEEN_CR;
6458             else
6459               this_eol = EOL_SEEN_CRLF, src++;
6460
6461             if (eol_seen == EOL_SEEN_NONE)
6462               /* This is the first end-of-line.  */
6463               eol_seen = this_eol;
6464             else if (eol_seen != this_eol)
6465               {
6466                 /* The found type is different from what found before.
6467                    Allow for stray ^M characters in DOS EOL files.  */
6468                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6469                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6470                   eol_seen = EOL_SEEN_CRLF;
6471                 else
6472                   {
6473                     eol_seen = EOL_SEEN_LF;
6474                     break;
6475                   }
6476               }
6477             if (++total == MAX_EOL_CHECK_COUNT)
6478               break;
6479           }
6480       }
6481   return eol_seen;
6482 }
6483
6484
6485 static Lisp_Object
6486 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6487 {
6488   Lisp_Object eol_type;
6489
6490   eol_type = CODING_ID_EOL_TYPE (coding->id);
6491   if (! VECTORP (eol_type))
6492     /* Already adjusted.  */
6493     return eol_type;
6494   if (eol_seen & EOL_SEEN_LF)
6495     {
6496       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6497       eol_type = Qunix;
6498     }
6499   else if (eol_seen & EOL_SEEN_CRLF)
6500     {
6501       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6502       eol_type = Qdos;
6503     }
6504   else if (eol_seen & EOL_SEEN_CR)
6505     {
6506       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6507       eol_type = Qmac;
6508     }
6509   return eol_type;
6510 }
6511
6512 /* Detect how a text specified in CODING is encoded.  If a coding
6513    system is detected, update fields of CODING by the detected coding
6514    system.  */
6515
6516 static void
6517 detect_coding (struct coding_system *coding)
6518 {
6519   const unsigned char *src, *src_end;
6520   unsigned int saved_mode = coding->mode;
6521   Lisp_Object found = Qnil;
6522   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6523
6524   coding->consumed = coding->consumed_char = 0;
6525   coding->produced = coding->produced_char = 0;
6526   coding_set_source (coding);
6527
6528   src_end = coding->source + coding->src_bytes;
6529
6530   coding->eol_seen = EOL_SEEN_NONE;
6531   /* If we have not yet decided the text encoding type, detect it
6532      now.  */
6533   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6534     {
6535       int c, i;
6536       struct coding_detection_info detect_info;
6537       bool null_byte_found = 0, eight_bit_found = 0;
6538       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6539                                        inhibit_null_byte_detection);
6540       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6541                                        inhibit_iso_escape_detection);
6542       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6543
6544       coding->head_ascii = 0;
6545       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6546       for (src = coding->source; src < src_end; src++)
6547         {
6548           c = *src;
6549           if (c & 0x80)
6550             {
6551               eight_bit_found = 1;
6552               if (null_byte_found)
6553                 break;
6554             }
6555           else if (c < 0x20)
6556             {
6557               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6558                   && ! inhibit_ied
6559                   && ! detect_info.checked)
6560                 {
6561                   if (detect_coding_iso_2022 (coding, &detect_info))
6562                     {
6563                       /* We have scanned the whole data.  */
6564                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6565                         {
6566                           /* We didn't find an 8-bit code.  We may
6567                              have found a null-byte, but it's very
6568                              rare that a binary file conforms to
6569                              ISO-2022.  */
6570                           src = src_end;
6571                           coding->head_ascii = src - coding->source;
6572                         }
6573                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6574                       break;
6575                     }
6576                 }
6577               else if (! c && !inhibit_nbd)
6578                 {
6579                   null_byte_found = 1;
6580                   if (eight_bit_found)
6581                     break;
6582                 }
6583               else if (! disable_ascii_optimization
6584                        && ! inhibit_eol_conversion)
6585                 {
6586                   if (c == '\r')
6587                     {
6588                       if (src < src_end && src[1] == '\n')
6589                         {
6590                           coding->eol_seen |= EOL_SEEN_CRLF;
6591                           src++;
6592                           if (! eight_bit_found)
6593                             coding->head_ascii++;
6594                         }
6595                       else
6596                         coding->eol_seen |= EOL_SEEN_CR;
6597                     }
6598                   else if (c == '\n')
6599                     {
6600                       coding->eol_seen |= EOL_SEEN_LF;
6601                     }
6602                 }
6603
6604               if (! eight_bit_found)
6605                 coding->head_ascii++;
6606             }
6607           else if (! eight_bit_found)
6608             coding->head_ascii++;
6609         }
6610
6611       if (null_byte_found || eight_bit_found
6612           || coding->head_ascii < coding->src_bytes
6613           || detect_info.found)
6614         {
6615           enum coding_category category;
6616           struct coding_system *this;
6617
6618           if (coding->head_ascii == coding->src_bytes)
6619             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6620             for (i = 0; i < coding_category_raw_text; i++)
6621               {
6622                 category = coding_priorities[i];
6623                 this = coding_categories + category;
6624                 if (detect_info.found & (1 << category))
6625                   break;
6626               }
6627           else
6628             {
6629               if (null_byte_found)
6630                 {
6631                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6632                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6633                 }
6634               else if (prefer_utf_8
6635                        && detect_coding_utf_8 (coding, &detect_info))
6636                 {
6637                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6638                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6639                 }
6640               for (i = 0; i < coding_category_raw_text; i++)
6641                 {
6642                   category = coding_priorities[i];
6643                   this = coding_categories + category;
6644                   /* Some of this->detector (e.g. detect_coding_sjis)
6645                      require this information.  */
6646                   coding->id = this->id;
6647                   if (this->id < 0)
6648                     {
6649                       /* No coding system of this category is defined.  */
6650                       detect_info.rejected |= (1 << category);
6651                     }
6652                   else if (category >= coding_category_raw_text)
6653                     continue;
6654                   else if (detect_info.checked & (1 << category))
6655                     {
6656                       if (detect_info.found & (1 << category))
6657                         break;
6658                     }
6659                   else if ((*(this->detector)) (coding, &detect_info)
6660                            && detect_info.found & (1 << category))
6661                     break;
6662                 }
6663             }
6664
6665           if (i < coding_category_raw_text)
6666             {
6667               if (category == coding_category_utf_8_auto)
6668                 {
6669                   Lisp_Object coding_systems;
6670
6671                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6672                                          coding_attr_utf_bom);
6673                   if (CONSP (coding_systems))
6674                     {
6675                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6676                         found = XCAR (coding_systems);
6677                       else
6678                         found = XCDR (coding_systems);
6679                     }
6680                   else
6681                     found = CODING_ID_NAME (this->id);
6682                 }
6683               else if (category == coding_category_utf_16_auto)
6684                 {
6685                   Lisp_Object coding_systems;
6686
6687                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6688                                          coding_attr_utf_bom);
6689                   if (CONSP (coding_systems))
6690                     {
6691                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6692                         found = XCAR (coding_systems);
6693                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6694                         found = XCDR (coding_systems);
6695                     }
6696                   else
6697                     found = CODING_ID_NAME (this->id);
6698                 }
6699               else
6700                 found = CODING_ID_NAME (this->id);
6701             }
6702           else if (null_byte_found)
6703             found = Qno_conversion;
6704           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6705                    == CATEGORY_MASK_ANY)
6706             found = Qraw_text;
6707           else if (detect_info.rejected)
6708             for (i = 0; i < coding_category_raw_text; i++)
6709               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6710                 {
6711                   this = coding_categories + coding_priorities[i];
6712                   found = CODING_ID_NAME (this->id);
6713                   break;
6714                 }
6715         }
6716     }
6717   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6718            == coding_category_utf_8_auto)
6719     {
6720       Lisp_Object coding_systems;
6721       struct coding_detection_info detect_info;
6722
6723       coding_systems
6724         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6725       detect_info.found = detect_info.rejected = 0;
6726       if (check_ascii (coding) == coding->src_bytes)
6727         {
6728           if (CONSP (coding_systems))
6729             found = XCDR (coding_systems);
6730         }
6731       else
6732         {
6733           if (CONSP (coding_systems)
6734               && detect_coding_utf_8 (coding, &detect_info))
6735             {
6736               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6737                 found = XCAR (coding_systems);
6738               else
6739                 found = XCDR (coding_systems);
6740             }
6741         }
6742     }
6743   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6744            == coding_category_utf_16_auto)
6745     {
6746       Lisp_Object coding_systems;
6747       struct coding_detection_info detect_info;
6748
6749       coding_systems
6750         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6751       detect_info.found = detect_info.rejected = 0;
6752       coding->head_ascii = 0;
6753       if (CONSP (coding_systems)
6754           && detect_coding_utf_16 (coding, &detect_info))
6755         {
6756           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6757             found = XCAR (coding_systems);
6758           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6759             found = XCDR (coding_systems);
6760         }
6761     }
6762
6763   if (! NILP (found))
6764     {
6765       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6766                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6767                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6768                            : EOL_SEEN_LF);
6769
6770       setup_coding_system (found, coding);
6771       if (specified_eol != EOL_SEEN_NONE)
6772         adjust_coding_eol_type (coding, specified_eol);
6773     }
6774
6775   coding->mode = saved_mode;
6776 }
6777
6778
6779 static void
6780 decode_eol (struct coding_system *coding)
6781 {
6782   Lisp_Object eol_type;
6783   unsigned char *p, *pbeg, *pend;
6784
6785   eol_type = CODING_ID_EOL_TYPE (coding->id);
6786   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6787     return;
6788
6789   if (NILP (coding->dst_object))
6790     pbeg = coding->destination;
6791   else
6792     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6793   pend = pbeg + coding->produced;
6794
6795   if (VECTORP (eol_type))
6796     {
6797       int eol_seen = EOL_SEEN_NONE;
6798
6799       for (p = pbeg; p < pend; p++)
6800         {
6801           if (*p == '\n')
6802             eol_seen |= EOL_SEEN_LF;
6803           else if (*p == '\r')
6804             {
6805               if (p + 1 < pend && *(p + 1) == '\n')
6806                 {
6807                   eol_seen |= EOL_SEEN_CRLF;
6808                   p++;
6809                 }
6810               else
6811                 eol_seen |= EOL_SEEN_CR;
6812             }
6813         }
6814       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6815       if ((eol_seen & EOL_SEEN_CRLF) != 0
6816           && (eol_seen & EOL_SEEN_CR) != 0
6817           && (eol_seen & EOL_SEEN_LF) == 0)
6818         eol_seen = EOL_SEEN_CRLF;
6819       else if (eol_seen != EOL_SEEN_NONE
6820           && eol_seen != EOL_SEEN_LF
6821           && eol_seen != EOL_SEEN_CRLF
6822           && eol_seen != EOL_SEEN_CR)
6823         eol_seen = EOL_SEEN_LF;
6824       if (eol_seen != EOL_SEEN_NONE)
6825         eol_type = adjust_coding_eol_type (coding, eol_seen);
6826     }
6827
6828   if (EQ (eol_type, Qmac))
6829     {
6830       for (p = pbeg; p < pend; p++)
6831         if (*p == '\r')
6832           *p = '\n';
6833     }
6834   else if (EQ (eol_type, Qdos))
6835     {
6836       ptrdiff_t n = 0;
6837       ptrdiff_t pos = coding->dst_pos;
6838       ptrdiff_t pos_byte = coding->dst_pos_byte;
6839       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6840
6841       /* This assertion is here instead of code, now deleted, that
6842          handled the NILP case, which no longer happens with the
6843          current codebase.  */
6844       eassert (!NILP (coding->dst_object));
6845
6846       while (pos_byte < pos_end)
6847         {
6848           int incr;
6849
6850           p = BYTE_POS_ADDR (pos_byte);
6851           if (coding->dst_multibyte)
6852             incr = BYTES_BY_CHAR_HEAD (*p);
6853           else
6854             incr = 1;
6855
6856           if (*p == '\r' && p[1] == '\n')
6857             {
6858               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6859               n++;
6860               pos_end--;
6861             }
6862           pos++;
6863           pos_byte += incr;
6864         }
6865       coding->produced -= n;
6866       coding->produced_char -= n;
6867     }
6868 }
6869
6870
6871 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6872    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6873    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6874 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6875
6876 /* Return a translation table (or list of them) from coding system
6877    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6878    not ENCODEP). */
6879
6880 static Lisp_Object
6881 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6882 {
6883   Lisp_Object standard, translation_table;
6884   Lisp_Object val;
6885
6886   if (NILP (Venable_character_translation))
6887     {
6888       if (max_lookup)
6889         *max_lookup = 0;
6890       return Qnil;
6891     }
6892   if (encodep)
6893     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6894       standard = Vstandard_translation_table_for_encode;
6895   else
6896     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6897       standard = Vstandard_translation_table_for_decode;
6898   if (NILP (translation_table))
6899     translation_table = standard;
6900   else
6901     {
6902       if (SYMBOLP (translation_table))
6903         translation_table = Fget (translation_table, Qtranslation_table);
6904       else if (CONSP (translation_table))
6905         {
6906           translation_table = Fcopy_sequence (translation_table);
6907           for (val = translation_table; CONSP (val); val = XCDR (val))
6908             if (SYMBOLP (XCAR (val)))
6909               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6910         }
6911       if (CHAR_TABLE_P (standard))
6912         {
6913           if (CONSP (translation_table))
6914             translation_table = nconc2 (translation_table, list1 (standard));
6915           else
6916             translation_table = list2 (translation_table, standard);
6917         }
6918     }
6919
6920   if (max_lookup)
6921     {
6922       *max_lookup = 1;
6923       if (CHAR_TABLE_P (translation_table)
6924           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6925         {
6926           val = XCHAR_TABLE (translation_table)->extras[1];
6927           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6928             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6929         }
6930       else if (CONSP (translation_table))
6931         {
6932           Lisp_Object tail;
6933
6934           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6935             if (CHAR_TABLE_P (XCAR (tail))
6936                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6937               {
6938                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6939                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6940                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6941               }
6942         }
6943     }
6944   return translation_table;
6945 }
6946
/* Look up character C in TABLE, which is a char-table or a list
   whose char-table elements are consulted in order.  Whenever a
   table maps C to a character, C is replaced by that character and
   TRANS is reset to Qnil; in a list the search then continues with
   the next table.  When a table in a list yields a non-nil,
   non-character value, the search stops with that value in TRANS.
   C and TRANS must be lvalues; TABLE is evaluated more than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object lookup_tail_ = table;                       \
                                                                \
        while (CONSP (lookup_tail_))                            \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (lookup_tail_)))             \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (lookup_tail_), c); \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            lookup_tail_ = XCDR (lookup_tail_);                 \
          }                                                     \
      }                                                         \
  } while (0)
6971
6972
6973 /* Return a translation of character(s) at BUF according to TRANS.
6974    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
6975    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
6976    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
6977    found, or Qt if BUF is too short to lookup characters in FROM.  As
6978    a side effect, if a translation is found, *NCHARS is set to the
6979    number of characters being translated.  */
6980
6981 static Lisp_Object
6982 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6983 {
6984   if (INTEGERP (trans) || VECTORP (trans))
6985     {
6986       *nchars = 1;
6987       return trans;
6988     }
6989   for (; CONSP (trans); trans = XCDR (trans))
6990     {
6991       Lisp_Object val = XCAR (trans);
6992       Lisp_Object from = XCAR (val);
6993       ptrdiff_t len = ASIZE (from);
6994       ptrdiff_t i;
6995
6996       for (i = 0; i < len; i++)
6997         {
6998           if (buf + i == buf_end)
6999             return Qt;
7000           if (XINT (AREF (from, i)) != buf[i])
7001             break;
7002         }
7003       if (i == len)
7004         {
7005           *nchars = len;
7006           return XCDR (val);
7007         }
7008     }
7009   return Qnil;
7010 }
7011
7012
7013 static int
7014 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7015                bool last_block)
7016 {
7017   unsigned char *dst = coding->destination + coding->produced;
7018   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7019   ptrdiff_t produced;
7020   ptrdiff_t produced_chars = 0;
7021   int carryover = 0;
7022
7023   if (! coding->chars_at_source)
7024     {
7025       /* Source characters are in coding->charbuf.  */
7026       int *buf = coding->charbuf;
7027       int *buf_end = buf + coding->charbuf_used;
7028
7029       if (EQ (coding->src_object, coding->dst_object)
7030           && ! NILP (coding->dst_object))
7031         {
7032           eassert (growable_destination (coding));
7033           coding_set_source (coding);
7034           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7035         }
7036
7037       while (buf < buf_end)
7038         {
7039           int c = *buf;
7040           ptrdiff_t i;
7041
7042           if (c >= 0)
7043             {
7044               ptrdiff_t from_nchars = 1, to_nchars = 1;
7045               Lisp_Object trans = Qnil;
7046
7047               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7048               if (! NILP (trans))
7049                 {
7050                   trans = get_translation (trans, buf, buf_end, &from_nchars);
7051                   if (INTEGERP (trans))
7052                     c = XINT (trans);
7053                   else if (VECTORP (trans))
7054                     {
7055                       to_nchars = ASIZE (trans);
7056                       c = XINT (AREF (trans, 0));
7057                     }
7058                   else if (EQ (trans, Qt) && ! last_block)
7059                     break;
7060                 }
7061
7062               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7063                 {
7064                   eassert (growable_destination (coding));
7065                   ptrdiff_t dst_size;
7066                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7067                                           &dst_size)
7068                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7069                     memory_full (SIZE_MAX);
7070                   dst = alloc_destination (coding, dst_size, dst);
7071                   if (EQ (coding->src_object, coding->dst_object))
7072                     {
7073                       coding_set_source (coding);
7074                       dst_end = (((unsigned char *) coding->source)
7075                                  + coding->consumed);
7076                     }
7077                   else
7078                     dst_end = coding->destination + coding->dst_bytes;
7079                 }
7080
7081               for (i = 0; i < to_nchars; i++)
7082                 {
7083                   if (i > 0)
7084                     c = XINT (AREF (trans, i));
7085                   if (coding->dst_multibyte
7086                       || ! CHAR_BYTE8_P (c))
7087                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7088                   else
7089                     *dst++ = CHAR_TO_BYTE8 (c);
7090                 }
7091               produced_chars += to_nchars;
7092               buf += from_nchars;
7093             }
7094           else
7095             /* This is an annotation datum.  (-C) is the length.  */
7096             buf += -c;
7097         }
7098       carryover = buf_end - buf;
7099     }
7100   else
7101     {
7102       /* Source characters are at coding->source.  */
7103       const unsigned char *src = coding->source;
7104       const unsigned char *src_end = src + coding->consumed;
7105
7106       if (EQ (coding->dst_object, coding->src_object))
7107         {
7108           eassert (growable_destination (coding));
7109           dst_end = (unsigned char *) src;
7110         }
7111       if (coding->src_multibyte != coding->dst_multibyte)
7112         {
7113           if (coding->src_multibyte)
7114             {
7115               bool multibytep = 1;
7116               ptrdiff_t consumed_chars = 0;
7117
7118               while (1)
7119                 {
7120                   const unsigned char *src_base = src;
7121                   int c;
7122
7123                   ONE_MORE_BYTE (c);
7124                   if (dst == dst_end)
7125                     {
7126                       eassert (growable_destination (coding));
7127                       if (EQ (coding->src_object, coding->dst_object))
7128                         dst_end = (unsigned char *) src;
7129                       if (dst == dst_end)
7130                         {
7131                           ptrdiff_t offset = src - coding->source;
7132
7133                           dst = alloc_destination (coding, src_end - src + 1,
7134                                                    dst);
7135                           dst_end = coding->destination + coding->dst_bytes;
7136                           coding_set_source (coding);
7137                           src = coding->source + offset;
7138                           src_end = coding->source + coding->consumed;
7139                           if (EQ (coding->src_object, coding->dst_object))
7140                             dst_end = (unsigned char *) src;
7141                         }
7142                     }
7143                   *dst++ = c;
7144                   produced_chars++;
7145                 }
7146             no_more_source:
7147               ;
7148             }
7149           else
7150             while (src < src_end)
7151               {
7152                 bool multibytep = 1;
7153                 int c = *src++;
7154
7155                 if (dst >= dst_end - 1)
7156                   {
7157                     eassert (growable_destination (coding));
7158                     if (EQ (coding->src_object, coding->dst_object))
7159                       dst_end = (unsigned char *) src;
7160                     if (dst >= dst_end - 1)
7161                       {
7162                         ptrdiff_t offset = src - coding->source;
7163                         ptrdiff_t more_bytes;
7164
7165                         if (EQ (coding->src_object, coding->dst_object))
7166                           more_bytes = ((src_end - src) / 2) + 2;
7167                         else
7168                           more_bytes = src_end - src + 2;
7169                         dst = alloc_destination (coding, more_bytes, dst);
7170                         dst_end = coding->destination + coding->dst_bytes;
7171                         coding_set_source (coding);
7172                         src = coding->source + offset;
7173                         src_end = coding->source + coding->consumed;
7174                         if (EQ (coding->src_object, coding->dst_object))
7175                           dst_end = (unsigned char *) src;
7176                       }
7177                   }
7178                 EMIT_ONE_BYTE (c);
7179               }
7180         }
7181       else
7182         {
7183           if (!EQ (coding->src_object, coding->dst_object))
7184             {
7185               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7186
7187               if (require > 0)
7188                 {
7189                   ptrdiff_t offset = src - coding->source;
7190
7191                   dst = alloc_destination (coding, require, dst);
7192                   coding_set_source (coding);
7193                   src = coding->source + offset;
7194                   src_end = coding->source + coding->consumed;
7195                 }
7196             }
7197           produced_chars = coding->consumed_char;
7198           while (src < src_end)
7199             *dst++ = *src++;
7200         }
7201     }
7202
7203   produced = dst - (coding->destination + coding->produced);
7204   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7205     insert_from_gap (produced_chars, produced, 0);
7206   coding->produced += produced;
7207   coding->produced_char += produced_chars;
7208   return carryover;
7209 }
7210
7211 /* Compose text in CODING->object according to the annotation data at
7212    CHARBUF.  CHARBUF is an array:
7213      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7214  */
7215
7216 static void
7217 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7218 {
7219   int len;
7220   ptrdiff_t to;
7221   enum composition_method method;
7222   Lisp_Object components;
7223
7224   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7225   to = pos + charbuf[2];
7226   method = (enum composition_method) (charbuf[4]);
7227
7228   if (method == COMPOSITION_RELATIVE)
7229     components = Qnil;
7230   else
7231     {
7232       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7233       int i, j;
7234
7235       if (method == COMPOSITION_WITH_RULE)
7236         len = charbuf[2] * 3 - 2;
7237       charbuf += MAX_ANNOTATION_LENGTH;
7238       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7239       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7240         {
7241           if (charbuf[i] >= 0)
7242             args[j] = make_number (charbuf[i]);
7243           else
7244             {
7245               i++;
7246               args[j] = make_number (charbuf[i] % 0x100);
7247             }
7248         }
7249       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7250     }
7251   compose_text (pos, to, components, Qnil, coding->dst_object);
7252 }
7253
7254
7255 /* Put `charset' property on text in CODING->object according to
7256    the annotation data at CHARBUF.  CHARBUF is an array:
7257      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7258  */
7259
7260 static void
7261 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7262 {
7263   ptrdiff_t from = pos - charbuf[2];
7264   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7265
7266   Fput_text_property (make_number (from), make_number (pos),
7267                       Qcharset, CHARSET_NAME (charset),
7268                       coding->dst_object);
7269 }
7270
/* Upper bound, in int units, on the size of the work area that
   ALLOC_CONVERSION_WORK_AREA allocates for coding->charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING with SAFE_ALLOCA and
   record its size.  The area holds SIZE + MAX_CHARBUF_EXTRA_SIZE int
   units, capped at MAX_CHARBUF_SIZE.  Because SAFE_ALLOCA is used,
   the invoking function is expected to set up USE_SAFE_ALLOCA and to
   call SAFE_FREE, as the callers in this file do.

   CODING and SIZE are parenthesized in the expansion so that any
   pointer-valued or integer-valued expression may be passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7284
7285 static void
7286 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7287 {
7288   int *charbuf = coding->charbuf;
7289   int *charbuf_end = charbuf + coding->charbuf_used;
7290
7291   if (NILP (coding->dst_object))
7292     return;
7293
7294   while (charbuf < charbuf_end)
7295     {
7296       if (*charbuf >= 0)
7297         pos++, charbuf++;
7298       else
7299         {
7300           int len = -*charbuf;
7301
7302           if (len > 2)
7303             switch (charbuf[1])
7304               {
7305               case CODING_ANNOTATE_COMPOSITION_MASK:
7306                 produce_composition (coding, charbuf, pos);
7307                 break;
7308               case CODING_ANNOTATE_CHARSET_MASK:
7309                 produce_charset (coding, charbuf, pos);
7310                 break;
7311               default:
7312                 break;
7313               }
7314           charbuf += len;
7315         }
7316     }
7317 }
7318
7319 /* Decode the data at CODING->src_object into CODING->dst_object.
7320    CODING->src_object is a buffer, a string, or nil.
7321    CODING->dst_object is a buffer.
7322
7323    If CODING->src_object is a buffer, it must be the current buffer.
7324    In this case, if CODING->src_pos is positive, it is a position of
7325    the source text in the buffer, otherwise, the source text is in the
7326    gap area of the buffer, and CODING->src_pos specifies the offset of
7327    the text from GPT (which must be the same as PT).  If this is the
7328    same buffer as CODING->dst_object, CODING->src_pos must be
7329    negative.
7330
7331    If CODING->src_object is a string, CODING->src_pos is an index to
7332    that string.
7333
7334    If CODING->src_object is nil, CODING->source must already point to
7335    the non-relocatable memory area.  In this case, CODING->src_pos is
7336    an offset from CODING->source.
7337
7338    The decoded data is inserted at the current point of the buffer
7339    CODING->dst_object.
7340 */
7341
7342 static void
7343 decode_coding (struct coding_system *coding)
7344 {
7345   Lisp_Object attrs;
7346   Lisp_Object undo_list;
7347   Lisp_Object translation_table;
7348   struct ccl_spec cclspec;
7349   int carryover;
7350   int i;
7351
7352   USE_SAFE_ALLOCA;
7353
7354   if (BUFFERP (coding->src_object)
7355       && coding->src_pos > 0
7356       && coding->src_pos < GPT
7357       && coding->src_pos + coding->src_chars > GPT)
7358     move_gap_both (coding->src_pos, coding->src_pos_byte);
7359
7360   undo_list = Qt;
7361   if (BUFFERP (coding->dst_object))
7362     {
7363       set_buffer_internal (XBUFFER (coding->dst_object));
7364       if (GPT != PT)
7365         move_gap_both (PT, PT_BYTE);
7366
7367       /* We must disable undo_list in order to record the whole insert
7368          transaction via record_insert at the end.  But doing so also
7369          disables the recording of the first change to the undo_list.
7370          Therefore we check for first change here and record it via
7371          record_first_change if needed.  */
7372       if (MODIFF <= SAVE_MODIFF)
7373         record_first_change ();
7374
7375       undo_list = BVAR (current_buffer, undo_list);
7376       bset_undo_list (current_buffer, Qt);
7377     }
7378
7379   coding->consumed = coding->consumed_char = 0;
7380   coding->produced = coding->produced_char = 0;
7381   coding->chars_at_source = 0;
7382   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7383
7384   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7385
7386   attrs = CODING_ID_ATTRS (coding->id);
7387   translation_table = get_translation_table (attrs, 0, NULL);
7388
7389   carryover = 0;
7390   if (coding->decoder == decode_coding_ccl)
7391     {
7392       coding->spec.ccl = &cclspec;
7393       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7394     }
7395   do
7396     {
7397       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7398
7399       coding_set_source (coding);
7400       coding->annotated = 0;
7401       coding->charbuf_used = carryover;
7402       (*(coding->decoder)) (coding);
7403       coding_set_destination (coding);
7404       carryover = produce_chars (coding, translation_table, 0);
7405       if (coding->annotated)
7406         produce_annotation (coding, pos);
7407       for (i = 0; i < carryover; i++)
7408         coding->charbuf[i]
7409           = coding->charbuf[coding->charbuf_used - carryover + i];
7410     }
7411   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7412          || (coding->consumed < coding->src_bytes
7413              && (coding->result == CODING_RESULT_SUCCESS
7414                  || coding->result == CODING_RESULT_INVALID_SRC)));
7415
7416   if (carryover > 0)
7417     {
7418       coding_set_destination (coding);
7419       coding->charbuf_used = carryover;
7420       produce_chars (coding, translation_table, 1);
7421     }
7422
7423   coding->carryover_bytes = 0;
7424   if (coding->consumed < coding->src_bytes)
7425     {
7426       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7427       const unsigned char *src;
7428
7429       coding_set_source (coding);
7430       coding_set_destination (coding);
7431       src = coding->source + coding->consumed;
7432
7433       if (coding->mode & CODING_MODE_LAST_BLOCK)
7434         {
7435           /* Flush out unprocessed data as binary chars.  We are sure
7436              that the number of data is less than the size of
7437              coding->charbuf.  */
7438           coding->charbuf_used = 0;
7439           coding->chars_at_source = 0;
7440
7441           while (nbytes-- > 0)
7442             {
7443               int c;
7444
7445               /* Copy raw bytes in their 2-byte forms from multibyte
7446                  text as single characters.  */
7447               if (coding->src_multibyte
7448                   && CHAR_BYTE8_HEAD_P (*src) && nbytes > 0)
7449                 {
7450                   c = STRING_CHAR_ADVANCE (src);
7451                   nbytes--;
7452                 }
7453               else
7454                 {
7455                   c = *src++;
7456
7457                   if (c & 0x80)
7458                     c = BYTE8_TO_CHAR (c);
7459                 }
7460               coding->charbuf[coding->charbuf_used++] = c;
7461             }
7462           produce_chars (coding, Qnil, 1);
7463         }
7464       else
7465         {
7466           /* Record unprocessed bytes in coding->carryover.  We are
7467              sure that the number of data is less than the size of
7468              coding->carryover.  */
7469           unsigned char *p = coding->carryover;
7470
7471           if (nbytes > sizeof coding->carryover)
7472             nbytes = sizeof coding->carryover;
7473           coding->carryover_bytes = nbytes;
7474           while (nbytes-- > 0)
7475             *p++ = *src++;
7476         }
7477       coding->consumed = coding->src_bytes;
7478     }
7479
7480   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7481       && !inhibit_eol_conversion)
7482     decode_eol (coding);
7483   if (BUFFERP (coding->dst_object))
7484     {
7485       bset_undo_list (current_buffer, undo_list);
7486       record_insert (coding->dst_pos, coding->produced_char);
7487     }
7488
7489   SAFE_FREE ();
7490 }
7491
7492
7493 /* Extract an annotation datum from a composition starting at POS and
7494    ending before LIMIT of CODING->src_object (buffer or string), store
7495    the data in BUF, set *STOP to a starting position of the next
7496    composition (if any) or to LIMIT, and return the address of the
7497    next element of BUF.
7498
7499    If such an annotation is not found, set *STOP to a starting
7500    position of a composition after POS (if any) or to LIMIT, and
7501    return BUF.  */
7502
7503 static int *
7504 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7505                                struct coding_system *coding, int *buf,
7506                                ptrdiff_t *stop)
7507 {
7508   ptrdiff_t start, end;
7509   Lisp_Object prop;
7510
7511   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7512       || end > limit)
7513     *stop = limit;
7514   else if (start > pos)
7515     *stop = start;
7516   else
7517     {
7518       if (start == pos)
7519         {
7520           /* We found a composition.  Store the corresponding
7521              annotation data in BUF.  */
7522           int *head = buf;
7523           enum composition_method method = composition_method (prop);
7524           int nchars = COMPOSITION_LENGTH (prop);
7525
7526           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7527           if (method != COMPOSITION_RELATIVE)
7528             {
7529               Lisp_Object components;
7530               ptrdiff_t i, len, i_byte;
7531
7532               components = COMPOSITION_COMPONENTS (prop);
7533               if (VECTORP (components))
7534                 {
7535                   len = ASIZE (components);
7536                   for (i = 0; i < len; i++)
7537                     *buf++ = XINT (AREF (components, i));
7538                 }
7539               else if (STRINGP (components))
7540                 {
7541                   len = SCHARS (components);
7542                   i = i_byte = 0;
7543                   while (i < len)
7544                     {
7545                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7546                       buf++;
7547                     }
7548                 }
7549               else if (INTEGERP (components))
7550                 {
7551                   len = 1;
7552                   *buf++ = XINT (components);
7553                 }
7554               else if (CONSP (components))
7555                 {
7556                   for (len = 0; CONSP (components);
7557                        len++, components = XCDR (components))
7558                     *buf++ = XINT (XCAR (components));
7559                 }
7560               else
7561                 emacs_abort ();
7562               *head -= len;
7563             }
7564         }
7565
7566       if (find_composition (end, limit, &start, &end, &prop,
7567                             coding->src_object)
7568           && end <= limit)
7569         *stop = start;
7570       else
7571         *stop = limit;
7572     }
7573   return buf;
7574 }
7575
7576
7577 /* Extract an annotation datum from a text property `charset' at POS of
7578    CODING->src_object (buffer of string), store the data in BUF, set
7579    *STOP to the position where the value of `charset' property changes
7580    (limiting by LIMIT), and return the address of the next element of
7581    BUF.
7582
7583    If the property value is nil, set *STOP to the position where the
7584    property value is non-nil (limiting by LIMIT), and return BUF.  */
7585
7586 static int *
7587 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7588                            struct coding_system *coding, int *buf,
7589                            ptrdiff_t *stop)
7590 {
7591   Lisp_Object val, next;
7592   int id;
7593
7594   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7595   if (! NILP (val) && CHARSETP (val))
7596     id = XINT (CHARSET_SYMBOL_ID (val));
7597   else
7598     id = -1;
7599   ADD_CHARSET_DATA (buf, 0, id);
7600   next = Fnext_single_property_change (make_number (pos), Qcharset,
7601                                        coding->src_object,
7602                                        make_number (limit));
7603   *stop = XINT (next);
7604   return buf;
7605 }
7606
7607
7608 static void
7609 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7610                int max_lookup)
7611 {
7612   int *buf = coding->charbuf;
7613   int *buf_end = coding->charbuf + coding->charbuf_size;
7614   const unsigned char *src = coding->source + coding->consumed;
7615   const unsigned char *src_end = coding->source + coding->src_bytes;
7616   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7617   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7618   bool multibytep = coding->src_multibyte;
7619   Lisp_Object eol_type;
7620   int c;
7621   ptrdiff_t stop, stop_composition, stop_charset;
7622   int *lookup_buf = NULL;
7623
7624   if (! NILP (translation_table))
7625     lookup_buf = alloca (sizeof (int) * max_lookup);
7626
7627   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7628   if (VECTORP (eol_type))
7629     eol_type = Qunix;
7630
7631   /* Note: composition handling is not yet implemented.  */
7632   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7633
7634   if (NILP (coding->src_object))
7635     stop = stop_composition = stop_charset = end_pos;
7636   else
7637     {
7638       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7639         stop = stop_composition = pos;
7640       else
7641         stop = stop_composition = end_pos;
7642       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7643         stop = stop_charset = pos;
7644       else
7645         stop_charset = end_pos;
7646     }
7647
7648   /* Compensate for CRLF and conversion.  */
7649   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7650   while (buf < buf_end)
7651     {
7652       Lisp_Object trans;
7653
7654       if (pos == stop)
7655         {
7656           if (pos == end_pos)
7657             break;
7658           if (pos == stop_composition)
7659             buf = handle_composition_annotation (pos, end_pos, coding,
7660                                                  buf, &stop_composition);
7661           if (pos == stop_charset)
7662             buf = handle_charset_annotation (pos, end_pos, coding,
7663                                              buf, &stop_charset);
7664           stop = (stop_composition < stop_charset
7665                   ? stop_composition : stop_charset);
7666         }
7667
7668       if (! multibytep)
7669         {
7670           int bytes;
7671
7672           if (coding->encoder == encode_coding_raw_text
7673               || coding->encoder == encode_coding_ccl)
7674             c = *src++, pos++;
7675           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7676             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7677           else
7678             c = BYTE8_TO_CHAR (*src), src++, pos++;
7679         }
7680       else
7681         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7682       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7683         c = '\n';
7684       if (! EQ (eol_type, Qunix))
7685         {
7686           if (c == '\n')
7687             {
7688               if (EQ (eol_type, Qdos))
7689                 *buf++ = '\r';
7690               else
7691                 c = '\r';
7692             }
7693         }
7694
7695       trans = Qnil;
7696       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7697       if (NILP (trans))
7698         *buf++ = c;
7699       else
7700         {
7701           ptrdiff_t from_nchars = 1, to_nchars = 1;
7702           int *lookup_buf_end;
7703           const unsigned char *p = src;
7704           int i;
7705
7706           lookup_buf[0] = c;
7707           for (i = 1; i < max_lookup && p < src_end; i++)
7708             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7709           lookup_buf_end = lookup_buf + i;
7710           trans = get_translation (trans, lookup_buf, lookup_buf_end,
7711                                    &from_nchars);
7712           if (INTEGERP (trans))
7713             c = XINT (trans);
7714           else if (VECTORP (trans))
7715             {
7716               to_nchars = ASIZE (trans);
7717               if (buf_end - buf < to_nchars)
7718                 break;
7719               c = XINT (AREF (trans, 0));
7720             }
7721           else
7722             break;
7723           *buf++ = c;
7724           for (i = 1; i < to_nchars; i++)
7725             *buf++ = XINT (AREF (trans, i));
7726           for (i = 1; i < from_nchars; i++, pos++)
7727             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7728         }
7729     }
7730
7731   coding->consumed = src - coding->source;
7732   coding->consumed_char = pos - coding->src_pos;
7733   coding->charbuf_used = buf - coding->charbuf;
7734   coding->chars_at_source = 0;
7735 }
7736
7737
7738 /* Encode the text at CODING->src_object into CODING->dst_object.
7739    CODING->src_object is a buffer or a string.
7740    CODING->dst_object is a buffer or nil.
7741
7742    If CODING->src_object is a buffer, it must be the current buffer.
7743    In this case, if CODING->src_pos is positive, it is a position of
7744    the source text in the buffer, otherwise. the source text is in the
7745    gap area of the buffer, and coding->src_pos specifies the offset of
7746    the text from GPT (which must be the same as PT).  If this is the
7747    same buffer as CODING->dst_object, CODING->src_pos must be
7748    negative and CODING should not have `pre-write-conversion'.
7749
7750    If CODING->src_object is a string, CODING should not have
7751    `pre-write-conversion'.
7752
7753    If CODING->dst_object is a buffer, the encoded data is inserted at
7754    the current point of that buffer.
7755
7756    If CODING->dst_object is nil, the encoded data is placed at the
7757    memory area specified by CODING->destination.  */
7758
7759 static void
7760 encode_coding (struct coding_system *coding)
7761 {
7762   Lisp_Object attrs;
7763   Lisp_Object translation_table;
7764   int max_lookup;
7765   struct ccl_spec cclspec;
7766
7767   USE_SAFE_ALLOCA;
7768
7769   attrs = CODING_ID_ATTRS (coding->id);
7770   if (coding->encoder == encode_coding_raw_text)
7771     translation_table = Qnil, max_lookup = 0;
7772   else
7773     translation_table = get_translation_table (attrs, 1, &max_lookup);
7774
7775   if (BUFFERP (coding->dst_object))
7776     {
7777       set_buffer_internal (XBUFFER (coding->dst_object));
7778       coding->dst_multibyte
7779         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7780     }
7781
7782   coding->consumed = coding->consumed_char = 0;
7783   coding->produced = coding->produced_char = 0;
7784   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7785
7786   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7787
7788   if (coding->encoder == encode_coding_ccl)
7789     {
7790       coding->spec.ccl = &cclspec;
7791       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7792     }
7793   do {
7794     coding_set_source (coding);
7795     consume_chars (coding, translation_table, max_lookup);
7796     coding_set_destination (coding);
7797     (*(coding->encoder)) (coding);
7798   } while (coding->consumed_char < coding->src_chars);
7799
7800   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7801     insert_from_gap (coding->produced_char, coding->produced, 0);
7802
7803   SAFE_FREE ();
7804 }
7805
7806
7807 /* Name (or base name) of work buffer for code conversion.  */
7808 static Lisp_Object Vcode_conversion_workbuf_name;
7809
7810 /* A working buffer used by the top level conversion.  Once it is
7811    created, it is never destroyed.  It has the name
7812    Vcode_conversion_workbuf_name.  The other working buffers are
7813    destroyed after the use is finished, and their names are modified
7814    versions of Vcode_conversion_workbuf_name.  */
7815 static Lisp_Object Vcode_conversion_reused_workbuf;
7816
7817 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7818 static bool reused_workbuf_in_use;
7819
7820
7821 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7822    multibyteness of returning buffer.  */
7823
7824 static Lisp_Object
7825 make_conversion_work_buffer (bool multibyte)
7826 {
7827   Lisp_Object name, workbuf;
7828   struct buffer *current;
7829
7830   if (reused_workbuf_in_use)
7831     {
7832       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7833       workbuf = Fget_buffer_create (name);
7834     }
7835   else
7836     {
7837       reused_workbuf_in_use = 1;
7838       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7839         Vcode_conversion_reused_workbuf
7840           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7841       workbuf = Vcode_conversion_reused_workbuf;
7842     }
7843   current = current_buffer;
7844   set_buffer_internal (XBUFFER (workbuf));
7845   /* We can't allow modification hooks to run in the work buffer.  For
7846      instance, directory_files_internal assumes that file decoding
7847      doesn't compile new regexps.  */
7848   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7849   Ferase_buffer ();
7850   bset_undo_list (current_buffer, Qt);
7851   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7852   set_buffer_internal (current);
7853   return workbuf;
7854 }
7855
7856
7857 static void
7858 code_conversion_restore (Lisp_Object arg)
7859 {
7860   Lisp_Object current, workbuf;
7861
7862   current = XCAR (arg);
7863   workbuf = XCDR (arg);
7864   if (! NILP (workbuf))
7865     {
7866       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7867         reused_workbuf_in_use = 0;
7868       else
7869         Fkill_buffer (workbuf);
7870     }
7871   set_buffer_internal (XBUFFER (current));
7872 }
7873
7874 Lisp_Object
7875 code_conversion_save (bool with_work_buf, bool multibyte)
7876 {
7877   Lisp_Object workbuf = Qnil;
7878
7879   if (with_work_buf)
7880     workbuf = make_conversion_work_buffer (multibyte);
7881   record_unwind_protect (code_conversion_restore,
7882                          Fcons (Fcurrent_buffer (), workbuf));
7883   return workbuf;
7884 }
7885
7886 static void
7887 coding_restore_undo_list (Lisp_Object arg)
7888 {
7889   Lisp_Object undo_list = XCAR (arg);
7890   struct buffer *buf = XBUFFER (XCDR (arg));
7891
7892   bset_undo_list (buf, undo_list);
7893 }
7894
7895 void
7896 decode_coding_gap (struct coding_system *coding,
7897                    ptrdiff_t chars, ptrdiff_t bytes)
7898 {
7899   ptrdiff_t count = SPECPDL_INDEX ();
7900   Lisp_Object attrs;
7901
7902   coding->src_object = Fcurrent_buffer ();
7903   coding->src_chars = chars;
7904   coding->src_bytes = bytes;
7905   coding->src_pos = -chars;
7906   coding->src_pos_byte = -bytes;
7907   coding->src_multibyte = chars < bytes;
7908   coding->dst_object = coding->src_object;
7909   coding->dst_pos = PT;
7910   coding->dst_pos_byte = PT_BYTE;
7911   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7912
7913   coding->head_ascii = -1;
7914   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7915   coding->eol_seen = EOL_SEEN_NONE;
7916   if (CODING_REQUIRE_DETECTION (coding))
7917     detect_coding (coding);
7918   attrs = CODING_ID_ATTRS (coding->id);
7919   if (! disable_ascii_optimization
7920       && ! coding->src_multibyte
7921       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7922       && NILP (CODING_ATTR_POST_READ (attrs))
7923       && NILP (get_translation_table (attrs, 0, NULL)))
7924     {
7925       chars = coding->head_ascii;
7926       if (chars < 0)
7927         chars = check_ascii (coding);
7928       if (chars != bytes)
7929         {
7930           /* There exists a non-ASCII byte.  */
7931           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7932               && coding->detected_utf8_bytes == coding->src_bytes)
7933             {
7934               if (coding->detected_utf8_chars >= 0)
7935                 chars = coding->detected_utf8_chars;
7936               else
7937                 chars = check_utf_8 (coding);
7938               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7939                   && coding->head_ascii == 0
7940                   && coding->source[0] == UTF_8_BOM_1
7941                   && coding->source[1] == UTF_8_BOM_2
7942                   && coding->source[2] == UTF_8_BOM_3)
7943                 {
7944                   chars--;
7945                   bytes -= 3;
7946                   coding->src_bytes -= 3;
7947                 }
7948             }
7949           else
7950             chars = -1;
7951         }
7952       if (chars >= 0)
7953         {
7954           Lisp_Object eol_type;
7955
7956           eol_type = CODING_ID_EOL_TYPE (coding->id);
7957           if (VECTORP (eol_type))
7958             {
7959               if (coding->eol_seen != EOL_SEEN_NONE)
7960                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7961             }
7962           if (EQ (eol_type, Qmac))
7963             {
7964               unsigned char *src_end = GAP_END_ADDR;
7965               unsigned char *src = src_end - coding->src_bytes;
7966
7967               while (src < src_end)
7968                 {
7969                   if (*src++ == '\r')
7970                     src[-1] = '\n';
7971                 }
7972             }
7973           else if (EQ (eol_type, Qdos))
7974             {
7975               unsigned char *src = GAP_END_ADDR;
7976               unsigned char *src_beg = src - coding->src_bytes;
7977               unsigned char *dst = src;
7978               ptrdiff_t diff;
7979
7980               while (src_beg < src)
7981                 {
7982                   *--dst = *--src;
7983                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7984                     src--;
7985                 }
7986               diff = dst - src;
7987               bytes -= diff;
7988               chars -= diff;
7989             }
7990           coding->produced = bytes;
7991           coding->produced_char = chars;
7992           insert_from_gap (chars, bytes, 1);
7993           return;
7994         }
7995     }
7996   code_conversion_save (0, 0);
7997
7998   coding->mode |= CODING_MODE_LAST_BLOCK;
7999   current_buffer->text->inhibit_shrinking = 1;
8000   decode_coding (coding);
8001   current_buffer->text->inhibit_shrinking = 0;
8002
8003   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8004     {
8005       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8006       Lisp_Object val;
8007       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
8008       ptrdiff_t count1 = SPECPDL_INDEX ();
8009
8010       record_unwind_protect (coding_restore_undo_list,
8011                              Fcons (undo_list, Fcurrent_buffer ()));
8012       bset_undo_list (current_buffer, Qt);
8013       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8014       val = call1 (CODING_ATTR_POST_READ (attrs),
8015                    make_number (coding->produced_char));
8016       CHECK_NATNUM (val);
8017       coding->produced_char += Z - prev_Z;
8018       coding->produced += Z_BYTE - prev_Z_BYTE;
8019       unbind_to (count1, Qnil);
8020     }
8021
8022   unbind_to (count, Qnil);
8023 }
8024
8025
8026 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8027    SRC_OBJECT into DST_OBJECT by coding context CODING.
8028
8029    SRC_OBJECT is a buffer, a string, or Qnil.
8030
8031    If it is a buffer, the text is at point of the buffer.  FROM and TO
8032    are positions in the buffer.
8033
8034    If it is a string, the text is at the beginning of the string.
8035    FROM and TO are indices to the string.
8036
8037    If it is nil, the text is at coding->source.  FROM and TO are
8038    indices to coding->source.
8039
8040    DST_OBJECT is a buffer, Qt, or Qnil.
8041
8042    If it is a buffer, the decoded text is inserted at point of the
8043    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8044    is deleted.
8045
8046    If it is Qt, a string is made from the decoded text, and
8047    set in CODING->dst_object.
8048
8049    If it is Qnil, the decoded text is stored at CODING->destination.
8050    The caller must allocate CODING->dst_bytes bytes at
8051    CODING->destination by xmalloc.  If the decoded text is longer than
8052    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8053  */
8054
8055 void
8056 decode_coding_object (struct coding_system *coding,
8057                       Lisp_Object src_object,
8058                       ptrdiff_t from, ptrdiff_t from_byte,
8059                       ptrdiff_t to, ptrdiff_t to_byte,
8060                       Lisp_Object dst_object)
8061 {
8062   ptrdiff_t count = SPECPDL_INDEX ();
8063   unsigned char *destination UNINIT;
8064   ptrdiff_t dst_bytes UNINIT;
8065   ptrdiff_t chars = to - from;
8066   ptrdiff_t bytes = to_byte - from_byte;
8067   Lisp_Object attrs;
8068   ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
8069   bool need_marker_adjustment = 0;
8070   Lisp_Object old_deactivate_mark;
8071
8072   old_deactivate_mark = Vdeactivate_mark;
8073
8074   if (NILP (dst_object))
8075     {
8076       destination = coding->destination;
8077       dst_bytes = coding->dst_bytes;
8078     }
8079
8080   coding->src_object = src_object;
8081   coding->src_chars = chars;
8082   coding->src_bytes = bytes;
8083   coding->src_multibyte = chars < bytes;
8084
8085   if (STRINGP (src_object))
8086     {
8087       coding->src_pos = from;
8088       coding->src_pos_byte = from_byte;
8089     }
8090   else if (BUFFERP (src_object))
8091     {
8092       set_buffer_internal (XBUFFER (src_object));
8093       if (from != GPT)
8094         move_gap_both (from, from_byte);
8095       if (EQ (src_object, dst_object))
8096         {
8097           struct Lisp_Marker *tail;
8098
8099           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8100             {
8101               tail->need_adjustment
8102                 = tail->charpos == (tail->insertion_type ? from : to);
8103               need_marker_adjustment |= tail->need_adjustment;
8104             }
8105           saved_pt = PT, saved_pt_byte = PT_BYTE;
8106           TEMP_SET_PT_BOTH (from, from_byte);
8107           current_buffer->text->inhibit_shrinking = 1;
8108           del_range_both (from, from_byte, to, to_byte, 1);
8109           coding->src_pos = -chars;
8110           coding->src_pos_byte = -bytes;
8111         }
8112       else
8113         {
8114           coding->src_pos = from;
8115           coding->src_pos_byte = from_byte;
8116         }
8117     }
8118
8119   if (CODING_REQUIRE_DETECTION (coding))
8120     detect_coding (coding);
8121   attrs = CODING_ID_ATTRS (coding->id);
8122
8123   if (EQ (dst_object, Qt)
8124       || (! NILP (CODING_ATTR_POST_READ (attrs))
8125           && NILP (dst_object)))
8126     {
8127       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8128       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8129       coding->dst_pos = BEG;
8130       coding->dst_pos_byte = BEG_BYTE;
8131     }
8132   else if (BUFFERP (dst_object))
8133     {
8134       code_conversion_save (0, 0);
8135       coding->dst_object = dst_object;
8136       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8137       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8138       coding->dst_multibyte
8139         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8140     }
8141   else
8142     {
8143       code_conversion_save (0, 0);
8144       coding->dst_object = Qnil;
8145       /* Most callers presume this will return a multibyte result, and they
8146          won't use `binary' or `raw-text' anyway, so let's not worry about
8147          CODING_FOR_UNIBYTE.  */
8148       coding->dst_multibyte = 1;
8149     }
8150
8151   decode_coding (coding);
8152
8153   if (BUFFERP (coding->dst_object))
8154     set_buffer_internal (XBUFFER (coding->dst_object));
8155
8156   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8157     {
8158       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8159       Lisp_Object val;
8160       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
8161       ptrdiff_t count1 = SPECPDL_INDEX ();
8162
8163       record_unwind_protect (coding_restore_undo_list,
8164                              Fcons (undo_list, Fcurrent_buffer ()));
8165       bset_undo_list (current_buffer, Qt);
8166       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8167       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8168                         make_number (coding->produced_char));
8169       CHECK_NATNUM (val);
8170       coding->produced_char += Z - prev_Z;
8171       coding->produced += Z_BYTE - prev_Z_BYTE;
8172       unbind_to (count1, Qnil);
8173     }
8174
8175   if (EQ (dst_object, Qt))
8176     {
8177       coding->dst_object = Fbuffer_string ();
8178     }
8179   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8180     {
8181       set_buffer_internal (XBUFFER (coding->dst_object));
8182       if (dst_bytes < coding->produced)
8183         {
8184           eassert (coding->produced > 0);
8185           destination = xrealloc (destination, coding->produced);
8186           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8187             move_gap_both (BEGV, BEGV_BYTE);
8188           memcpy (destination, BEGV_ADDR, coding->produced);
8189           coding->destination = destination;
8190         }
8191     }
8192
8193   if (saved_pt >= 0)
8194     {
8195       /* This is the case of:
8196          (BUFFERP (src_object) && EQ (src_object, dst_object))
8197          As we have moved PT while replacing the original buffer
8198          contents, we must recover it now.  */
8199       set_buffer_internal (XBUFFER (src_object));
8200       current_buffer->text->inhibit_shrinking = 0;
8201       if (saved_pt < from)
8202         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8203       else if (saved_pt < from + chars)
8204         TEMP_SET_PT_BOTH (from, from_byte);
8205       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8206         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8207                           saved_pt_byte + (coding->produced - bytes));
8208       else
8209         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8210                           saved_pt_byte + (coding->produced - bytes));
8211
8212       if (need_marker_adjustment)
8213         {
8214           struct Lisp_Marker *tail;
8215
8216           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8217             if (tail->need_adjustment)
8218               {
8219                 tail->need_adjustment = 0;
8220                 if (tail->insertion_type)
8221                   {
8222                     tail->bytepos = from_byte;
8223                     tail->charpos = from;
8224                   }
8225                 else
8226                   {
8227                     tail->bytepos = from_byte + coding->produced;
8228                     tail->charpos
8229                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8230                          ? tail->bytepos : from + coding->produced_char);
8231                   }
8232               }
8233         }
8234     }
8235
8236   Vdeactivate_mark = old_deactivate_mark;
8237   unbind_to (count, coding->dst_object);
8238 }
8239
8240
8241 void
8242 encode_coding_object (struct coding_system *coding,
8243                       Lisp_Object src_object,
8244                       ptrdiff_t from, ptrdiff_t from_byte,
8245                       ptrdiff_t to, ptrdiff_t to_byte,
8246                       Lisp_Object dst_object)
8247 {
8248   ptrdiff_t count = SPECPDL_INDEX ();
8249   ptrdiff_t chars = to - from;
8250   ptrdiff_t bytes = to_byte - from_byte;
8251   Lisp_Object attrs;
8252   ptrdiff_t saved_pt = -1, saved_pt_byte;
8253   bool need_marker_adjustment = 0;
8254   bool kill_src_buffer = 0;
8255   Lisp_Object old_deactivate_mark;
8256
8257   old_deactivate_mark = Vdeactivate_mark;
8258
8259   coding->src_object = src_object;
8260   coding->src_chars = chars;
8261   coding->src_bytes = bytes;
8262   coding->src_multibyte = chars < bytes;
8263
8264   attrs = CODING_ID_ATTRS (coding->id);
8265
8266   if (EQ (src_object, dst_object))
8267     {
8268       struct Lisp_Marker *tail;
8269
8270       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8271         {
8272           tail->need_adjustment
8273             = tail->charpos == (tail->insertion_type ? from : to);
8274           need_marker_adjustment |= tail->need_adjustment;
8275         }
8276     }
8277
8278   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8279     {
8280       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8281       set_buffer_internal (XBUFFER (coding->src_object));
8282       if (STRINGP (src_object))
8283         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8284       else if (BUFFERP (src_object))
8285         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8286       else
8287         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8288
8289       if (EQ (src_object, dst_object))
8290         {
8291           set_buffer_internal (XBUFFER (src_object));
8292           saved_pt = PT, saved_pt_byte = PT_BYTE;
8293           del_range_both (from, from_byte, to, to_byte, 1);
8294           set_buffer_internal (XBUFFER (coding->src_object));
8295         }
8296
8297       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8298                   make_number (BEG), make_number (Z));
8299       if (XBUFFER (coding->src_object) != current_buffer)
8300         kill_src_buffer = 1;
8301       coding->src_object = Fcurrent_buffer ();
8302       if (BEG != GPT)
8303         move_gap_both (BEG, BEG_BYTE);
8304       coding->src_chars = Z - BEG;
8305       coding->src_bytes = Z_BYTE - BEG_BYTE;
8306       coding->src_pos = BEG;
8307       coding->src_pos_byte = BEG_BYTE;
8308       coding->src_multibyte = Z < Z_BYTE;
8309     }
8310   else if (STRINGP (src_object))
8311     {
8312       code_conversion_save (0, 0);
8313       coding->src_pos = from;
8314       coding->src_pos_byte = from_byte;
8315     }
8316   else if (BUFFERP (src_object))
8317     {
8318       code_conversion_save (0, 0);
8319       set_buffer_internal (XBUFFER (src_object));
8320       if (EQ (src_object, dst_object))
8321         {
8322           saved_pt = PT, saved_pt_byte = PT_BYTE;
8323           coding->src_object = del_range_1 (from, to, 1, 1);
8324           coding->src_pos = 0;
8325           coding->src_pos_byte = 0;
8326         }
8327       else
8328         {
8329           if (from < GPT && to >= GPT)
8330             move_gap_both (from, from_byte);
8331           coding->src_pos = from;
8332           coding->src_pos_byte = from_byte;
8333         }
8334     }
8335   else
8336     {
8337       code_conversion_save (0, 0);
8338       coding->src_pos = from;
8339       coding->src_pos_byte = from_byte;
8340     }
8341
8342   if (BUFFERP (dst_object))
8343     {
8344       coding->dst_object = dst_object;
8345       if (EQ (src_object, dst_object))
8346         {
8347           coding->dst_pos = from;
8348           coding->dst_pos_byte = from_byte;
8349         }
8350       else
8351         {
8352           struct buffer *current = current_buffer;
8353
8354           set_buffer_temp (XBUFFER (dst_object));
8355           coding->dst_pos = PT;
8356           coding->dst_pos_byte = PT_BYTE;
8357           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8358           set_buffer_temp (current);
8359         }
8360       coding->dst_multibyte
8361         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8362     }
8363   else if (EQ (dst_object, Qt))
8364     {
8365       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8366       coding->dst_object = Qnil;
8367       coding->destination = xmalloc (dst_bytes);
8368       coding->dst_bytes = dst_bytes;
8369       coding->dst_multibyte = 0;
8370     }
8371   else
8372     {
8373       coding->dst_object = Qnil;
8374       coding->dst_multibyte = 0;
8375     }
8376
8377   encode_coding (coding);
8378
8379   if (EQ (dst_object, Qt))
8380     {
8381       if (BUFFERP (coding->dst_object))
8382         coding->dst_object = Fbuffer_string ();
8383       else if (coding->raw_destination)
8384         /* This is used to avoid creating huge Lisp string.
8385            NOTE: caller who sets `raw_destination' is also
8386            responsible for freeing `destination' buffer.  */
8387         coding->dst_object = Qnil;
8388       else
8389         {
8390           coding->dst_object
8391             = make_unibyte_string ((char *) coding->destination,
8392                                    coding->produced);
8393           xfree (coding->destination);
8394         }
8395     }
8396
8397   if (saved_pt >= 0)
8398     {
8399       /* This is the case of:
8400          (BUFFERP (src_object) && EQ (src_object, dst_object))
8401          As we have moved PT while replacing the original buffer
8402          contents, we must recover it now.  */
8403       set_buffer_internal (XBUFFER (src_object));
8404       if (saved_pt < from)
8405         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8406       else if (saved_pt < from + chars)
8407         TEMP_SET_PT_BOTH (from, from_byte);
8408       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8409         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8410                           saved_pt_byte + (coding->produced - bytes));
8411       else
8412         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8413                           saved_pt_byte + (coding->produced - bytes));
8414
8415       if (need_marker_adjustment)
8416         {
8417           struct Lisp_Marker *tail;
8418
8419           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8420             if (tail->need_adjustment)
8421               {
8422                 tail->need_adjustment = 0;
8423                 if (tail->insertion_type)
8424                   {
8425                     tail->bytepos = from_byte;
8426                     tail->charpos = from;
8427                   }
8428                 else
8429                   {
8430                     tail->bytepos = from_byte + coding->produced;
8431                     tail->charpos
8432                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8433                          ? tail->bytepos : from + coding->produced_char);
8434                   }
8435               }
8436         }
8437     }
8438
8439   if (kill_src_buffer)
8440     Fkill_buffer (coding->src_object);
8441
8442   Vdeactivate_mark = old_deactivate_mark;
8443   unbind_to (count, Qnil);
8444 }
8445
8446
8447 Lisp_Object
8448 preferred_coding_system (void)
8449 {
8450   int id = coding_categories[coding_priorities[0]].id;
8451
8452   return CODING_ID_NAME (id);
8453 }
8454
8455 #if defined (WINDOWSNT) || defined (CYGWIN)
8456
8457 Lisp_Object
8458 from_unicode (Lisp_Object str)
8459 {
8460   CHECK_STRING (str);
8461   if (!STRING_MULTIBYTE (str) &&
8462       SBYTES (str) & 1)
8463     {
8464       str = Fsubstring (str, make_number (0), make_number (-1));
8465     }
8466
8467   return code_convert_string_norecord (str, Qutf_16le, 0);
8468 }
8469
8470 Lisp_Object
8471 from_unicode_buffer (const wchar_t *wstr)
8472 {
8473   /* We get one of the two final null bytes for free.  */
8474   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
8475   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
8476   return from_unicode (str);
8477 }
8478
8479 wchar_t *
8480 to_unicode (Lisp_Object str, Lisp_Object *buf)
8481 {
8482   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8483   /* We need to make another copy (in addition to the one made by
8484      code_convert_string_norecord) to ensure that the final string is
8485      _doubly_ zero terminated --- that is, that the string is
8486      terminated by two zero bytes and one utf-16le null character.
8487      Because strings are already terminated with a single zero byte,
8488      we just add one additional zero. */
8489   str = make_uninit_string (SBYTES (*buf) + 1);
8490   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8491   SDATA (str) [SBYTES (*buf)] = '\0';
8492   *buf = str;
8493   return WCSDATA (*buf);
8494 }
8495
8496 #endif /* WINDOWSNT || CYGWIN */
8497
8498 \f
8499 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8501
8502 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8503        doc: /* Return t if OBJECT is nil or a coding-system.
8504 See the documentation of `define-coding-system' for information
8505 about coding-system objects.  */)
8506   (Lisp_Object object)
8507 {
8508   if (NILP (object)
8509       || CODING_SYSTEM_ID (object) >= 0)
8510     return Qt;
8511   if (! SYMBOLP (object)
8512       || NILP (Fget (object, Qcoding_system_define_form)))
8513     return Qnil;
8514   return Qt;
8515 }
8516
8517 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8518        Sread_non_nil_coding_system, 1, 1, 0,
8519        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8520   (Lisp_Object prompt)
8521 {
8522   Lisp_Object val;
8523   do
8524     {
8525       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8526                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8527     }
8528   while (SCHARS (val) == 0);
8529   return (Fintern (val, Qnil));
8530 }
8531
8532 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8533        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8534 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8535 Ignores case when completing coding systems (all Emacs coding systems
8536 are lower-case).  */)
8537   (Lisp_Object prompt, Lisp_Object default_coding_system)
8538 {
8539   Lisp_Object val;
8540   ptrdiff_t count = SPECPDL_INDEX ();
8541
8542   if (SYMBOLP (default_coding_system))
8543     default_coding_system = SYMBOL_NAME (default_coding_system);
8544   specbind (Qcompletion_ignore_case, Qt);
8545   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8546                           Qt, Qnil, Qcoding_system_history,
8547                           default_coding_system, Qnil);
8548   unbind_to (count, Qnil);
8549   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8550 }
8551
8552 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8553        1, 1, 0,
8554        doc: /* Check validity of CODING-SYSTEM.
8555 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8556 It is valid if it is nil or a symbol defined as a coding system by the
8557 function `define-coding-system'.  */)
8558   (Lisp_Object coding_system)
8559 {
8560   Lisp_Object define_form;
8561
8562   define_form = Fget (coding_system, Qcoding_system_define_form);
8563   if (! NILP (define_form))
8564     {
8565       Fput (coding_system, Qcoding_system_define_form, Qnil);
8566       safe_eval (define_form);
8567     }
8568   if (!NILP (Fcoding_system_p (coding_system)))
8569     return coding_system;
8570   xsignal1 (Qcoding_system_error, coding_system);
8571 }
8572
8573 \f
8574 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8575    HIGHEST, return the coding system of the highest
8576    priority among the detected coding systems.  Otherwise return a
8577    list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP, it is assumed that the bytes are in correct
   multibyte form but contain only ASCII and eight-bit chars.
8580    Otherwise, the bytes are raw bytes.
8581
8582    CODING-SYSTEM controls the detection as below:
8583
8584    If it is nil, detect both text-format and eol-format.  If the
8585    text-format part of CODING-SYSTEM is already specified
8586    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8587    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8588    detect only text-format.  */
8589
8590 Lisp_Object
8591 detect_coding_system (const unsigned char *src,
8592                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8593                       bool highest, bool multibytep,
8594                       Lisp_Object coding_system)
8595 {
8596   const unsigned char *src_end = src + src_bytes;
8597   Lisp_Object attrs, eol_type;
8598   Lisp_Object val = Qnil;
8599   struct coding_system coding;
8600   ptrdiff_t id;
8601   struct coding_detection_info detect_info;
8602   enum coding_category base_category;
8603   bool null_byte_found = 0, eight_bit_found = 0;
8604
8605   if (NILP (coding_system))
8606     coding_system = Qundecided;
8607   setup_coding_system (coding_system, &coding);
8608   attrs = CODING_ID_ATTRS (coding.id);
8609   eol_type = CODING_ID_EOL_TYPE (coding.id);
8610   coding_system = CODING_ATTR_BASE_NAME (attrs);
8611
8612   coding.source = src;
8613   coding.src_chars = src_chars;
8614   coding.src_bytes = src_bytes;
8615   coding.src_multibyte = multibytep;
8616   coding.consumed = 0;
8617   coding.mode |= CODING_MODE_LAST_BLOCK;
8618   coding.head_ascii = 0;
8619
8620   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8621
8622   /* At first, detect text-format if necessary.  */
8623   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8624   if (base_category == coding_category_undecided)
8625     {
8626       enum coding_category category UNINIT;
8627       struct coding_system *this UNINIT;
8628       int c, i;
8629       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8630                                        inhibit_null_byte_detection);
8631       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8632                                        inhibit_iso_escape_detection);
8633       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8634
8635       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8636       for (; src < src_end; src++)
8637         {
8638           c = *src;
8639           if (c & 0x80)
8640             {
8641               eight_bit_found = 1;
8642               if (null_byte_found)
8643                 break;
8644             }
8645           else if (c < 0x20)
8646             {
8647               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8648                   && ! inhibit_ied
8649                   && ! detect_info.checked)
8650                 {
8651                   if (detect_coding_iso_2022 (&coding, &detect_info))
8652                     {
8653                       /* We have scanned the whole data.  */
8654                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8655                         {
8656                           /* We didn't find an 8-bit code.  We may
8657                              have found a null-byte, but it's very
                             rare that a binary file conforms to
8659                              ISO-2022.  */
8660                           src = src_end;
8661                           coding.head_ascii = src - coding.source;
8662                         }
8663                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8664                       break;
8665                     }
8666                 }
8667               else if (! c && !inhibit_nbd)
8668                 {
8669                   null_byte_found = 1;
8670                   if (eight_bit_found)
8671                     break;
8672                 }
8673               if (! eight_bit_found)
8674                 coding.head_ascii++;
8675             }
8676           else if (! eight_bit_found)
8677             coding.head_ascii++;
8678         }
8679
8680       if (null_byte_found || eight_bit_found
8681           || coding.head_ascii < coding.src_bytes
8682           || detect_info.found)
8683         {
8684           if (coding.head_ascii == coding.src_bytes)
8685             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8686             for (i = 0; i < coding_category_raw_text; i++)
8687               {
8688                 category = coding_priorities[i];
8689                 this = coding_categories + category;
8690                 if (detect_info.found & (1 << category))
8691                   break;
8692               }
8693           else
8694             {
8695               if (null_byte_found)
8696                 {
8697                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8698                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8699                 }
8700               else if (prefer_utf_8
8701                        && detect_coding_utf_8 (&coding, &detect_info))
8702                 {
8703                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8704                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8705                 }
8706               for (i = 0; i < coding_category_raw_text; i++)
8707                 {
8708                   category = coding_priorities[i];
8709                   this = coding_categories + category;
8710
8711                   if (this->id < 0)
8712                     {
8713                       /* No coding system of this category is defined.  */
8714                       detect_info.rejected |= (1 << category);
8715                     }
8716                   else if (category >= coding_category_raw_text)
8717                     continue;
8718                   else if (detect_info.checked & (1 << category))
8719                     {
8720                       if (highest
8721                           && (detect_info.found & (1 << category)))
8722                         break;
8723                     }
8724                   else if ((*(this->detector)) (&coding, &detect_info)
8725                            && highest
8726                            && (detect_info.found & (1 << category)))
8727                     {
8728                       if (category == coding_category_utf_16_auto)
8729                         {
8730                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8731                             category = coding_category_utf_16_le;
8732                           else
8733                             category = coding_category_utf_16_be;
8734                         }
8735                       break;
8736                     }
8737                 }
8738             }
8739         }
8740
8741       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8742           || null_byte_found)
8743         {
8744           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8745           id = CODING_SYSTEM_ID (Qno_conversion);
8746           val = list1 (make_number (id));
8747         }
8748       else if (! detect_info.rejected && ! detect_info.found)
8749         {
8750           detect_info.found = CATEGORY_MASK_ANY;
8751           id = coding_categories[coding_category_undecided].id;
8752           val = list1 (make_number (id));
8753         }
8754       else if (highest)
8755         {
8756           if (detect_info.found)
8757             {
8758               detect_info.found = 1 << category;
8759               val = list1 (make_number (this->id));
8760             }
8761           else
8762             for (i = 0; i < coding_category_raw_text; i++)
8763               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8764                 {
8765                   detect_info.found = 1 << coding_priorities[i];
8766                   id = coding_categories[coding_priorities[i]].id;
8767                   val = list1 (make_number (id));
8768                   break;
8769                 }
8770         }
8771       else
8772         {
8773           int mask = detect_info.rejected | detect_info.found;
8774           int found = 0;
8775
8776           for (i = coding_category_raw_text - 1; i >= 0; i--)
8777             {
8778               category = coding_priorities[i];
8779               if (! (mask & (1 << category)))
8780                 {
8781                   found |= 1 << category;
8782                   id = coding_categories[category].id;
8783                   if (id >= 0)
8784                     val = list1 (make_number (id));
8785                 }
8786             }
8787           for (i = coding_category_raw_text - 1; i >= 0; i--)
8788             {
8789               category = coding_priorities[i];
8790               if (detect_info.found & (1 << category))
8791                 {
8792                   id = coding_categories[category].id;
8793                   val = Fcons (make_number (id), val);
8794                 }
8795             }
8796           detect_info.found |= found;
8797         }
8798     }
8799   else if (base_category == coding_category_utf_8_auto)
8800     {
8801       if (detect_coding_utf_8 (&coding, &detect_info))
8802         {
8803           struct coding_system *this;
8804
8805           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8806             this = coding_categories + coding_category_utf_8_sig;
8807           else
8808             this = coding_categories + coding_category_utf_8_nosig;
8809           val = list1 (make_number (this->id));
8810         }
8811     }
8812   else if (base_category == coding_category_utf_16_auto)
8813     {
8814       if (detect_coding_utf_16 (&coding, &detect_info))
8815         {
8816           struct coding_system *this;
8817
8818           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8819             this = coding_categories + coding_category_utf_16_le;
8820           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8821             this = coding_categories + coding_category_utf_16_be;
8822           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8823             this = coding_categories + coding_category_utf_16_be_nosig;
8824           else
8825             this = coding_categories + coding_category_utf_16_le_nosig;
8826           val = list1 (make_number (this->id));
8827         }
8828     }
8829   else
8830     {
8831       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8832       val = list1 (make_number (coding.id));
8833     }
8834
8835   /* Then, detect eol-format if necessary.  */
8836   {
8837     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8838     Lisp_Object tail;
8839
8840     if (VECTORP (eol_type))
8841       {
8842         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8843           {
8844             if (null_byte_found)
8845               normal_eol = EOL_SEEN_LF;
8846             else
8847               normal_eol = detect_eol (coding.source, src_bytes,
8848                                        coding_category_raw_text);
8849           }
8850         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8851                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8852           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8853                                       coding_category_utf_16_be);
8854         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8855                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8856           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8857                                       coding_category_utf_16_le);
8858       }
8859     else
8860       {
8861         if (EQ (eol_type, Qunix))
8862           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8863         else if (EQ (eol_type, Qdos))
8864           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8865         else
8866           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8867       }
8868
8869     for (tail = val; CONSP (tail); tail = XCDR (tail))
8870       {
8871         enum coding_category category;
8872         int this_eol;
8873
8874         id = XINT (XCAR (tail));
8875         attrs = CODING_ID_ATTRS (id);
8876         category = XINT (CODING_ATTR_CATEGORY (attrs));
8877         eol_type = CODING_ID_EOL_TYPE (id);
8878         if (VECTORP (eol_type))
8879           {
8880             if (category == coding_category_utf_16_be
8881                 || category == coding_category_utf_16_be_nosig)
8882               this_eol = utf_16_be_eol;
8883             else if (category == coding_category_utf_16_le
8884                      || category == coding_category_utf_16_le_nosig)
8885               this_eol = utf_16_le_eol;
8886             else
8887               this_eol = normal_eol;
8888
8889             if (this_eol == EOL_SEEN_LF)
8890               XSETCAR (tail, AREF (eol_type, 0));
8891             else if (this_eol == EOL_SEEN_CRLF)
8892               XSETCAR (tail, AREF (eol_type, 1));
8893             else if (this_eol == EOL_SEEN_CR)
8894               XSETCAR (tail, AREF (eol_type, 2));
8895             else
8896               XSETCAR (tail, CODING_ID_NAME (id));
8897           }
8898         else
8899           XSETCAR (tail, CODING_ID_NAME (id));
8900       }
8901   }
8902
8903   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8904 }
8905
8906
8907 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8908        2, 3, 0,
8909        doc: /* Detect coding system of the text in the region between START and END.
8910 Return a list of possible coding systems ordered by priority.
8911 The coding systems to try and their priorities follows what
8912 the function `coding-system-priority-list' (which see) returns.
8913
8914 If only ASCII characters are found (except for such ISO-2022 control
8915 characters as ESC), it returns a list of single element `undecided'
8916 or its subsidiary coding system according to a detected end-of-line
8917 format.
8918
8919 If optional argument HIGHEST is non-nil, return the coding system of
8920 highest priority.  */)
8921   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8922 {
8923   ptrdiff_t from, to;
8924   ptrdiff_t from_byte, to_byte;
8925
8926   validate_region (&start, &end);
8927   from = XINT (start), to = XINT (end);
8928   from_byte = CHAR_TO_BYTE (from);
8929   to_byte = CHAR_TO_BYTE (to);
8930
8931   if (from < GPT && to >= GPT)
8932     move_gap_both (to, to_byte);
8933
8934   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8935                                to - from, to_byte - from_byte,
8936                                !NILP (highest),
8937                                !NILP (BVAR (current_buffer
8938                                       , enable_multibyte_characters)),
8939                                Qnil);
8940 }
8941
8942 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8943        1, 2, 0,
8944        doc: /* Detect coding system of the text in STRING.
8945 Return a list of possible coding systems ordered by priority.
8946 The coding systems to try and their priorities follows what
8947 the function `coding-system-priority-list' (which see) returns.
8948
8949 If only ASCII characters are found (except for such ISO-2022 control
8950 characters as ESC), it returns a list of single element `undecided'
8951 or its subsidiary coding system according to a detected end-of-line
8952 format.
8953
8954 If optional argument HIGHEST is non-nil, return the coding system of
8955 highest priority.  */)
8956   (Lisp_Object string, Lisp_Object highest)
8957 {
8958   CHECK_STRING (string);
8959
8960   return detect_coding_system (SDATA (string),
8961                                SCHARS (string), SBYTES (string),
8962                                !NILP (highest), STRING_MULTIBYTE (string),
8963                                Qnil);
8964 }
8965
8966
8967 static bool
8968 char_encodable_p (int c, Lisp_Object attrs)
8969 {
8970   Lisp_Object tail;
8971   struct charset *charset;
8972   Lisp_Object translation_table;
8973
8974   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8975   if (! NILP (translation_table))
8976     c = translate_char (translation_table, c);
8977   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8978        CONSP (tail); tail = XCDR (tail))
8979     {
8980       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8981       if (CHAR_CHARSET_P (c, charset))
8982         break;
8983     }
8984   return (! NILP (tail));
8985 }
8986
8987
8988 /* Return a list of coding systems that safely encode the text between
8989    START and END.  If EXCLUDE is non-nil, it is a list of coding
8990    systems not to check.  The returned list doesn't contain any such
8991    coding systems.  In any case, if the text contains only ASCII or is
8992    unibyte, return t.  */
8993
8994 DEFUN ("find-coding-systems-region-internal",
8995        Ffind_coding_systems_region_internal,
8996        Sfind_coding_systems_region_internal, 2, 3, 0,
8997        doc: /* Internal use only.  */)
8998   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8999 {
9000   Lisp_Object coding_attrs_list, safe_codings;
9001   ptrdiff_t start_byte, end_byte;
9002   const unsigned char *p, *pbeg, *pend;
9003   int c;
9004   Lisp_Object tail, elt, work_table;
9005
9006   if (STRINGP (start))
9007     {
9008       if (!STRING_MULTIBYTE (start)
9009           || SCHARS (start) == SBYTES (start))
9010         return Qt;
9011       start_byte = 0;
9012       end_byte = SBYTES (start);
9013     }
9014   else
9015     {
9016       CHECK_NUMBER_COERCE_MARKER (start);
9017       CHECK_NUMBER_COERCE_MARKER (end);
9018       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9019         args_out_of_range (start, end);
9020       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9021         return Qt;
9022       start_byte = CHAR_TO_BYTE (XINT (start));
9023       end_byte = CHAR_TO_BYTE (XINT (end));
9024       if (XINT (end) - XINT (start) == end_byte - start_byte)
9025         return Qt;
9026
9027       if (XINT (start) < GPT && XINT (end) > GPT)
9028         {
9029           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9030             move_gap_both (XINT (start), start_byte);
9031           else
9032             move_gap_both (XINT (end), end_byte);
9033         }
9034     }
9035
9036   coding_attrs_list = Qnil;
9037   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9038     if (NILP (exclude)
9039         || NILP (Fmemq (XCAR (tail), exclude)))
9040       {
9041         Lisp_Object attrs;
9042
9043         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9044         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9045           {
9046             ASET (attrs, coding_attr_trans_tbl,
9047                   get_translation_table (attrs, 1, NULL));
9048             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9049           }
9050       }
9051
9052   if (STRINGP (start))
9053     p = pbeg = SDATA (start);
9054   else
9055     p = pbeg = BYTE_POS_ADDR (start_byte);
9056   pend = p + (end_byte - start_byte);
9057
9058   while (p < pend && ASCII_CHAR_P (*p)) p++;
9059   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9060
9061   work_table = Fmake_char_table (Qnil, Qnil);
9062   while (p < pend)
9063     {
9064       if (ASCII_CHAR_P (*p))
9065         p++;
9066       else
9067         {
9068           c = STRING_CHAR_ADVANCE (p);
9069           if (!NILP (char_table_ref (work_table, c)))
9070             /* This character was already checked.  Ignore it.  */
9071             continue;
9072
9073           charset_map_loaded = 0;
9074           for (tail = coding_attrs_list; CONSP (tail);)
9075             {
9076               elt = XCAR (tail);
9077               if (NILP (elt))
9078                 tail = XCDR (tail);
9079               else if (char_encodable_p (c, elt))
9080                 tail = XCDR (tail);
9081               else if (CONSP (XCDR (tail)))
9082                 {
9083                   XSETCAR (tail, XCAR (XCDR (tail)));
9084                   XSETCDR (tail, XCDR (XCDR (tail)));
9085                 }
9086               else
9087                 {
9088                   XSETCAR (tail, Qnil);
9089                   tail = XCDR (tail);
9090                 }
9091             }
9092           if (charset_map_loaded)
9093             {
9094               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9095
9096               if (STRINGP (start))
9097                 pbeg = SDATA (start);
9098               else
9099                 pbeg = BYTE_POS_ADDR (start_byte);
9100               p = pbeg + p_offset;
9101               pend = pbeg + pend_offset;
9102             }
9103           char_table_set (work_table, c, Qt);
9104         }
9105     }
9106
9107   safe_codings = list2 (Qraw_text, Qno_conversion);
9108   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9109     if (! NILP (XCAR (tail)))
9110       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9111
9112   return safe_codings;
9113 }
9114
9115
9116 DEFUN ("unencodable-char-position", Funencodable_char_position,
9117        Sunencodable_char_position, 3, 5, 0,
9118        doc: /* Return position of first un-encodable character in a region.
9119 START and END specify the region and CODING-SYSTEM specifies the
9120 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9121
9122 If optional 4th argument COUNT is non-nil, it specifies at most how
9123 many un-encodable characters to search.  In this case, the value is a
9124 list of positions.
9125
9126 If optional 5th argument STRING is non-nil, it is a string to search
9127 for un-encodable characters.  In that case, START and END are indexes
9128 to the string and treated as in `substring'.  */)
9129   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9130    Lisp_Object count, Lisp_Object string)
9131 {
9132   EMACS_INT n;
9133   struct coding_system coding;
9134   Lisp_Object attrs, charset_list, translation_table;
9135   Lisp_Object positions;
9136   ptrdiff_t from, to;
9137   const unsigned char *p, *stop, *pend;
9138   bool ascii_compatible;
9139
9140   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9141   attrs = CODING_ID_ATTRS (coding.id);
9142   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9143     return Qnil;
9144   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9145   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9146   translation_table = get_translation_table (attrs, 1, NULL);
9147
9148   if (NILP (string))
9149     {
9150       validate_region (&start, &end);
9151       from = XINT (start);
9152       to = XINT (end);
9153       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9154           || (ascii_compatible
9155               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9156         return Qnil;
9157       p = CHAR_POS_ADDR (from);
9158       pend = CHAR_POS_ADDR (to);
9159       if (from < GPT && to >= GPT)
9160         stop = GPT_ADDR;
9161       else
9162         stop = pend;
9163     }
9164   else
9165     {
9166       CHECK_STRING (string);
9167       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9168       if (! STRING_MULTIBYTE (string))
9169         return Qnil;
9170       p = SDATA (string) + string_char_to_byte (string, from);
9171       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9172       if (ascii_compatible && (to - from) == (pend - p))
9173         return Qnil;
9174     }
9175
9176   if (NILP (count))
9177     n = 1;
9178   else
9179     {
9180       CHECK_NATNUM (count);
9181       n = XINT (count);
9182     }
9183
9184   positions = Qnil;
9185   charset_map_loaded = 0;
9186   while (1)
9187     {
9188       int c;
9189
9190       if (ascii_compatible)
9191         while (p < stop && ASCII_CHAR_P (*p))
9192           p++, from++;
9193       if (p >= stop)
9194         {
9195           if (p >= pend)
9196             break;
9197           stop = pend;
9198           p = GAP_END_ADDR;
9199         }
9200
9201       c = STRING_CHAR_ADVANCE (p);
9202       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9203           && ! char_charset (translate_char (translation_table, c),
9204                              charset_list, NULL))
9205         {
9206           positions = Fcons (make_number (from), positions);
9207           n--;
9208           if (n == 0)
9209             break;
9210         }
9211
9212       from++;
9213       if (charset_map_loaded && NILP (string))
9214         {
9215           p = CHAR_POS_ADDR (from);
9216           pend = CHAR_POS_ADDR (to);
9217           if (from < GPT && to >= GPT)
9218             stop = GPT_ADDR;
9219           else
9220             stop = pend;
9221           charset_map_loaded = 0;
9222         }
9223     }
9224
9225   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9226 }
9227
9228
9229 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9230        Scheck_coding_systems_region, 3, 3, 0,
9231        doc: /* Check if the region is encodable by coding systems.
9232
9233 START and END are buffer positions specifying the region.
9234 CODING-SYSTEM-LIST is a list of coding systems to check.
9235
9236 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9237 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9238 whole region, POS0, POS1, ... are buffer positions where non-encodable
9239 characters are found.
9240
9241 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9242 value is nil.
9243
9244 START may be a string.  In that case, check if the string is
9245 encodable, and the value contains indices to the string instead of
9246 buffer positions.  END is ignored.
9247
9248 If the current buffer (or START if it is a string) is unibyte, the value
9249 is nil.  */)
9250   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9251 {
9252   Lisp_Object list;
9253   ptrdiff_t start_byte, end_byte;
9254   ptrdiff_t pos;
9255   const unsigned char *p, *pbeg, *pend;
9256   int c;
9257   Lisp_Object tail, elt, attrs;
9258
9259   if (STRINGP (start))
9260     {
9261       if (!STRING_MULTIBYTE (start)
9262           || SCHARS (start) == SBYTES (start))
9263         return Qnil;
9264       start_byte = 0;
9265       end_byte = SBYTES (start);
9266       pos = 0;
9267     }
9268   else
9269     {
9270       CHECK_NUMBER_COERCE_MARKER (start);
9271       CHECK_NUMBER_COERCE_MARKER (end);
9272       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9273         args_out_of_range (start, end);
9274       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9275         return Qnil;
9276       start_byte = CHAR_TO_BYTE (XINT (start));
9277       end_byte = CHAR_TO_BYTE (XINT (end));
9278       if (XINT (end) - XINT (start) == end_byte - start_byte)
9279         return Qnil;
9280
9281       if (XINT (start) < GPT && XINT (end) > GPT)
9282         {
9283           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9284             move_gap_both (XINT (start), start_byte);
9285           else
9286             move_gap_both (XINT (end), end_byte);
9287         }
9288       pos = XINT (start);
9289     }
9290
9291   list = Qnil;
9292   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9293     {
9294       elt = XCAR (tail);
9295       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9296       ASET (attrs, coding_attr_trans_tbl,
9297             get_translation_table (attrs, 1, NULL));
9298       list = Fcons (list2 (elt, attrs), list);
9299     }
9300
9301   if (STRINGP (start))
9302     p = pbeg = SDATA (start);
9303   else
9304     p = pbeg = BYTE_POS_ADDR (start_byte);
9305   pend = p + (end_byte - start_byte);
9306
9307   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9308   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9309
9310   while (p < pend)
9311     {
9312       if (ASCII_CHAR_P (*p))
9313         p++;
9314       else
9315         {
9316           c = STRING_CHAR_ADVANCE (p);
9317
9318           charset_map_loaded = 0;
9319           for (tail = list; CONSP (tail); tail = XCDR (tail))
9320             {
9321               elt = XCDR (XCAR (tail));
9322               if (! char_encodable_p (c, XCAR (elt)))
9323                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9324             }
9325           if (charset_map_loaded)
9326             {
9327               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9328
9329               if (STRINGP (start))
9330                 pbeg = SDATA (start);
9331               else
9332                 pbeg = BYTE_POS_ADDR (start_byte);
9333               p = pbeg + p_offset;
9334               pend = pbeg + pend_offset;
9335             }
9336         }
9337       pos++;
9338     }
9339
9340   tail = list;
9341   list = Qnil;
9342   for (; CONSP (tail); tail = XCDR (tail))
9343     {
9344       elt = XCAR (tail);
9345       if (CONSP (XCDR (XCDR (elt))))
9346         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9347                       list);
9348     }
9349
9350   return list;
9351 }
9352
9353
9354 static Lisp_Object
9355 code_convert_region (Lisp_Object start, Lisp_Object end,
9356                      Lisp_Object coding_system, Lisp_Object dst_object,
9357                      bool encodep, bool norecord)
9358 {
9359   struct coding_system coding;
9360   ptrdiff_t from, from_byte, to, to_byte;
9361   Lisp_Object src_object;
9362
9363   if (NILP (coding_system))
9364     coding_system = Qno_conversion;
9365   else
9366     CHECK_CODING_SYSTEM (coding_system);
9367   src_object = Fcurrent_buffer ();
9368   if (NILP (dst_object))
9369     dst_object = src_object;
9370   else if (! EQ (dst_object, Qt))
9371     CHECK_BUFFER (dst_object);
9372
9373   validate_region (&start, &end);
9374   from = XFASTINT (start);
9375   from_byte = CHAR_TO_BYTE (from);
9376   to = XFASTINT (end);
9377   to_byte = CHAR_TO_BYTE (to);
9378
9379   setup_coding_system (coding_system, &coding);
9380   coding.mode |= CODING_MODE_LAST_BLOCK;
9381
9382   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9383     {
9384       struct buffer *buf = XBUFFER (dst_object);
9385       ptrdiff_t buf_pt = BUF_PT (buf);
9386
9387       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9388     }
9389
9390   if (encodep)
9391     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9392                           dst_object);
9393   else
9394     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9395                           dst_object);
9396   if (! norecord)
9397     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9398
9399   return (BUFFERP (dst_object)
9400           ? make_number (coding.produced_char)
9401           : coding.dst_object);
9402 }
9403
9404
9405 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9406        3, 4, "r\nzCoding system: ",
9407        doc: /* Decode the current region from the specified coding system.
9408 When called from a program, takes four arguments:
9409         START, END, CODING-SYSTEM, and DESTINATION.
9410 START and END are buffer positions.
9411
9412 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9413 If nil, the region between START and END is replaced by the decoded text.
9414 If buffer, the decoded text is inserted in that buffer after point (point
9415 does not move).
9416 In those cases, the length of the decoded text is returned.
9417 If DESTINATION is t, the decoded text is returned.
9418
9419 This function sets `last-coding-system-used' to the precise coding system
9420 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9421 not fully specified.)  */)
9422   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9423 {
9424   return code_convert_region (start, end, coding_system, destination, 0, 0);
9425 }
9426
9427 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9428        3, 4, "r\nzCoding system: ",
9429        doc: /* Encode the current region by specified coding system.
9430 When called from a program, takes four arguments:
9431         START, END, CODING-SYSTEM and DESTINATION.
9432 START and END are buffer positions.
9433
9434 Optional 4th argument DESTINATION specifies where the encoded text goes.
9435 If nil, the region between START and END is replaced by the encoded text.
9436 If buffer, the encoded text is inserted in that buffer after point (point
9437 does not move).
9438 In those cases, the length of the encoded text is returned.
9439 If DESTINATION is t, the encoded text is returned.
9440
9441 This function sets `last-coding-system-used' to the precise coding system
9442 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9443 not fully specified.)  */)
9444   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9445 {
9446   return code_convert_region (start, end, coding_system, destination, 1, 0);
9447 }
9448
9449 Lisp_Object
9450 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9451                      Lisp_Object dst_object, bool encodep, bool nocopy,
9452                      bool norecord)
9453 {
9454   struct coding_system coding;
9455   ptrdiff_t chars, bytes;
9456
9457   CHECK_STRING (string);
9458   if (NILP (coding_system))
9459     {
9460       if (! norecord)
9461         Vlast_coding_system_used = Qno_conversion;
9462       if (NILP (dst_object))
9463         return (nocopy ? Fcopy_sequence (string) : string);
9464     }
9465
9466   if (NILP (coding_system))
9467     coding_system = Qno_conversion;
9468   else
9469     CHECK_CODING_SYSTEM (coding_system);
9470   if (NILP (dst_object))
9471     dst_object = Qt;
9472   else if (! EQ (dst_object, Qt))
9473     CHECK_BUFFER (dst_object);
9474
9475   setup_coding_system (coding_system, &coding);
9476   coding.mode |= CODING_MODE_LAST_BLOCK;
9477   chars = SCHARS (string);
9478   bytes = SBYTES (string);
9479
9480   if (BUFFERP (dst_object))
9481     {
9482       struct buffer *buf = XBUFFER (dst_object);
9483       ptrdiff_t buf_pt = BUF_PT (buf);
9484
9485       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9486     }
9487
9488   if (encodep)
9489     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9490   else
9491     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9492   if (! norecord)
9493     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9494
9495   return (BUFFERP (dst_object)
9496           ? make_number (coding.produced_char)
9497           : coding.dst_object);
9498 }
9499
9500
9501 /* Encode or decode STRING according to CODING_SYSTEM.
9502    Do not set Vlast_coding_system_used.
9503
9504    This function is called only from macros DECODE_FILE and
9505    ENCODE_FILE, thus we ignore character composition.  */
9506
9507 Lisp_Object
9508 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9509                               bool encodep)
9510 {
9511   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9512 }
9513
9514 /* Encode or decode a file name, to or from a unibyte string suitable
9515    for passing to C library functions.  */
9516 Lisp_Object
9517 decode_file_name (Lisp_Object fname)
9518 {
9519 #ifdef WINDOWSNT
9520   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9521      converts the file names either to UTF-16LE or to the system ANSI
9522      codepage internally, depending on the underlying OS; see w32.c.  */
9523   if (! NILP (Fcoding_system_p (Qutf_8)))
9524     return code_convert_string_norecord (fname, Qutf_8, 0);
9525   return fname;
9526 #else  /* !WINDOWSNT */
9527   if (! NILP (Vfile_name_coding_system))
9528     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9529   else if (! NILP (Vdefault_file_name_coding_system))
9530     return code_convert_string_norecord (fname,
9531                                          Vdefault_file_name_coding_system, 0);
9532   else
9533     return fname;
9534 #endif
9535 }
9536
9537 Lisp_Object
9538 encode_file_name (Lisp_Object fname)
9539 {
9540   /* This is especially important during bootstrap and dumping, when
9541      file-name encoding is not yet known, and therefore any non-ASCII
9542      file names are unibyte strings, and could only be thrashed if we
9543      try to encode them.  */
9544   if (!STRING_MULTIBYTE (fname))
9545     return fname;
9546 #ifdef WINDOWSNT
9547   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9548      converts the file names either to UTF-16LE or to the system ANSI
9549      codepage internally, depending on the underlying OS; see w32.c.  */
9550   if (! NILP (Fcoding_system_p (Qutf_8)))
9551     return code_convert_string_norecord (fname, Qutf_8, 1);
9552   return fname;
9553 #else  /* !WINDOWSNT */
9554   if (! NILP (Vfile_name_coding_system))
9555     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9556   else if (! NILP (Vdefault_file_name_coding_system))
9557     return code_convert_string_norecord (fname,
9558                                          Vdefault_file_name_coding_system, 1);
9559   else
9560     return fname;
9561 #endif
9562 }
9563
9564 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9565        2, 4, 0,
9566        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9567
9568 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9569 if the decoding operation is trivial.
9570
9571 Optional fourth arg BUFFER non-nil means that the decoded text is
9572 inserted in that buffer after point (point does not move).  In this
9573 case, the return value is the length of the decoded text.
9574
9575 This function sets `last-coding-system-used' to the precise coding system
9576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9577 not fully specified.)  */)
9578   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9579 {
9580   return code_convert_string (string, coding_system, buffer,
9581                               0, ! NILP (nocopy), 0);
9582 }
9583
9584 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9585        2, 4, 0,
9586        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9587
9588 Optional third arg NOCOPY non-nil means it is OK to return STRING
9589 itself if the encoding operation is trivial.
9590
9591 Optional fourth arg BUFFER non-nil means that the encoded text is
9592 inserted in that buffer after point (point does not move).  In this
9593 case, the return value is the length of the encoded text.
9594
9595 This function sets `last-coding-system-used' to the precise coding system
9596 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9597 not fully specified.)  */)
9598   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9599 {
9600   return code_convert_string (string, coding_system, buffer,
9601                               1, ! NILP (nocopy), 0);
9602 }
9603
9604 \f
9605 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9606        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9607 Return the corresponding character.  */)
9608   (Lisp_Object code)
9609 {
9610   Lisp_Object spec, attrs, val;
9611   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9612   EMACS_INT ch;
9613   int c;
9614
9615   CHECK_NATNUM (code);
9616   ch = XFASTINT (code);
9617   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9618   attrs = AREF (spec, 0);
9619
9620   if (ASCII_CHAR_P (ch)
9621       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9622     return code;
9623
9624   val = CODING_ATTR_CHARSET_LIST (attrs);
9625   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9626   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9627   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9628
9629   if (ch <= 0x7F)
9630     {
9631       c = ch;
9632       charset = charset_roman;
9633     }
9634   else if (ch >= 0xA0 && ch < 0xDF)
9635     {
9636       c = ch - 0x80;
9637       charset = charset_kana;
9638     }
9639   else
9640     {
9641       EMACS_INT c1 = ch >> 8;
9642       int c2 = ch & 0xFF;
9643
9644       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9645           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9646         error ("Invalid code: %"pI"d", ch);
9647       c = ch;
9648       SJIS_TO_JIS (c);
9649       charset = charset_kanji;
9650     }
9651   c = DECODE_CHAR (charset, c);
9652   if (c < 0)
9653     error ("Invalid code: %"pI"d", ch);
9654   return make_number (c);
9655 }
9656
9657
9658 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9659        doc: /* Encode a Japanese character CH to shift_jis encoding.
9660 Return the corresponding code in SJIS, or signal an error if none.  */)
9661   (Lisp_Object ch)
9662 {
9663   Lisp_Object spec, attrs;
9664   int c;
9665   struct charset *charset;
9666   unsigned code;
9667
9668   CHECK_CHARACTER (ch);
9669   c = XFASTINT (ch);
9670   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9671   attrs = AREF (spec, 0);
9672
9673   if (ASCII_CHAR_P (c) && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9674     return ch;
9675
9676   /* Reject a null charset before using it: char_charset is defined
9677      elsewhere and presumably returns null when no listed charset
9678      contains C, in which case CODE cannot be trusted either.  */
9679   charset = char_charset (c, CODING_ATTR_CHARSET_LIST (attrs), &code);
9680   if (! charset || code == CHARSET_INVALID_CODE (charset))
9681     error ("Can't encode by shift_jis encoding: %c", c);
9682   JIS_TO_SJIS (code);           /* Rewrites CODE in place.  */
9683   return make_number (code);
9684 }
9685
9686 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9687        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9688 Return the corresponding character; signal an error if CODE is invalid.  */)
9689   (Lisp_Object code)
9690 {
9691   Lisp_Object spec, attrs, val;
9692   struct charset *charset_roman, *charset_big5, *charset;
9693   EMACS_INT ch;
9694   int c;
9695
9696   CHECK_NATNUM (code);
9697   ch = XFASTINT (code);
9698   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9699   attrs = AREF (spec, 0);
9700
9701   if (ASCII_CHAR_P (ch) && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9702     return code;
9703
9704   val = CODING_ATTR_CHARSET_LIST (attrs);
9705   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9706   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9707
9708   if (ch <= 0x7F)
9709     {
9710       c = ch;
9711       charset = charset_roman;
9712     }
9713   else
9714     {
9715       EMACS_INT b1 = ch >> 8;
9716       /* Take the whole trail byte; a 0x7F mask would hide 0xA1..0xFE.  */
9717       int b2 = ch & 0xFF;
9718       if (b1 < 0xA1 || b1 > 0xFE
9719           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9720         error ("Invalid code: %"pI"d", ch);
9721       c = ch;
9722       charset = charset_big5;
9723     }
9724   c = DECODE_CHAR (charset, c);
9725   if (c < 0)
9726     error ("Invalid code: %"pI"d", ch);
9727   return make_number (c);
9728 }
9729
9730 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9731        doc: /* Encode the Big5 character CH to BIG5 coding system.
9732 Return the corresponding character code in Big5, or signal an error.  */)
9733   (Lisp_Object ch)
9734 {
9735   Lisp_Object spec, attrs;
9736   struct charset *charset;
9737   int c;
9738   unsigned code;
9739
9740   CHECK_CHARACTER (ch);
9741   c = XFASTINT (ch);
9742   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9743   attrs = AREF (spec, 0);
9744   if (ASCII_CHAR_P (c) && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9745     return ch;
9746
9747   /* Reject a null charset before using it: char_charset is defined
9748      elsewhere and presumably returns null when no listed charset
9749      contains C, in which case CODE cannot be trusted either.  */
9750   charset = char_charset (c, CODING_ATTR_CHARSET_LIST (attrs), &code);
9751   if (! charset || code == CHARSET_INVALID_CODE (charset))
9752     error ("Can't encode by Big5 encoding: %c", c);
9753   return make_number (code);
9754 }
9755
9756 \f
9757 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9758        Sset_terminal_coding_system_internal, 1, 2, 0,
9759        doc: /* Internal use only.  */)
9760   (Lisp_Object coding_system, Lisp_Object terminal)
9761 {
9762   struct terminal *tty = decode_live_terminal (terminal);
9763   struct coding_system *coding = TERMINAL_TERMINAL_CODING (tty);
9764   CHECK_SYMBOL (coding_system);
9765   setup_coding_system (Fcheck_coding_system (coding_system), coding);
9766   /* Keep unsafe characters off the terminal and disable composition.  */
9767   coding->mode |= CODING_MODE_SAFE_ENCODING;
9768   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9769   coding->src_multibyte = 1;
9770   coding->dst_multibyte = 0;
9771   Lisp_Object charsets
9772     = (coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9773        ? coding_charset_list (coding)
9774        : list1 (make_number (charset_ascii)));
9775   tset_charset_list (tty, charsets);
9776   return Qnil;
9777 }
9778
9779 DEFUN ("set-safe-terminal-coding-system-internal",
9780        Fset_safe_terminal_coding_system_internal,
9781        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9782        doc: /* Internal use only.  */)
9783   (Lisp_Object coding_system)
9784 {
9785   struct coding_system *safe = &safe_terminal_coding;
9786   CHECK_SYMBOL (coding_system);
9787   setup_coding_system (Fcheck_coding_system (coding_system), safe);
9788   /* Composition annotations are switched off here.  */
9789   safe->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9790   safe->src_multibyte = 1;
9791   safe->dst_multibyte = 0;
9792   return Qnil;
9793 }
9794
9795 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9796        Sterminal_coding_system, 0, 1, 0,
9797        doc: /* Return coding system specified for terminal output on the given terminal.
9798 TERMINAL may be a terminal object, a frame, or nil for the selected
9799 frame's terminal device.  */)
9800   (Lisp_Object terminal)
9801 {
9802   struct terminal *tty = decode_live_terminal (terminal);
9803   struct coding_system *coding = TERMINAL_TERMINAL_CODING (tty);
9804   Lisp_Object name = CODING_ID_NAME (coding->id);
9805
9806   /* `undecided' is reported as nil for backward compatibility.  */
9807   return EQ (name, Qundecided) ? Qnil : name;
9808 }
9809
9810 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9811        Sset_keyboard_coding_system_internal, 1, 2, 0,
9812        doc: /* Internal use only.  */)
9813   (Lisp_Object coding_system, Lisp_Object terminal)
9814 {
9815   struct terminal *tty = decode_live_terminal (terminal);
9816   CHECK_SYMBOL (coding_system);
9817   /* nil stands for Qno_conversion; other symbols get validated.  */
9818   if (! NILP (coding_system))
9819     Fcheck_coding_system (coding_system);
9820   else
9821     coding_system = Qno_conversion;
9822   struct coding_system *keyboard = TERMINAL_KEYBOARD_CODING (tty);
9823   setup_coding_system (coding_system, keyboard);
9824   keyboard->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9825   return Qnil;
9826 }
9827
9828 DEFUN ("keyboard-coding-system",
9829        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9830        doc: /* Return coding system specified for decoding keyboard input.  */)
9831   (Lisp_Object terminal)
9832 {
9833   struct terminal *tty = decode_live_terminal (terminal);
9834   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING (tty)->id);
9835 }
9836
9837 \f
9838 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9839        Sfind_operation_coding_system,  1, MANY, 0,
9840        doc: /* Choose a coding system for an operation based on the target name.
9841 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9842 DECODING-SYSTEM is the coding system to use for decoding
9843 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9844 for encoding (in case OPERATION does encoding).
9845
9846 The first argument OPERATION specifies an I/O primitive:
9847   For file I/O, `insert-file-contents' or `write-region'.
9848   For process I/O, `call-process', `call-process-region', or `start-process'.
9849   For network I/O, `open-network-stream'.
9850
9851 The remaining arguments should be the same arguments that were passed
9852 to the primitive.  Depending on which primitive, one of those arguments
9853 is selected as the TARGET.  For example, if OPERATION does file I/O,
9854 whichever argument specifies the file name is TARGET.
9855
9856 TARGET has a meaning which depends on OPERATION:
9857   For file I/O, TARGET is a file name (except for the special case below).
9858   For process I/O, TARGET is a process name.
9859   For network I/O, TARGET is a service name or a port number.
9860
9861 This function looks up what is specified for TARGET in
9862 `file-coding-system-alist', `process-coding-system-alist',
9863 or `network-coding-system-alist' depending on OPERATION.
9864 They may specify a coding system, a cons of coding systems,
9865 or a function symbol to call.
9866 In the last case, we call the function with one argument,
9867 which is a list of all the arguments given to this function.
9868 If the function can't decide a coding system, it can return
9869 `undecided' so that the normal code-detection is performed.
9870
9871 If OPERATION is `insert-file-contents', the argument corresponding to
9872 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9873 file name to look up, and BUFFER is a buffer that contains the file's
9874 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9875 function to call for FILENAME, that function should examine the
9876 contents of BUFFER instead of reading the file.
9877
9878 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9879   (ptrdiff_t nargs, Lisp_Object *args)
9880 {
9881   Lisp_Object operation, target_idx, target, val;
9882   register Lisp_Object chain;
9883
       /* OPERATION must be followed by at least one argument.  */
9884   if (nargs < 2)
9885     error ("Too few arguments");
9886   operation = args[0];
       /* OPERATION must be a symbol whose Qtarget_idx property is a
          non-negative integer; that integer is the position of TARGET
          among the arguments that follow OPERATION.  */
9887   if (!SYMBOLP (operation)
9888       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9889     error ("Invalid first argument");
9890   if (nargs <= 1 + XFASTINT (target_idx))
9891     error ("Too few arguments for operation `%s'",
9892            SDATA (SYMBOL_NAME (operation)));
9893   target = args[XFASTINT (target_idx) + 1];
       /* TARGET must be a string.  Two exceptions are accepted: a cons of
          a string and a buffer for Qinsert_file_contents, and an integer
          or t for Qopen_network_stream.  */
9894   if (!(STRINGP (target)
9895         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9896             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9897         || (EQ (operation, Qopen_network_stream)
9898             && (INTEGERP (target) || EQ (target, Qt)))))
9899     error ("Invalid argument %"pI"d of operation `%s'",
9900            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
       /* Only the string in the car of a cons TARGET is used for lookup.  */
9901   if (CONSP (target))
9902     target = XCAR (target);
9903
       /* Select the alist to search: the file alist for the two file
          operations, the network alist for Qopen_network_stream, and the
          process alist for every other operation.  */
9904   chain = ((EQ (operation, Qinsert_file_contents)
9905             || EQ (operation, Qwrite_region))
9906            ? Vfile_coding_system_alist
9907            : (EQ (operation, Qopen_network_stream)
9908               ? Vnetwork_coding_system_alist
9909               : Vprocess_coding_system_alist));
9910   if (NILP (chain))
9911     return Qnil;
9912
       /* The first matching entry decides the result.  An entry matches
          when its car is a string that fast_string_match finds in a string
          TARGET, or when its car is EQ to an integer TARGET.  */
9913   for (; CONSP (chain); chain = XCDR (chain))
9914     {
9915       Lisp_Object elt;
9916
9917       elt = XCAR (chain);
9918       if (CONSP (elt)
9919           && ((STRINGP (target)
9920                && STRINGP (XCAR (elt))
9921                && fast_string_match (XCAR (elt), target) >= 0)
9922               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9923         {
9924           val = XCDR (elt);
9925           /* Here, if VAL is both a valid coding system and a valid
9926              function symbol, we return VAL as a coding system.  */
9927           if (CONSP (val))
9928             return val;
9929           if (! SYMBOLP (val))
9930             return Qnil;
9931           if (! NILP (Fcoding_system_p (val)))
9932             return Fcons (val, val);
9933           if (! NILP (Ffboundp (val)))
9934             {
9935               /* We use call1 rather than safe_call1
9936                  so as to get bug reports about functions called here
9937                  which don't handle the current interface.  */
9938               val = call1 (val, Flist (nargs, args));
9939               if (CONSP (val))
9940                 return val;
9941               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9942                 return Fcons (val, val);
9943             }
               /* The entry matched but gave no usable coding system.  */
9944           return Qnil;
9945         }
9946     }
9947   return Qnil;
9948 }
9949
9950 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9951        Sset_coding_system_priority, 0, MANY, 0,
9952        doc: /* Assign higher priority to the coding systems given as arguments.
9953 If multiple coding systems belong to the same category,
9954 all but the first one are ignored.
9955
9956 usage: (set-coding-system-priority &rest coding-systems)  */)
9957   (ptrdiff_t nargs, Lisp_Object *args)
9958 {
9959   ptrdiff_t i, j;
       /* CHANGED[C] is set once category C has been placed by an argument;
          PRIORITIES is the new category order being assembled.  */
9960   bool changed[coding_category_max];
9961   enum coding_category priorities[coding_category_max];
9962
9963   memset (changed, 0, sizeof changed);
9964
       /* First pass: I walks the arguments, J counts the categories
          placed so far at the front of PRIORITIES.  */
9965   for (i = j = 0; i < nargs; i++)
9966     {
9967       enum coding_category category;
9968       Lisp_Object spec, attrs;
9969
9970       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9971       attrs = AREF (spec, 0);
9972       category = XINT (CODING_ATTR_CATEGORY (attrs));
9973       if (changed[category])
9974         /* Ignore this coding system because a coding system of the
9975            same category already had a higher priority.  */
9976         continue;
9977       changed[category] = 1;
9978       priorities[j++] = category;
           /* Set the category's coding system up again only when one is
              already set up (id >= 0) under a different name.  */
9979       if (coding_categories[category].id >= 0
9980           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9981         setup_coding_system (args[i], &coding_categories[category]);
           /* Record the coding system as the value of the category's
              entry in Vcoding_category_table.  */
9982       Fset (AREF (Vcoding_category_table, category), args[i]);
9983     }
9984
9985   /* Now we have decided top J priorities.  Reflect the order of the
9986      original priorities to the remaining priorities.  */
9987
       /* From here I indexes the unfilled slots of PRIORITIES, and J is
          reused to walk the old order in coding_priorities.  */
9988   for (i = j, j = 0; i < coding_category_max; i++, j++)
9989     {
           /* Skip old entries whose category was already placed above.  */
9990       while (j < coding_category_max
9991              && changed[coding_priorities[j]])
9992         j++;
           /* Running out of old entries while slots remain is treated as
              an internal error.  */
9993       if (j == coding_category_max)
9994         emacs_abort ();
9995       priorities[i] = coding_priorities[j];
9996     }
9997
9998   memcpy (coding_priorities, priorities, sizeof priorities);
9999
10000   /* Update `coding-category-list'.  */
10001   Vcoding_category_list = Qnil;
        /* Cons from the last priority to the first so that the list ends
           up in priority order.  */
10002   for (i = coding_category_max; i-- > 0; )
10003     Vcoding_category_list
10004       = Fcons (AREF (Vcoding_category_table, priorities[i]),
10005                Vcoding_category_list);
10006
10007   return Qnil;
10008 }
10009
10010 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10011        Scoding_system_priority_list, 0, 1, 0,
10012        doc: /* Return a list of coding systems ordered by their priorities.
10013 The list contains a subset of coding systems; i.e. coding systems
10014 assigned to each coding category (see `coding-category-list').
10015
10016 HIGHESTP non-nil means just return the highest priority one.  */)
10017   (Lisp_Object highestp)
10018 {
10019   Lisp_Object names = Qnil;
10020
10021   for (int rank = 0; rank < coding_category_max; rank++)
10022     {
10023       int id = coding_categories[coding_priorities[rank]].id;
10024
10025       /* Categories whose coding system id is negative are skipped.  */
10026       if (id >= 0)
10027         {
10028           Lisp_Object base = CODING_ATTR_BASE_NAME (CODING_ID_ATTRS (id));
10029           if (! NILP (highestp))
10030             return base;
10031           names = Fcons (base, names);
10032         }
10033     }
10034   /* NAMES was built by consing, so it is in reverse priority order.  */
10035   return Fnreverse (names);
10036 }
10037
10038 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10039
10040 /* Return a vector of three symbols, each named by the name of the
10041    symbol BASE followed by one entry of `suffixes'.  */
10042 static Lisp_Object
10043 make_subsidiaries (Lisp_Object base)
10044 {
10045   ptrdiff_t stem_len = SBYTES (SYMBOL_NAME (base));
10046   USE_SAFE_ALLOCA;
10047   /* Room for the stem, the longest suffix ("-unix") and a NUL.  */
10048   char *name = SAFE_ALLOCA (stem_len + 6);
10049   memcpy (name, SDATA (SYMBOL_NAME (base)), stem_len);
10050   Lisp_Object result = make_uninit_vector (3);
10051   for (int k = 0; k < 3; k++)
10052     {
10053       strcpy (name + stem_len, suffixes[k]);
10054       ASET (result, k, intern (name));
10055     }
10056   SAFE_FREE ();
10057   return result;
10058 }
10059
10060
10061 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10062        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10063        doc: /* For internal use only.
10064 usage: (define-coding-system-internal ...)  */)
10065   (ptrdiff_t nargs, Lisp_Object *args)
10066 {
10067   Lisp_Object name;
10068   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10069   Lisp_Object attrs;            /* Vector of attributes.  */
10070   Lisp_Object eol_type;
10071   Lisp_Object aliases;
10072   Lisp_Object coding_type, charset_list, safe_charsets;
10073   enum coding_category category;
10074   Lisp_Object tail, val;
10075   int max_charset_id = 0;
10076   int i;
10077
10078   if (nargs < coding_arg_max)
10079     goto short_args;
10080
10081   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10082
10083   name = args[coding_arg_name];
10084   CHECK_SYMBOL (name);
10085   ASET (attrs, coding_attr_base_name, name);
10086
10087   val = args[coding_arg_mnemonic];
10088   if (! STRINGP (val))
10089     CHECK_CHARACTER (val);
10090   ASET (attrs, coding_attr_mnemonic, val);
10091
10092   coding_type = args[coding_arg_coding_type];
10093   CHECK_SYMBOL (coding_type);
10094   ASET (attrs, coding_attr_type, coding_type);
10095
10096   charset_list = args[coding_arg_charset_list];
10097   if (SYMBOLP (charset_list))
10098     {
10099       if (EQ (charset_list, Qiso_2022))
10100         {
10101           if (! EQ (coding_type, Qiso_2022))
10102             error ("Invalid charset-list");
10103           charset_list = Viso_2022_charset_list;
10104         }
10105       else if (EQ (charset_list, Qemacs_mule))
10106         {
10107           if (! EQ (coding_type, Qemacs_mule))
10108             error ("Invalid charset-list");
10109           charset_list = Vemacs_mule_charset_list;
10110         }
10111       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10112         {
10113           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10114             error ("Invalid charset-list");
10115           if (max_charset_id < XFASTINT (XCAR (tail)))
10116             max_charset_id = XFASTINT (XCAR (tail));
10117         }
10118     }
10119   else
10120     {
10121       charset_list = Fcopy_sequence (charset_list);
10122       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10123         {
10124           struct charset *charset;
10125
10126           val = XCAR (tail);
10127           CHECK_CHARSET_GET_CHARSET (val, charset);
10128           if (EQ (coding_type, Qiso_2022)
10129               ? CHARSET_ISO_FINAL (charset) < 0
10130               : EQ (coding_type, Qemacs_mule)
10131               ? CHARSET_EMACS_MULE_ID (charset) < 0
10132               : 0)
10133             error ("Can't handle charset `%s'",
10134                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10135
10136           XSETCAR (tail, make_number (charset->id));
10137           if (max_charset_id < charset->id)
10138             max_charset_id = charset->id;
10139         }
10140     }
10141   ASET (attrs, coding_attr_charset_list, charset_list);
10142
10143   safe_charsets = make_uninit_string (max_charset_id + 1);
10144   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10145   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10146     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10147   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10148
10149   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10150
10151   val = args[coding_arg_decode_translation_table];
10152   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10153     CHECK_SYMBOL (val);
10154   ASET (attrs, coding_attr_decode_tbl, val);
10155
10156   val = args[coding_arg_encode_translation_table];
10157   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10158     CHECK_SYMBOL (val);
10159   ASET (attrs, coding_attr_encode_tbl, val);
10160
10161   val = args[coding_arg_post_read_conversion];
10162   CHECK_SYMBOL (val);
10163   ASET (attrs, coding_attr_post_read, val);
10164
10165   val = args[coding_arg_pre_write_conversion];
10166   CHECK_SYMBOL (val);
10167   ASET (attrs, coding_attr_pre_write, val);
10168
10169   val = args[coding_arg_default_char];
10170   if (NILP (val))
10171     ASET (attrs, coding_attr_default_char, make_number (' '));
10172   else
10173     {
10174       CHECK_CHARACTER (val);
10175       ASET (attrs, coding_attr_default_char, val);
10176     }
10177
10178   val = args[coding_arg_for_unibyte];
10179   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10180
10181   val = args[coding_arg_plist];
10182   CHECK_LIST (val);
10183   ASET (attrs, coding_attr_plist, val);
10184
10185   if (EQ (coding_type, Qcharset))
10186     {
10187       /* Generate a lisp vector of 256 elements.  Each element is nil,
10188          integer, or a list of charset IDs.
10189
10190          If Nth element is nil, the byte code N is invalid in this
10191          coding system.
10192
10193          If Nth element is a number NUM, N is the first byte of a
10194          charset whose ID is NUM.
10195
10196          If Nth element is a list of charset IDs, N is the first byte
10197          of one of them.  The list is sorted by dimensions of the
10198          charsets.  A charset of smaller dimension comes first. */
10199       val = Fmake_vector (make_number (256), Qnil);
10200
10201       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10202         {
10203           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10204           int dim = CHARSET_DIMENSION (charset);
10205           int idx = (dim - 1) * 4;
10206
10207           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10208             ASET (attrs, coding_attr_ascii_compat, Qt);
10209
10210           for (i = charset->code_space[idx];
10211                i <= charset->code_space[idx + 1]; i++)
10212             {
10213               Lisp_Object tmp, tmp2;
10214               int dim2;
10215
10216               tmp = AREF (val, i);
10217               if (NILP (tmp))
10218                 tmp = XCAR (tail);
10219               else if (NUMBERP (tmp))
10220                 {
10221                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10222                   if (dim < dim2)
10223                     tmp = list2 (XCAR (tail), tmp);
10224                   else
10225                     tmp = list2 (tmp, XCAR (tail));
10226                 }
10227               else
10228                 {
10229                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10230                     {
10231                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10232                       if (dim < dim2)
10233                         break;
10234                     }
10235                   if (NILP (tmp2))
10236                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10237                   else
10238                     {
10239                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10240                       XSETCAR (tmp2, XCAR (tail));
10241                     }
10242                 }
10243               ASET (val, i, tmp);
10244             }
10245         }
10246       ASET (attrs, coding_attr_charset_valids, val);
10247       category = coding_category_charset;
10248     }
10249   else if (EQ (coding_type, Qccl))
10250     {
10251       Lisp_Object valids;
10252
10253       if (nargs < coding_arg_ccl_max)
10254         goto short_args;
10255
10256       val = args[coding_arg_ccl_decoder];
10257       CHECK_CCL_PROGRAM (val);
10258       if (VECTORP (val))
10259         val = Fcopy_sequence (val);
10260       ASET (attrs, coding_attr_ccl_decoder, val);
10261
10262       val = args[coding_arg_ccl_encoder];
10263       CHECK_CCL_PROGRAM (val);
10264       if (VECTORP (val))
10265         val = Fcopy_sequence (val);
10266       ASET (attrs, coding_attr_ccl_encoder, val);
10267
10268       val = args[coding_arg_ccl_valids];
10269       valids = Fmake_string (make_number (256), make_number (0), Qnil);
10270       for (tail = val; CONSP (tail); tail = XCDR (tail))
10271         {
10272           int from, to;
10273
10274           val = XCAR (tail);
10275           if (INTEGERP (val))
10276             {
10277               if (! (0 <= XINT (val) && XINT (val) <= 255))
10278                 args_out_of_range_3 (val, make_number (0), make_number (255));
10279               from = to = XINT (val);
10280             }
10281           else
10282             {
10283               CHECK_CONS (val);
10284               CHECK_NATNUM_CAR (val);
10285               CHECK_NUMBER_CDR (val);
10286               if (XINT (XCAR (val)) > 255)
10287                 args_out_of_range_3 (XCAR (val),
10288                                      make_number (0), make_number (255));
10289               from = XINT (XCAR (val));
10290               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10291                 args_out_of_range_3 (XCDR (val),
10292                                      XCAR (val), make_number (255));
10293               to = XINT (XCDR (val));
10294             }
10295           for (i = from; i <= to; i++)
10296             SSET (valids, i, 1);
10297         }
10298       ASET (attrs, coding_attr_ccl_valids, valids);
10299
10300       category = coding_category_ccl;
10301     }
10302   else if (EQ (coding_type, Qutf_16))
10303     {
10304       Lisp_Object bom, endian;
10305
10306       ASET (attrs, coding_attr_ascii_compat, Qnil);
10307
10308       if (nargs < coding_arg_utf16_max)
10309         goto short_args;
10310
10311       bom = args[coding_arg_utf16_bom];
10312       if (! NILP (bom) && ! EQ (bom, Qt))
10313         {
10314           CHECK_CONS (bom);
10315           val = XCAR (bom);
10316           CHECK_CODING_SYSTEM (val);
10317           val = XCDR (bom);
10318           CHECK_CODING_SYSTEM (val);
10319         }
10320       ASET (attrs, coding_attr_utf_bom, bom);
10321
10322       endian = args[coding_arg_utf16_endian];
10323       CHECK_SYMBOL (endian);
10324       if (NILP (endian))
10325         endian = Qbig;
10326       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10327         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10328       ASET (attrs, coding_attr_utf_16_endian, endian);
10329
10330       category = (CONSP (bom)
10331                   ? coding_category_utf_16_auto
10332                   : NILP (bom)
10333                   ? (EQ (endian, Qbig)
10334                      ? coding_category_utf_16_be_nosig
10335                      : coding_category_utf_16_le_nosig)
10336                   : (EQ (endian, Qbig)
10337                      ? coding_category_utf_16_be
10338                      : coding_category_utf_16_le));
10339     }
10340   else if (EQ (coding_type, Qiso_2022))
10341     {
10342       Lisp_Object initial, reg_usage, request, flags;
10343
10344       if (nargs < coding_arg_iso2022_max)
10345         goto short_args;
10346
10347       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10348       CHECK_VECTOR (initial);
10349       for (i = 0; i < 4; i++)
10350         {
10351           val = AREF (initial, i);
10352           if (! NILP (val))
10353             {
10354               struct charset *charset;
10355
10356               CHECK_CHARSET_GET_CHARSET (val, charset);
10357               ASET (initial, i, make_number (CHARSET_ID (charset)));
10358               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10359                 ASET (attrs, coding_attr_ascii_compat, Qt);
10360             }
10361           else
10362             ASET (initial, i, make_number (-1));
10363         }
10364
10365       reg_usage = args[coding_arg_iso2022_reg_usage];
10366       CHECK_CONS (reg_usage);
10367       CHECK_NUMBER_CAR (reg_usage);
10368       CHECK_NUMBER_CDR (reg_usage);
10369
10370       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10371       for (tail = request; CONSP (tail); tail = XCDR (tail))
10372         {
10373           int id;
10374           Lisp_Object tmp1;
10375
10376           val = XCAR (tail);
10377           CHECK_CONS (val);
10378           tmp1 = XCAR (val);
10379           CHECK_CHARSET_GET_ID (tmp1, id);
10380           CHECK_NATNUM_CDR (val);
10381           if (XINT (XCDR (val)) >= 4)
10382             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10383           XSETCAR (val, make_number (id));
10384         }
10385
10386       flags = args[coding_arg_iso2022_flags];
10387       CHECK_NATNUM (flags);
10388       i = XINT (flags) & INT_MAX;
10389       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10390         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10391       flags = make_number (i);
10392
10393       ASET (attrs, coding_attr_iso_initial, initial);
10394       ASET (attrs, coding_attr_iso_usage, reg_usage);
10395       ASET (attrs, coding_attr_iso_request, request);
10396       ASET (attrs, coding_attr_iso_flags, flags);
10397       setup_iso_safe_charsets (attrs);
10398
10399       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10400         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10401                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10402                     ? coding_category_iso_7_else
10403                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10404                     ? coding_category_iso_7
10405                     : coding_category_iso_7_tight);
10406       else
10407         {
10408           int id = XINT (AREF (initial, 1));
10409
10410           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10411                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10412                        || id < 0)
10413                       ? coding_category_iso_8_else
10414                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10415                       ? coding_category_iso_8_1
10416                       : coding_category_iso_8_2);
10417         }
10418       if (category != coding_category_iso_8_1
10419           && category != coding_category_iso_8_2)
10420         ASET (attrs, coding_attr_ascii_compat, Qnil);
10421     }
10422   else if (EQ (coding_type, Qemacs_mule))
10423     {
10424       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10425         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10426       ASET (attrs, coding_attr_ascii_compat, Qt);
10427       category = coding_category_emacs_mule;
10428     }
10429   else if (EQ (coding_type, Qshift_jis))
10430     {
10431
10432       struct charset *charset;
10433
10434       if (XINT (Flength (charset_list)) != 3
10435           && XINT (Flength (charset_list)) != 4)
10436         error ("There should be three or four charsets");
10437
10438       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10439       if (CHARSET_DIMENSION (charset) != 1)
10440         error ("Dimension of charset %s is not one",
10441                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10442       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10443         ASET (attrs, coding_attr_ascii_compat, Qt);
10444
10445       charset_list = XCDR (charset_list);
10446       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10447       if (CHARSET_DIMENSION (charset) != 1)
10448         error ("Dimension of charset %s is not one",
10449                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10450
10451       charset_list = XCDR (charset_list);
10452       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10453       if (CHARSET_DIMENSION (charset) != 2)
10454         error ("Dimension of charset %s is not two",
10455                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10456
10457       charset_list = XCDR (charset_list);
10458       if (! NILP (charset_list))
10459         {
10460           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10461           if (CHARSET_DIMENSION (charset) != 2)
10462             error ("Dimension of charset %s is not two",
10463                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10464         }
10465
10466       category = coding_category_sjis;
10467       Vsjis_coding_system = name;
10468     }
10469   else if (EQ (coding_type, Qbig5))
10470     {
10471       struct charset *charset;
10472
10473       if (XINT (Flength (charset_list)) != 2)
10474         error ("There should be just two charsets");
10475
10476       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10477       if (CHARSET_DIMENSION (charset) != 1)
10478         error ("Dimension of charset %s is not one",
10479                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10480       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10481         ASET (attrs, coding_attr_ascii_compat, Qt);
10482
10483       charset_list = XCDR (charset_list);
10484       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10485       if (CHARSET_DIMENSION (charset) != 2)
10486         error ("Dimension of charset %s is not two",
10487                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10488
10489       category = coding_category_big5;
10490       Vbig5_coding_system = name;
10491     }
10492   else if (EQ (coding_type, Qraw_text))
10493     {
10494       category = coding_category_raw_text;
10495       ASET (attrs, coding_attr_ascii_compat, Qt);
10496     }
10497   else if (EQ (coding_type, Qutf_8))
10498     {
10499       Lisp_Object bom;
10500
10501       if (nargs < coding_arg_utf8_max)
10502         goto short_args;
10503
10504       bom = args[coding_arg_utf8_bom];
10505       if (! NILP (bom) && ! EQ (bom, Qt))
10506         {
10507           CHECK_CONS (bom);
10508           val = XCAR (bom);
10509           CHECK_CODING_SYSTEM (val);
10510           val = XCDR (bom);
10511           CHECK_CODING_SYSTEM (val);
10512         }
10513       ASET (attrs, coding_attr_utf_bom, bom);
10514       if (NILP (bom))
10515         ASET (attrs, coding_attr_ascii_compat, Qt);
10516
10517       category = (CONSP (bom) ? coding_category_utf_8_auto
10518                   : NILP (bom) ? coding_category_utf_8_nosig
10519                   : coding_category_utf_8_sig);
10520     }
10521   else if (EQ (coding_type, Qundecided))
10522     {
10523       if (nargs < coding_arg_undecided_max)
10524         goto short_args;
10525       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10526             args[coding_arg_undecided_inhibit_null_byte_detection]);
10527       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10528             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10529       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10530             args[coding_arg_undecided_prefer_utf_8]);
10531       category = coding_category_undecided;
10532     }
10533   else
10534     error ("Invalid coding system type: %s",
10535            SDATA (SYMBOL_NAME (coding_type)));
10536
10537   ASET (attrs, coding_attr_category, make_number (category));
10538   ASET (attrs, coding_attr_plist,
10539         Fcons (QCcategory,
10540                Fcons (AREF (Vcoding_category_table, category),
10541                       CODING_ATTR_PLIST (attrs))));
10542   ASET (attrs, coding_attr_plist,
10543         Fcons (QCascii_compatible_p,
10544                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10545                       CODING_ATTR_PLIST (attrs))));
10546
10547   eol_type = args[coding_arg_eol_type];
10548   if (! NILP (eol_type)
10549       && ! EQ (eol_type, Qunix)
10550       && ! EQ (eol_type, Qdos)
10551       && ! EQ (eol_type, Qmac))
10552     error ("Invalid eol-type");
10553
10554   aliases = list1 (name);
10555
10556   if (NILP (eol_type))
10557     {
10558       eol_type = make_subsidiaries (name);
10559       for (i = 0; i < 3; i++)
10560         {
10561           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10562
10563           this_name = AREF (eol_type, i);
10564           this_aliases = list1 (this_name);
10565           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10566           this_spec = make_uninit_vector (3);
10567           ASET (this_spec, 0, attrs);
10568           ASET (this_spec, 1, this_aliases);
10569           ASET (this_spec, 2, this_eol_type);
10570           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10571           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10572           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist, Qnil);
10573           if (NILP (val))
10574             Vcoding_system_alist
10575               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10576                        Vcoding_system_alist);
10577         }
10578     }
10579
10580   spec_vec = make_uninit_vector (3);
10581   ASET (spec_vec, 0, attrs);
10582   ASET (spec_vec, 1, aliases);
10583   ASET (spec_vec, 2, eol_type);
10584
10585   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10586   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10587   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist, Qnil);
10588   if (NILP (val))
10589     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10590                                   Vcoding_system_alist);
10591
10592   {
10593     int id = coding_categories[category].id;
10594
10595     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10596       setup_coding_system (name, &coding_categories[category]);
10597   }
10598
10599   return Qnil;
10600
10601  short_args:
10602   Fsignal (Qwrong_number_of_arguments,
10603            Fcons (intern ("define-coding-system-internal"),
10604                   make_number (nargs)));
10605 }
10606
10607
10608 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10609        3, 3, 0,
10610        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10611   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10612 {
10613   Lisp_Object spec, attrs;
10614
10615   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10616   attrs = AREF (spec, 0);
10617   if (EQ (prop, QCmnemonic))
10618     {
10619       if (! STRINGP (val))
10620         CHECK_CHARACTER (val);
10621       ASET (attrs, coding_attr_mnemonic, val);
10622     }
10623   else if (EQ (prop, QCdefault_char))
10624     {
10625       if (NILP (val))
10626         val = make_number (' ');
10627       else
10628         CHECK_CHARACTER (val);
10629       ASET (attrs, coding_attr_default_char, val);
10630     }
10631   else if (EQ (prop, QCdecode_translation_table))
10632     {
10633       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10634         CHECK_SYMBOL (val);
10635       ASET (attrs, coding_attr_decode_tbl, val);
10636     }
10637   else if (EQ (prop, QCencode_translation_table))
10638     {
10639       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10640         CHECK_SYMBOL (val);
10641       ASET (attrs, coding_attr_encode_tbl, val);
10642     }
10643   else if (EQ (prop, QCpost_read_conversion))
10644     {
10645       CHECK_SYMBOL (val);
10646       ASET (attrs, coding_attr_post_read, val);
10647     }
10648   else if (EQ (prop, QCpre_write_conversion))
10649     {
10650       CHECK_SYMBOL (val);
10651       ASET (attrs, coding_attr_pre_write, val);
10652     }
10653   else if (EQ (prop, QCascii_compatible_p))
10654     {
10655       ASET (attrs, coding_attr_ascii_compat, val);
10656     }
10657
10658   ASET (attrs, coding_attr_plist,
10659         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10660   return val;
10661 }
10662
10663
10664 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10665        Sdefine_coding_system_alias, 2, 2, 0,
10666        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10667   (Lisp_Object alias, Lisp_Object coding_system)
10668 {
10669   Lisp_Object spec, aliases, eol_type, val;
10670
10671   CHECK_SYMBOL (alias);
10672   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10673   aliases = AREF (spec, 1);
10674   /* ALIASES should be a list of length more than zero, and the first
10675      element is a base coding system.  Append ALIAS at the tail of the
10676      list.  */
10677   while (!NILP (XCDR (aliases)))
10678     aliases = XCDR (aliases);
10679   XSETCDR (aliases, list1 (alias));
10680
10681   eol_type = AREF (spec, 2);
10682   if (VECTORP (eol_type))
10683     {
10684       Lisp_Object subsidiaries;
10685       int i;
10686
10687       subsidiaries = make_subsidiaries (alias);
10688       for (i = 0; i < 3; i++)
10689         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10690                                      AREF (eol_type, i));
10691     }
10692
10693   Fputhash (alias, spec, Vcoding_system_hash_table);
10694   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10695   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist, Qnil);
10696   if (NILP (val))
10697     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10698                                   Vcoding_system_alist);
10699
10700   return Qnil;
10701 }
10702
10703 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10704        1, 1, 0,
10705        doc: /* Return the base of CODING-SYSTEM.
10706 Any alias or subsidiary coding system is not a base coding system.  */)
10707   (Lisp_Object coding_system)
10708 {
10709   Lisp_Object spec, attrs;
10710
10711   if (NILP (coding_system))
10712     return (Qno_conversion);
10713   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10714   attrs = AREF (spec, 0);
10715   return CODING_ATTR_BASE_NAME (attrs);
10716 }
10717
10718 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10719        1, 1, 0,
10720        doc: /* Return the property list of CODING-SYSTEM.  */)
10721   (Lisp_Object coding_system)
10722 {
10723   Lisp_Object spec, attrs;
10724
10725   if (NILP (coding_system))
10726     coding_system = Qno_conversion;
10727   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10728   attrs = AREF (spec, 0);
10729   return CODING_ATTR_PLIST (attrs);
10730 }
10731
10732
10733 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10734        1, 1, 0,
10735        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10736   (Lisp_Object coding_system)
10737 {
10738   Lisp_Object spec;
10739
10740   if (NILP (coding_system))
10741     coding_system = Qno_conversion;
10742   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10743   return AREF (spec, 1);
10744 }
10745
10746 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10747        Scoding_system_eol_type, 1, 1, 0,
10748        doc: /* Return eol-type of CODING-SYSTEM.
10749 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10750
10751 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10752 and CR respectively.
10753
10754 A vector value indicates that a format of end-of-line should be
10755 detected automatically.  Nth element of the vector is the subsidiary
10756 coding system whose eol-type is N.  */)
10757   (Lisp_Object coding_system)
10758 {
10759   Lisp_Object spec, eol_type;
10760   int n;
10761
10762   if (NILP (coding_system))
10763     coding_system = Qno_conversion;
10764   if (! CODING_SYSTEM_P (coding_system))
10765     return Qnil;
10766   spec = CODING_SYSTEM_SPEC (coding_system);
10767   eol_type = AREF (spec, 2);
10768   if (VECTORP (eol_type))
10769     return Fcopy_sequence (eol_type);
10770   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10771   return make_number (n);
10772 }
10773
10774 #endif /* emacs */
10775
10776 \f
10777 /*** 9. Post-amble ***/
10778
10779 void
10780 init_coding_once (void)
10781 {
10782   int i;
10783
10784   for (i = 0; i < coding_category_max; i++)
10785     {
10786       coding_categories[i].id = -1;
10787       coding_priorities[i] = i;
10788     }
10789
10790   /* ISO2022 specific initialize routine.  */
10791   for (i = 0; i < 0x20; i++)
10792     iso_code_class[i] = ISO_control_0;
10793   for (i = 0x21; i < 0x7F; i++)
10794     iso_code_class[i] = ISO_graphic_plane_0;
10795   for (i = 0x80; i < 0xA0; i++)
10796     iso_code_class[i] = ISO_control_1;
10797   for (i = 0xA1; i < 0xFF; i++)
10798     iso_code_class[i] = ISO_graphic_plane_1;
10799   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10800   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10801   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10802   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10803   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10804   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10805   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10806   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10807   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10808
10809   for (i = 0; i < 256; i++)
10810     {
10811       emacs_mule_bytes[i] = 1;
10812     }
10813   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10814   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10815   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10816   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10817 }
10818
10819 #ifdef emacs
10820
10821 void
10822 syms_of_coding (void)
10823 {
10824   staticpro (&Vcoding_system_hash_table);
10825   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10826
10827   staticpro (&Vsjis_coding_system);
10828   Vsjis_coding_system = Qnil;
10829
10830   staticpro (&Vbig5_coding_system);
10831   Vbig5_coding_system = Qnil;
10832
10833   staticpro (&Vcode_conversion_reused_workbuf);
10834   Vcode_conversion_reused_workbuf = Qnil;
10835
10836   staticpro (&Vcode_conversion_workbuf_name);
10837   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10838
10839   reused_workbuf_in_use = 0;
10840
10841   DEFSYM (Qcharset, "charset");
10842   DEFSYM (Qtarget_idx, "target-idx");
10843   DEFSYM (Qcoding_system_history, "coding-system-history");
10844   Fset (Qcoding_system_history, Qnil);
10845
10846   /* Target FILENAME is the first argument.  */
10847   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10848   /* Target FILENAME is the third argument.  */
10849   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10850
10851   DEFSYM (Qcall_process, "call-process");
10852   /* Target PROGRAM is the first argument.  */
10853   Fput (Qcall_process, Qtarget_idx, make_number (0));
10854
10855   DEFSYM (Qcall_process_region, "call-process-region");
10856   /* Target PROGRAM is the third argument.  */
10857   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10858
10859   DEFSYM (Qstart_process, "start-process");
10860   /* Target PROGRAM is the third argument.  */
10861   Fput (Qstart_process, Qtarget_idx, make_number (2));
10862
10863   DEFSYM (Qopen_network_stream, "open-network-stream");
10864   /* Target SERVICE is the fourth argument.  */
10865   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10866
10867   DEFSYM (Qunix, "unix");
10868   DEFSYM (Qdos, "dos");
10869   DEFSYM (Qmac, "mac");
10870
10871   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10872   DEFSYM (Qundecided, "undecided");
10873   DEFSYM (Qno_conversion, "no-conversion");
10874   DEFSYM (Qraw_text, "raw-text");
10875
10876   DEFSYM (Qiso_2022, "iso-2022");
10877
10878   DEFSYM (Qutf_8, "utf-8");
10879   DEFSYM (Qutf_8_unix, "utf-8-unix");
10880   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10881
10882 #if defined (WINDOWSNT) || defined (CYGWIN)
10883   /* No, not utf-16-le: that one has a BOM.  */
10884   DEFSYM (Qutf_16le, "utf-16le");
10885 #endif
10886
10887   DEFSYM (Qutf_16, "utf-16");
10888   DEFSYM (Qbig, "big");
10889   DEFSYM (Qlittle, "little");
10890
10891   DEFSYM (Qshift_jis, "shift-jis");
10892   DEFSYM (Qbig5, "big5");
10893
10894   DEFSYM (Qcoding_system_p, "coding-system-p");
10895
10896   /* Error signaled when there's a problem with detecting a coding system.  */
10897   DEFSYM (Qcoding_system_error, "coding-system-error");
10898   Fput (Qcoding_system_error, Qerror_conditions,
10899         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10900   Fput (Qcoding_system_error, Qerror_message,
10901         build_pure_c_string ("Invalid coding system"));
10902
10903   DEFSYM (Qtranslation_table, "translation-table");
10904   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10905   DEFSYM (Qtranslation_table_id, "translation-table-id");
10906
10907   /* Coding system emacs-mule and raw-text are for converting only
10908      end-of-line format.  */
10909   DEFSYM (Qemacs_mule, "emacs-mule");
10910
10911   DEFSYM (QCcategory, ":category");
10912   DEFSYM (QCmnemonic, ":mnemonic");
10913   DEFSYM (QCdefault_char, ":default-char");
10914   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10915   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10916   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10917   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10918   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10919
10920   Vcoding_category_table
10921     = Fmake_vector (make_number (coding_category_max), Qnil);
10922   staticpro (&Vcoding_category_table);
10923   /* Followings are target of code detection.  */
10924   ASET (Vcoding_category_table, coding_category_iso_7,
10925         intern_c_string ("coding-category-iso-7"));
10926   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10927         intern_c_string ("coding-category-iso-7-tight"));
10928   ASET (Vcoding_category_table, coding_category_iso_8_1,
10929         intern_c_string ("coding-category-iso-8-1"));
10930   ASET (Vcoding_category_table, coding_category_iso_8_2,
10931         intern_c_string ("coding-category-iso-8-2"));
10932   ASET (Vcoding_category_table, coding_category_iso_7_else,
10933         intern_c_string ("coding-category-iso-7-else"));
10934   ASET (Vcoding_category_table, coding_category_iso_8_else,
10935         intern_c_string ("coding-category-iso-8-else"));
10936   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10937         intern_c_string ("coding-category-utf-8-auto"));
10938   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10939         intern_c_string ("coding-category-utf-8"));
10940   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10941         intern_c_string ("coding-category-utf-8-sig"));
10942   ASET (Vcoding_category_table, coding_category_utf_16_be,
10943         intern_c_string ("coding-category-utf-16-be"));
10944   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10945         intern_c_string ("coding-category-utf-16-auto"));
10946   ASET (Vcoding_category_table, coding_category_utf_16_le,
10947         intern_c_string ("coding-category-utf-16-le"));
10948   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10949         intern_c_string ("coding-category-utf-16-be-nosig"));
10950   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10951         intern_c_string ("coding-category-utf-16-le-nosig"));
10952   ASET (Vcoding_category_table, coding_category_charset,
10953         intern_c_string ("coding-category-charset"));
10954   ASET (Vcoding_category_table, coding_category_sjis,
10955         intern_c_string ("coding-category-sjis"));
10956   ASET (Vcoding_category_table, coding_category_big5,
10957         intern_c_string ("coding-category-big5"));
10958   ASET (Vcoding_category_table, coding_category_ccl,
10959         intern_c_string ("coding-category-ccl"));
10960   ASET (Vcoding_category_table, coding_category_emacs_mule,
10961         intern_c_string ("coding-category-emacs-mule"));
10962   /* Followings are NOT target of code detection.  */
10963   ASET (Vcoding_category_table, coding_category_raw_text,
10964         intern_c_string ("coding-category-raw-text"));
10965   ASET (Vcoding_category_table, coding_category_undecided,
10966         intern_c_string ("coding-category-undecided"));
10967
10968   DEFSYM (Qinsufficient_source, "insufficient-source");
10969   DEFSYM (Qinvalid_source, "invalid-source");
10970   DEFSYM (Qinterrupted, "interrupted");
10971
10972   /* If a symbol has this property, evaluate the value to define the
10973      symbol as a coding system.  */
10974   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10975
10976   defsubr (&Scoding_system_p);
10977   defsubr (&Sread_coding_system);
10978   defsubr (&Sread_non_nil_coding_system);
10979   defsubr (&Scheck_coding_system);
10980   defsubr (&Sdetect_coding_region);
10981   defsubr (&Sdetect_coding_string);
10982   defsubr (&Sfind_coding_systems_region_internal);
10983   defsubr (&Sunencodable_char_position);
10984   defsubr (&Scheck_coding_systems_region);
10985   defsubr (&Sdecode_coding_region);
10986   defsubr (&Sencode_coding_region);
10987   defsubr (&Sdecode_coding_string);
10988   defsubr (&Sencode_coding_string);
10989   defsubr (&Sdecode_sjis_char);
10990   defsubr (&Sencode_sjis_char);
10991   defsubr (&Sdecode_big5_char);
10992   defsubr (&Sencode_big5_char);
10993   defsubr (&Sset_terminal_coding_system_internal);
10994   defsubr (&Sset_safe_terminal_coding_system_internal);
10995   defsubr (&Sterminal_coding_system);
10996   defsubr (&Sset_keyboard_coding_system_internal);
10997   defsubr (&Skeyboard_coding_system);
10998   defsubr (&Sfind_operation_coding_system);
10999   defsubr (&Sset_coding_system_priority);
11000   defsubr (&Sdefine_coding_system_internal);
11001   defsubr (&Sdefine_coding_system_alias);
11002   defsubr (&Scoding_system_put);
11003   defsubr (&Scoding_system_base);
11004   defsubr (&Scoding_system_plist);
11005   defsubr (&Scoding_system_aliases);
11006   defsubr (&Scoding_system_eol_type);
11007   defsubr (&Scoding_system_priority_list);
11008
11009   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11010                doc: /* List of coding systems.
11011
11012 Do not alter the value of this variable manually.  This variable should be
11013 updated by the functions `define-coding-system' and
11014 `define-coding-system-alias'.  */);
11015   Vcoding_system_list = Qnil;
11016
11017   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11018                doc: /* Alist of coding system names.
11019 Each element is one element list of coding system name.
11020 This variable is given to `completing-read' as COLLECTION argument.
11021
11022 Do not alter the value of this variable manually.  This variable should be
11023 updated by the functions `make-coding-system' and
11024 `define-coding-system-alias'.  */);
11025   Vcoding_system_alist = Qnil;
11026
11027   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11028                doc: /* List of coding-categories (symbols) ordered by priority.
11029
11030 On detecting a coding system, Emacs tries code detection algorithms
11031 associated with each coding-category one by one in this order.  When
11032 one algorithm agrees with a byte sequence of source text, the coding
11033 system bound to the corresponding coding-category is selected.
11034
11035 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11036   {
11037     int i;
11038
11039     Vcoding_category_list = Qnil;
11040     for (i = coding_category_max - 1; i >= 0; i--)
11041       Vcoding_category_list
11042         = Fcons (AREF (Vcoding_category_table, i),
11043                  Vcoding_category_list);
11044   }
11045
11046   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11047                doc: /* Specify the coding system for read operations.
11048 It is useful to bind this variable with `let', but do not set it globally.
11049 If the value is a coding system, it is used for decoding on read operation.
11050 If not, an appropriate element is used from one of the coding system alists.
11051 There are three such tables: `file-coding-system-alist',
11052 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11053   Vcoding_system_for_read = Qnil;
11054
11055   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11056                doc: /* Specify the coding system for write operations.
11057 Programs bind this variable with `let', but you should not set it globally.
11058 If the value is a coding system, it is used for encoding of output,
11059 when writing it to a file and when sending it to a file or subprocess.
11060
11061 If this does not specify a coding system, an appropriate element
11062 is used from one of the coding system alists.
11063 There are three such tables: `file-coding-system-alist',
11064 `process-coding-system-alist', and `network-coding-system-alist'.
11065 For output to files, if the above procedure does not specify a coding system,
11066 the value of `buffer-file-coding-system' is used.  */);
11067   Vcoding_system_for_write = Qnil;
11068
11069   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11070                doc: /*
11071 Coding system used in the latest file or process I/O.  */);
11072   Vlast_coding_system_used = Qnil;
11073
11074   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11075                doc: /*
11076 Error status of the last code conversion.
11077
11078 When an error was detected in the last code conversion, this variable
11079 is set to one of the following symbols.
11080   `insufficient-source'
11081   `inconsistent-eol'
11082   `invalid-source'
11083   `interrupted'
11084   `insufficient-memory'
11085 When no error was detected, the value doesn't change.  So, to check
11086 the error status of a code conversion by this variable, you must
11087 explicitly set this variable to nil before performing code
11088 conversion.  */);
11089   Vlast_code_conversion_error = Qnil;
11090
11091   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11092                doc: /*
11093 Non-nil means always inhibit code conversion of end-of-line format.
11094 See info node `Coding Systems' and info node `Text and Binary' concerning
11095 such conversion.  */);
11096   inhibit_eol_conversion = 0;
11097
11098   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11099                doc: /*
11100 Non-nil means process buffer inherits coding system of process output.
11101 Bind it to t if the process output is to be treated as if it were a file
11102 read from some filesystem.  */);
11103   inherit_process_coding_system = 0;
11104
11105   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11106                doc: /*
11107 Alist to decide a coding system to use for a file I/O operation.
11108 The format is ((PATTERN . VAL) ...),
11109 where PATTERN is a regular expression matching a file name,
11110 VAL is a coding system, a cons of coding systems, or a function symbol.
11111 If VAL is a coding system, it is used for both decoding and encoding
11112 the file contents.
11113 If VAL is a cons of coding systems, the car part is used for decoding,
11114 and the cdr part is used for encoding.
11115 If VAL is a function symbol, the function must return a coding system
11116 or a cons of coding systems which are used as above.  The function is
11117 called with an argument that is a list of the arguments with which
11118 `find-operation-coding-system' was called.  If the function can't decide
11119 a coding system, it can return `undecided' so that the normal
11120 code-detection is performed.
11121
11122 See also the function `find-operation-coding-system'
11123 and the variable `auto-coding-alist'.  */);
11124   Vfile_coding_system_alist = Qnil;
11125
11126   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11127                doc: /*
11128 Alist to decide a coding system to use for a process I/O operation.
11129 The format is ((PATTERN . VAL) ...),
11130 where PATTERN is a regular expression matching a program name,
11131 VAL is a coding system, a cons of coding systems, or a function symbol.
11132 If VAL is a coding system, it is used for both decoding what received
11133 from the program and encoding what sent to the program.
11134 If VAL is a cons of coding systems, the car part is used for decoding,
11135 and the cdr part is used for encoding.
11136 If VAL is a function symbol, the function must return a coding system
11137 or a cons of coding systems which are used as above.
11138
11139 See also the function `find-operation-coding-system'.  */);
11140   Vprocess_coding_system_alist = Qnil;
11141
11142   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11143                doc: /*
11144 Alist to decide a coding system to use for a network I/O operation.
11145 The format is ((PATTERN . VAL) ...),
11146 where PATTERN is a regular expression matching a network service name
11147 or is a port number to connect to,
11148 VAL is a coding system, a cons of coding systems, or a function symbol.
11149 If VAL is a coding system, it is used for both decoding what received
11150 from the network stream and encoding what sent to the network stream.
11151 If VAL is a cons of coding systems, the car part is used for decoding,
11152 and the cdr part is used for encoding.
11153 If VAL is a function symbol, the function must return a coding system
11154 or a cons of coding systems which are used as above.
11155
11156 See also the function `find-operation-coding-system'.  */);
11157   Vnetwork_coding_system_alist = Qnil;
11158
11159   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11160                doc: /* Coding system to use with system messages.
11161 Also used for decoding keyboard input on X Window system, and for
11162 encoding standard output and error streams.  */);
11163   Vlocale_coding_system = Qnil;
11164
11165   /* The eol mnemonics are reset in startup.el system-dependently.  */
11166   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11167                doc: /*
11168 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11169   eol_mnemonic_unix = build_pure_c_string (":");
11170
11171   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11172                doc: /*
11173 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11174   eol_mnemonic_dos = build_pure_c_string ("\\");
11175
11176   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11177                doc: /*
11178 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11179   eol_mnemonic_mac = build_pure_c_string ("/");
11180
11181   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11182                doc: /*
11183 String displayed in mode line when end-of-line format is not yet determined.  */);
11184   eol_mnemonic_undecided = build_pure_c_string (":");
11185
11186   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11187                doc: /*
11188 Non-nil enables character translation while encoding and decoding.  */);
11189   Venable_character_translation = Qt;
11190
11191   DEFVAR_LISP ("standard-translation-table-for-decode",
11192                Vstandard_translation_table_for_decode,
11193                doc: /* Table for translating characters while decoding.  */);
11194   Vstandard_translation_table_for_decode = Qnil;
11195
11196   DEFVAR_LISP ("standard-translation-table-for-encode",
11197                Vstandard_translation_table_for_encode,
11198                doc: /* Table for translating characters while encoding.  */);
11199   Vstandard_translation_table_for_encode = Qnil;
11200
11201   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11202                doc: /* Alist of charsets vs revision numbers.
11203 While encoding, if a charset (car part of an element) is found,
11204 designate it with the escape sequence identifying revision (cdr part
11205 of the element).  */);
11206   Vcharset_revision_table = Qnil;
11207
11208   DEFVAR_LISP ("default-process-coding-system",
11209                Vdefault_process_coding_system,
11210                doc: /* Cons of coding systems used for process I/O by default.
11211 The car part is used for decoding a process output,
11212 the cdr part is used for encoding a text to be sent to a process.  */);
11213   Vdefault_process_coding_system = Qnil;
11214
11215   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11216                doc: /*
11217 Table of extra Latin codes in the range 128..159 (inclusive).
11218 This is a vector of length 256.
11219 If Nth element is non-nil, the existence of code N in a file
11220 \(or output of subprocess) doesn't prevent it from being detected as
11221 a coding system of ISO 2022 variant which has a flag
11222 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11223 or reading output of a subprocess.
11224 Only 128th through 159th elements have a meaning.  */);
11225   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11226
11227   DEFVAR_LISP ("select-safe-coding-system-function",
11228                Vselect_safe_coding_system_function,
11229                doc: /*
11230 Function to call to select safe coding system for encoding a text.
11231
11232 If set, this function is called to force a user to select a proper
11233 coding system which can encode the text in the case that a default
11234 coding system used in each operation can't encode the text.  The
11235 function should take care that the buffer is not modified while
11236 the coding system is being selected.
11237
11238 The default value is `select-safe-coding-system' (which see).  */);
11239   Vselect_safe_coding_system_function = Qnil;
11240
11241   DEFVAR_BOOL ("coding-system-require-warning",
11242                coding_system_require_warning,
11243                doc: /* Internal use only.
11244 If non-nil, on writing a file, `select-safe-coding-system-function' is
11245 called even if `coding-system-for-write' is non-nil.  The command
11246 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11247   coding_system_require_warning = 0;
11248
11249
11250   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11251                inhibit_iso_escape_detection,
11252                doc: /*
11253 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11254
11255 When Emacs reads text, it tries to detect how the text is encoded.
11256 This code detection is sensitive to escape sequences.  If Emacs sees
11257 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11258 of the ISO2022 encodings, and decodes text by the corresponding coding
11259 system (e.g. `iso-2022-7bit').
11260
11261 However, there may be a case that you want to read escape sequences in
11262 a file as is.  In such a case, you can set this variable to non-nil.
11263 Then the code detection will ignore any escape sequences, and no text is
11264 detected as encoded in some ISO-2022 encoding.  The result is that all
11265 escape sequences become visible in a buffer.
11266
11267 The default value is nil, and it is strongly recommended not to change
11268 it.  That is because many Emacs Lisp source files that contain
11269 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11270 in Emacs's distribution, and they won't be decoded correctly on
11271 reading if you suppress escape sequence detection.
11272
11273 The other way to read escape sequences in a file without decoding is
11274 to explicitly specify some coding system that doesn't use ISO-2022
11275 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11276   inhibit_iso_escape_detection = 0;
11277
11278   DEFVAR_BOOL ("inhibit-null-byte-detection",
11279                inhibit_null_byte_detection,
11280                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11281 By default, Emacs treats text that contains null bytes as binary data,
11282 and does not attempt to decode it.  The effect is as if you specified
11283 `no-conversion' for reading that text.
11284
11285 Set this to non-nil when a regular text happens to include null bytes.
11286 Examples are Index nodes of Info files and null-byte delimited output
11287 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11288 decode text as usual.  */);
11289   inhibit_null_byte_detection = 0;
11290
11291   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11292                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11293 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11294   disable_ascii_optimization = 0;
11295
11296   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11297                doc: /* Char table for translating self-inserting characters.
11298 This is applied to the result of input methods, not their input.
11299 See also `keyboard-translate-table'.
11300
11301 Use of this variable for character code unification was rendered
11302 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11303 internal character representation.  */);
11304   Vtranslation_table_for_input = Qnil;
11305
11306   Lisp_Object args[coding_arg_undecided_max];
11307   memclear (args, sizeof args);
11308
11309   Lisp_Object plist[] =
11310     {
11311       QCname,
11312       args[coding_arg_name] = Qno_conversion,
11313       QCmnemonic,
11314       args[coding_arg_mnemonic] = make_number ('='),
11315       intern_c_string (":coding-type"),
11316       args[coding_arg_coding_type] = Qraw_text,
11317       QCascii_compatible_p,
11318       args[coding_arg_ascii_compatible_p] = Qt,
11319       QCdefault_char,
11320       args[coding_arg_default_char] = make_number (0),
11321       intern_c_string (":for-unibyte"),
11322       args[coding_arg_for_unibyte] = Qt,
11323       intern_c_string (":docstring"),
11324       (build_pure_c_string
11325        ("Do no conversion.\n"
11326         "\n"
11327         "When you visit a file with this coding, the file is read into a\n"
11328         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11329         "character.")),
11330       intern_c_string (":eol-type"),
11331       args[coding_arg_eol_type] = Qunix,
11332     };
11333   args[coding_arg_plist] = CALLMANY (Flist, plist);
11334   Fdefine_coding_system_internal (coding_arg_max, args);
11335
11336   plist[1] = args[coding_arg_name] = Qundecided;
11337   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11338   plist[5] = args[coding_arg_coding_type] = Qundecided;
11339   /* This is already set.
11340      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11341   plist[8] = intern_c_string (":charset-list");
11342   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11343   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11344   plist[13] = build_pure_c_string ("No conversion on encoding, "
11345                                    "automatic conversion on decoding.");
11346   plist[15] = args[coding_arg_eol_type] = Qnil;
11347   args[coding_arg_plist] = CALLMANY (Flist, plist);
11348   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11349   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11350   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11351
11352   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11353
11354   for (int i = 0; i < coding_category_max; i++)
11355     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11356
11357 #if defined (DOS_NT)
11358   system_eol_type = Qdos;
11359 #else
11360   system_eol_type = Qunix;
11361 #endif
11362   staticpro (&system_eol_type);
11363 }
11364 #endif /* emacs */