src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2018 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking
  62   up Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0 /* Template only; this code is never compiled.  */
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;  /* Also declare C and SRC_BASE, which ONE_MORE_BYTE uses.  */
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;  /* Positive evidence for XXX.  */
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return false;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return true;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0 /* Template only; this code is never compiled.  */
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there are not enough source bytes, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216   /* ONE_MORE_BYTE also needs C and CONSUMED_CHARS to be declared.  */
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0 /* Template only; this code is never compiled.  */
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270   /* Stop while one more iteration still fits before DST_END.  */
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
 302 Lisp_Object Vcoding_system_hash_table; /* NOTE(review): presumably symbol -> attribute vector; confirm.  */
 303
 304 /* Format of end-of-line decided by system.  This is Qunix on
 305    Unix and Mac, Qdos on DOS/Windows.
 306    This has an effect only for external encoding (i.e. for output to
 307    file and process), not for in-buffer or Lisp string encoding.  */
 308 static Lisp_Object system_eol_type;
 309
 310 #ifdef emacs
 311
 312 /* Coding systems are handed between Emacs Lisp programs and C internal
 313    routines by the following variable (only one is defined here).  */
 314 /* Coding system to be used to encode text for terminal display when
 315    terminal coding system is nil.  */
 316 struct coding_system safe_terminal_coding;
 317
 318 #endif /* emacs */
 319
 320 /* Two special coding systems.  */
 321 static Lisp_Object Vsjis_coding_system;
 322 static Lisp_Object Vbig5_coding_system;
 323
 324 /* ISO2022 section */
 325 /* Element REG of CODING's coding_attr_iso_initial attribute, through XINT.  */
 326 #define CODING_ISO_INITIAL(coding, reg)                 \
 327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 328                      coding_attr_iso_initial),          \
 329                reg)))
 330 /* CODING's safe_charsets entry for CHARSET_ID, or -1 if the id exceeds  */
 331 /* max_charset_id or the entry is 255.  Evaluates CHARSET_ID repeatedly.  */
 332 #define CODING_ISO_REQUEST(coding, charset_id)          \
 333   (((charset_id) <= (coding)->max_charset_id            \
 334     ? ((coding)->safe_charsets[charset_id] != 255       \
 335        ? (coding)->safe_charsets[charset_id]            \
 336        : -1)                                            \
 337     : -1))
 338 /* Accessors for the members of CODING's spec.iso_2022.  */
 339 /* CODING_ISO_INVOKED_CHARSET is -1 if the invocation of PLANE is negative.  */
 340 #define CODING_ISO_FLAGS(coding)        \
 341   ((coding)->spec.iso_2022.flags)
 342 #define CODING_ISO_DESIGNATION(coding, reg)     \
 343   ((coding)->spec.iso_2022.current_designation[reg])
 344 #define CODING_ISO_INVOCATION(coding, plane)    \
 345   ((coding)->spec.iso_2022.current_invocation[plane])
 346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 347   ((coding)->spec.iso_2022.single_shifting)
 348 #define CODING_ISO_BOL(coding)  \
 349   ((coding)->spec.iso_2022.bol)
 350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
 353 #define CODING_ISO_CMP_STATUS(coding)   \
 354   (&(coding)->spec.iso_2022.cmp_status)
 355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
 360 /* Control characters of ISO2022.  */
 361                         /* code */      /* function */
 362 #define ISO_CODE_SO     0x0E            /* shift-out */
 363 #define ISO_CODE_SI     0x0F            /* shift-in */
 364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 365 #define ISO_CODE_ESC    0x1B            /* escape */
 366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 369
 370 /* Every 1-byte code of ISO2022 is classified into one of the
 371    following classes.  */
 372 enum iso_code_class_type
 373   {
 374     ISO_control_0,              /* Control codes in the range
 375                                    0x00..0x1F and 0x7F, except for the
 376                                    following 4 codes.  */
 377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 381     ISO_control_1,              /* Control codes in the range
 382                                    0x80..0x9F, except for the
 383                                    following 3 codes.  */
 384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 391   };
 392
 393 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 394     `iso-flags' attribute of an iso2022 coding system.  */
 395
 396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 397    instead of the correct short-form sequence (e.g. ESC $ A).  */
 398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 399
 400 /* If set, reset graphic planes and registers at end-of-line to the
 401    initial state.  */
 402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 403
 404 /* If set, reset graphic planes and registers before any control
 405    characters to the initial state.  */
 406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 407
 408 /* If set, encode by 7-bit environment.  */
 409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 410
 411 /* If set, use locking-shift function.  */
 412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 413
 414 /* If set, use single-shift function.  Overrides
 415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 417
 418 /* If set, use designation escape sequence.  */
 419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 420
 421 /* If set, produce revision number sequence.  */
 422 #define CODING_ISO_FLAG_REVISION        0x0080
 423
 424 /* If set, produce ISO6429's direction specifying sequence.  */
 425 #define CODING_ISO_FLAG_DIRECTION       0x0100
 426
 427 /* If set, assume designation states are reset at beginning of line on
 428    output.  */
 429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 430
 431 /* If set, designation sequence should be placed at beginning of line
 432    on output.  */
 433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 434
 435 /* If set, do not encode unsafe characters on output.  */
 436 #define CODING_ISO_FLAG_SAFE            0x0800
 437
 438 /* If set, extra latin codes (128..159) are accepted as valid codes
 439    on input.  */
 440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 441 /* NOTE(review): the meanings of the flags below are not documented here.  */
 442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 443
 444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 445
 446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 447
 448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 449
 450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 451
 452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 453
 454 /* A character to be produced on output if encoding of the original
 455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
 458 /* UTF-8 section: accessor for CODING's spec.utf_8_bom member.  */
 459 #define CODING_UTF_8_BOM(coding)        \
 460   ((coding)->spec.utf_8_bom)
 461
 462 /* UTF-16 section: accessors for the members of CODING's spec.utf_16.  */
 463 #define CODING_UTF_16_BOM(coding)       \
 464   ((coding)->spec.utf_16.bom)
 465
 466 #define CODING_UTF_16_ENDIAN(coding)    \
 467   ((coding)->spec.utf_16.endian)
 468
 469 #define CODING_UTF_16_SURROGATE(coding) \
 470   ((coding)->spec.utf_16.surrogate)
 471
 472 /* The CCL accessors below read attributes found via CODING_ID_ATTRS.  */
 473 /* CCL section */
 474 #define CODING_CCL_DECODER(coding)      \
 475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 476 #define CODING_CCL_ENCODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 478 #define CODING_CCL_VALIDS(coding)                                          \
 479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
 481 /* Index for each coding category in `coding_categories'.  */
 482 /* Each value is also the bit position of the matching CATEGORY_MASK_XXX.  */
 483 enum coding_category
 484   {
 485     coding_category_iso_7,
 486     coding_category_iso_7_tight,
 487     coding_category_iso_8_1,
 488     coding_category_iso_8_2,
 489     coding_category_iso_7_else,
 490     coding_category_iso_8_else,
 491     coding_category_utf_8_auto,
 492     coding_category_utf_8_nosig,
 493     coding_category_utf_8_sig,
 494     coding_category_utf_16_auto,
 495     coding_category_utf_16_be,
 496     coding_category_utf_16_le,
 497     coding_category_utf_16_be_nosig,
 498     coding_category_utf_16_le_nosig,
 499     coding_category_charset,
 500     coding_category_sjis,
 501     coding_category_big5,
 502     coding_category_ccl,
 503     coding_category_emacs_mule,
 504     /* All above are targets of code detection.  */
 505     coding_category_raw_text,
 506     coding_category_undecided,
 507     coding_category_max         /* Count of categories; sizes the tables.  */
 508   };
 509
 510 /* Definitions of flag bits used in detect_coding_XXXX.  */
 511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 531
 532 /* This value is returned if detect_coding_mask () finds nothing other
 533    than ASCII characters.  */
 534 #define CATEGORY_MASK_ANY               \
 535   (CATEGORY_MASK_ISO_7                  \
 536    | CATEGORY_MASK_ISO_7_TIGHT          \
 537    | CATEGORY_MASK_ISO_8_1              \
 538    | CATEGORY_MASK_ISO_8_2              \
 539    | CATEGORY_MASK_ISO_7_ELSE           \
 540    | CATEGORY_MASK_ISO_8_ELSE           \
 541    | CATEGORY_MASK_UTF_8_AUTO           \
 542    | CATEGORY_MASK_UTF_8_NOSIG          \
 543    | CATEGORY_MASK_UTF_8_SIG            \
 544    | CATEGORY_MASK_UTF_16_AUTO          \
 545    | CATEGORY_MASK_UTF_16_BE            \
 546    | CATEGORY_MASK_UTF_16_LE            \
 547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 549    | CATEGORY_MASK_CHARSET              \
 550    | CATEGORY_MASK_SJIS                 \
 551    | CATEGORY_MASK_BIG5                 \
 552    | CATEGORY_MASK_CCL                  \
 553    | CATEGORY_MASK_EMACS_MULE)
 554 /* CATEGORY_MASK_ANY does not include CATEGORY_MASK_RAW_TEXT.  */
 555 /* The two 7-bit ISO categories.  */
 556 #define CATEGORY_MASK_ISO_7BIT \
 557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 558 /* The two 8-bit ISO categories.  */
 559 #define CATEGORY_MASK_ISO_8BIT \
 560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 561 /* The two "else" ISO categories.  */
 562 #define CATEGORY_MASK_ISO_ELSE \
 563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 564 /* NOTE(review): presumably the ISO categories that use escape sequences.  */
 565 #define CATEGORY_MASK_ISO_ESCAPE        \
 566   (CATEGORY_MASK_ISO_7                  \
 567    | CATEGORY_MASK_ISO_7_TIGHT          \
 568    | CATEGORY_MASK_ISO_7_ELSE           \
 569    | CATEGORY_MASK_ISO_8_ELSE)
 570 /* All six ISO categories.  */
 571 #define CATEGORY_MASK_ISO       \
 572   (  CATEGORY_MASK_ISO_7BIT     \
 573      | CATEGORY_MASK_ISO_8BIT   \
 574      | CATEGORY_MASK_ISO_ELSE)
 575 /* All five UTF-16 categories.  */
 576 #define CATEGORY_MASK_UTF_16            \
 577   (CATEGORY_MASK_UTF_16_AUTO            \
 578    | CATEGORY_MASK_UTF_16_BE            \
 579    | CATEGORY_MASK_UTF_16_LE            \
 580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 582 /* All three UTF-8 categories.  */
 583 #define CATEGORY_MASK_UTF_8     \
 584   (CATEGORY_MASK_UTF_8_AUTO     \
 585    | CATEGORY_MASK_UTF_8_NOSIG  \
 586    | CATEGORY_MASK_UTF_8_SIG)
 587
 588 /* Table of coding categories (Lisp symbols).  This variable is for
 589    internal use only.  */
 590 static Lisp_Object Vcoding_category_table;
 591
 592 /* Coding categories ordered by priority, one slot per category.  */
 593 static enum coding_category coding_priorities[coding_category_max];
 594
 595 /* Nth element is a coding context for the coding system bound to the
 596    Nth coding category (indexed by enum coding_category).  */
 597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode FLAG as an int: -1 if it is nil, 1 if it is t, 0 otherwise.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return ! NILP (flag) ? EQ (flag, Qt) : -1;
 605 }
 606
 607 /* Decide whether a flag is in effect.  ENCODED_FLAG is 1 for yes,
 608    -1 for no, and 0 to defer to the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return encoded_flag > 0 || (encoded_flag == 0 && var);
 614 }
 615 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, then CHARSET_LIST from ATTRS.  */
 616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 617   do {                                                  \
 618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 620   } while (0)
 621 /* Run CHECK_NATNUM on the car of X and store the checked value back.  */
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object car_val = XCAR (x);
 626   CHECK_NATNUM (car_val);
 627   XSETCAR (x, car_val);
 628 }
 629 /* Run CHECK_NATNUM on the cdr of X and store the checked value back.  */
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object cdr_val = XCDR (x);
 634   CHECK_NATNUM (cdr_val);
 635   XSETCDR (x, cdr_val);
 636 }
 637
 638 /* Return true when CODING's destination object is a Lisp string or a
 639    buffer, i.e. when the destination can be grown.  */
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return BUFFERP (coding->dst_object) || STRINGP (coding->dst_object);
 644 }
 645
 646
 647 /* Get one byte from SRC (which ends at SRC_END) into C and increment
 648    CONSUMED_CHARS.  If the source is exhausted, jump to
 649    'no_more_source', first recording CODING_RESULT_INSUFFICIENT_SRC if
 650    SRC_BASE < SRC.  If MULTIBYTEP, a head byte 0xC0 or 0xC1 is combined
 651    with the next byte into one value; any other multibyte character sets
 652    C to the negated character code and records CODING_RESULT_INVALID_SRC.
 653    Needs: src, src_end, src_base, multibytep, consumed_chars, coding.  */
 654 /* NOTE(review): the 0xC0/0xC1 branch reads *SRC without checking SRC_END.  */
 655 #define ONE_MORE_BYTE(c)                                \
 656   do {                                                  \
 657     if (src == src_end)                                 \
 658       {                                                 \
 659         if (src_base < src)                             \
 660           record_conversion_result                      \
 661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 662         goto no_more_source;                            \
 663       }                                                 \
 664     c = *src++;                                         \
 665     if (multibytep && (c & 0x80))                       \
 666       {                                                 \
 667         if ((c & 0xFE) == 0xC0)                         \
 668           c = ((c & 1) << 6) | *src++;                  \
 669         else                                            \
 670           {                                             \
 671             src--;                                      \
 672             c = - string_char (src, &src, NULL);        \
 673             record_conversion_result                    \
 674               (coding, CODING_RESULT_INVALID_SRC);      \
 675           }                                             \
 676       }                                                 \
 677     consumed_chars++;                                   \
 678   } while (0)
 679
 680 /* Safely get two bytes from the source text pointed by SRC which ends
 681    at SRC_END, and set C1 and C2 to those bytes while skipping the
 682    heading multibyte characters.  If there are not enough bytes in the
 683    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 684    a multibyte character is found for C2, set C2 to -1 (SRC is not
 685    advanced past the rest of that character).  The caller should
 686    declare and set these variables appropriately in advance:
 687         src, src_end, multibytep
 688    It is intended that this macro is used in detect_coding_utf_16.  */
 689 /* NOTE(review): skipping a character for C1 may move SRC past SRC_END.  */
 690 #define TWO_MORE_BYTES(c1, c2)                          \
 691   do {                                                  \
 692     do {                                                \
 693       if (src == src_end)                               \
 694         goto no_more_source;                            \
 695       c1 = *src++;                                      \
 696       if (multibytep && (c1 & 0x80))                    \
 697         {                                               \
 698           if ((c1 & 0xFE) == 0xC0)                      \
 699             c1 = ((c1 & 1) << 6) | *src++;              \
 700           else                                          \
 701             {                                           \
 702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 703               c1 = -1;                                  \
 704             }                                           \
 705         }                                               \
 706     } while (c1 < 0);                                   \
 707     if (src == src_end)                                 \
 708       goto no_more_source;                              \
 709     c2 = *src++;                                        \
 710     if (multibytep && (c2 & 0x80))                      \
 711       {                                                 \
 712         if ((c2 & 0xFE) == 0xC0)                        \
 713           c2 = ((c2 & 1) << 6) | *src++;                \
 714         else                                            \
 715           c2 = -1;                                      \
 716       }                                                 \
 717   } while (0)
 718
 719
 720 /* Store a byte C in the place pointed by DST and increment DST to the
 721    next free point, and increment PRODUCED_CHARS.  The caller should
 722    assure that C is 0..127, and declare and set the variables `dst'
 723    and `produced_chars' appropriately in advance.
 724 */
 725
 726
 727 #define EMIT_ONE_ASCII_BYTE(c)  \
 728   do {                          \
 729     produced_chars++;           \
 730     *dst++ = (c);               \
 731   } while (0)
 732
 733
 734 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 then C2.  */
 735 /* PRODUCED_CHARS is increased by 2.  */
 736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 737   do {                                  \
 738     produced_chars += 2;                \
 739     *dst++ = (c1), *dst++ = (c2);       \
 740   } while (0)
 741
 742
 743 /* Store a byte C in the place pointed by DST and increment DST to the
 744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 745    store it via CHAR_STRING_ADVANCE, first applying BYTE8_TO_CHAR when
 746    C >= 0x80.  The caller should declare and set the variables `dst',
 747    `multibytep' and `produced_chars' appropriately in advance.  */
 748
 749 #define EMIT_ONE_BYTE(c)                \
 750   do {                                  \
 751     produced_chars++;                   \
 752     if (multibytep)                     \
 753       {                                 \
 754         unsigned ch = (c);              \
 755         if (ch >= 0x80)                 \
 756           ch = BYTE8_TO_CHAR (ch);      \
 757         CHAR_STRING_ADVANCE (ch, dst);  \
 758       }                                 \
 759     else                                \
 760       *dst++ = (c);                     \
 761   } while (0)
 762
 763
 764 /* Like EMIT_ONE_BYTE, but emit two bytes, C1 then C2 (PRODUCED_CHARS grows by 2).  */
 765
 766 #define EMIT_TWO_BYTES(c1, c2)          \
 767   do {                                  \
 768     produced_chars += 2;                \
 769     if (multibytep)                     \
 770       {                                 \
 771         unsigned ch;                    \
 772                                         \
 773         ch = (c1);                      \
 774         if (ch >= 0x80)                 \
 775           ch = BYTE8_TO_CHAR (ch);      \
 776         CHAR_STRING_ADVANCE (ch, dst);  \
 777         ch = (c2);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781       }                                 \
 782     else                                \
 783       {                                 \
 784         *dst++ = (c1);                  \
 785         *dst++ = (c2);                  \
 786       }                                 \
 787   } while (0)
 788
 789
 790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 791   do {                                  \
 792     EMIT_ONE_BYTE (c1);                 \
 793     EMIT_TWO_BYTES (c2, c3);            \
 794   } while (0)
 795
 796
 797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 798   do {                                          \
 799     EMIT_TWO_BYTES (c1, c2);                    \
 800     EMIT_TWO_BYTES (c3, c4);                    \
 801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* If there are at least BYTES length of room at dst, allocate memory
 896    for coding->destination and update dst and dst_end.  We don't have
 897    to take care of coding->source which will be relocated.  It is
 898    handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
/* Annotation data is stored in the array coding->charbuf in this
   format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  The value equals
   the LENGTH that ADD_COMPOSITION_DATA records, which is the longer
   of the two headers written by the ADD_*_DATA macros defined next to
   this constant.  */
#define MAX_ANNOTATION_LENGTH 5
1094
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated to 1.  BUF must
   be a modifiable pointer lvalue, and a variable `coding' must be in
   scope.  The expansion deliberately has no trailing semicolon, so
   that `ADD_ANNOTATION_DATA (...);' is exactly one statement and can
   be the body of an `if' that is followed by `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1102
/* Store a composition annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA with length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK and NCHARS, followed by NBYTES and
   METHOD.  BUF must be a modifiable pointer lvalue.  All arguments
   are parenthesized in the expansion, as in ADD_ANNOTATION_DATA, so
   that an expression such as `*pp' can be given for BUF.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1109
1110
/* Store a charset annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA with length 4, mask
   CODING_ANNOTATE_CHARSET_MASK and NCHARS, followed by the charset
   ID.  BUF must be a modifiable pointer lvalue.  All arguments are
   parenthesized in the expansion, as in ADD_ANNOTATION_DATA, so that
   an expression such as `*pp' can be given for BUF.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Flag bits for coding->eol_seen.  Each kind of end-of-line has its
   own bit, so several kinds can be recorded by or'ing the bits
   together; EOL_SEEN_NONE is the value with no bit set.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   detect_coding_utf_8, defined below, returns true if a text is
   encoded in UTF-8.  */
1135
/* True if the bits of C selected by MASK are exactly BITS.  */
#define UTF_8_OCTET_MATCH(c, mask, bits) (((c) & (mask)) == (bits))

/* Classification of one octet C of UTF-8 text: a one-octet (ASCII)
   character, a trailing octet (10xxxxxx), or the leading octet of a
   2-octet (110xxxxx), 3-octet (1110xxxx), 4-octet (11110xxx) or
   5-octet (111110xx) sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH (c, 0xFC, 0xF8)
1142
/* The three bytes of the UTF-8 signature (BOM), in the order in which
   they are expected at the head of the text and in which the encoder
   emits them.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
/* Unlike the other detect_coding_XXX, this function counts the number
   of characters and checks the EOL format.

   Scan the source of CODING, starting after its first
   coding->head_ascii bytes, and update the `checked', `found' and
   `rejected' masks of DETECT_INFO for the UTF-8 categories.  Return 0
   after rejecting CATEGORY_MASK_UTF_8.  Otherwise store the scanned
   length in coding->detected_utf8_bytes and the number of characters
   in coding->detected_utf8_chars, and return 1.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* `multibytep' and `consumed_chars' are not referenced directly in
     this function; presumably ONE_MORE_BYTE uses them -- TODO
     confirm against its definition earlier in this file.  */
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  /* The character count starts at the number of leading bytes
     recorded in coding->head_ascii, which are skipped below.  */
  ptrdiff_t nchars = coding->head_ascii;
  /* NOTE(review): eol_seen is accumulated in the loop below but is
     neither read nor stored back anywhere in this function.  */
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += nchars;

  /* NOTE(review): `src + 3 < src_end' requires at least one byte
     after the BOM, so a source consisting of exactly the three BOM
     bytes is not treated as having a BOM; `<=' would be the plain
     length check.  Confirm whether this is intended.  */
  if (src == coding->source     /* BOM should be at the head.  */
      && src + 3 < src_end      /* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      /* The BOM counts as one character.  */
      nchars++;
    }

  /* Each iteration checks one character.  A `break' leaves the loop
     on a malformed sequence.  The only other exit is the label
     `no_more_source', which no statement in this function names;
     presumably ONE_MORE_BYTE jumps there when SRC reaches SRC_END.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where the current character starts.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        {
          /* A one-octet character (or a negative value from
             ONE_MORE_BYTE): count it and note the end-of-line kind.  */
          nchars++;
          if (c == '\r')
            {
              if (src < src_end && *src == '\n')
                {
                  /* CR LF: consume the LF too and count it as a
                     character of its own.  */
                  eol_seen |= EOL_SEEN_CRLF;
                  src++;
                  nchars++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
          else if (c == '\n')
            eol_seen |= EOL_SEEN_LF;
          continue;
        }
      /* C is not a one-octet character: every following octet must be
         a trailing octet, and the class of C decides how many of them
         are required.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      /* C is not a valid leading octet of any supported length.  */
      break;
    }
  /* The loop was left by `break': the text is not valid UTF-8.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ended.  If it ended inside a character (SRC has moved
     past SRC_BASE) and this is the last block, the text is
     truncated and therefore rejected.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character U+FEFF doesn't necessarily mean a BOM,
         so the category without signature is reported as found
         too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
        /* The found characters are less than source bytes, which
           means that we found a valid non-ASCII characters.  */
        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  /* Only complete characters are reported: the byte count stops at
     the start of the last character begun.  */
  coding->detected_utf8_bytes = src_base - coding->source;
  coding->detected_utf8_chars = nchars;
  return 1;
}
1261
1262
/* Decode the UTF-8 bytes of the source of CODING, starting at offset
   coding->consumed, into character codes appended to coding->charbuf
   after its first coding->charbuf_used elements.  Unless
   CODING_UTF_8_BOM (coding) is utf_without_bom, a BOM at the start is
   skipped.  The first byte of an invalid sequence is stored as
   itself if it is ASCII and as BYTE8_TO_CHAR of it otherwise, and
   decoding resumes at the byte after it.  On return coding->consumed,
   coding->consumed_char and coding->charbuf_used describe the
   progress made.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the character being decoded; on exit everything before
     it counts as consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  /* consumed_chars is not incremented by the slow path here;
     presumably ONE_MORE_BYTE updates it -- TODO confirm.  */
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* With eol_dos, the byte that follows a '\r' is read together with
     the '\r' and kept here until the next iteration; -1 means that no
     byte is pending.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Look for the three BOM bytes; on any mismatch rewind SRC so
         that the bytes are decoded as ordinary text.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    /* Redundant with the unconditional assignment
                       that follows this block.  */
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, do not look for a BOM again on a later call
     with this CODING.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Mark the start of the next character, so that an incomplete
         or invalid sequence can be undone.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* charbuf is full.  A byte read ahead after '\r' has not
             been decoded yet, so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* The loop body is unrolled four times.  The margin of 6
             in the loop condition keeps all four reads and writes
             within bounds.  A byte with the high bit set ends the
             fast path and is left for the code below.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* Take the byte read ahead after a '\r', if there is one,
         before reading a new byte.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value from ONE_MORE_BYTE is stored negated;
             presumably it stands for an already decoded character of
             a multibyte source -- TODO confirm.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* Read the byte after '\r' now: if the source ends here,
             the '\r' is not reported as consumed either.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          /* A multi-octet sequence.  Each further octet must be a
             trailing octet; the class of C1 decides the length.  */
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and emit only its
         first byte; the bytes after it are decoded afresh.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
    }

  /* No statement in this function names this label; presumably
     ONE_MORE_BYTE jumps here when the source is exhausted.  The `_base'
     values are reported, so a character that was only partly read is
     not counted as consumed.  */
 no_more_source:
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1450
1451
1452 bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
1515 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1516    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1517
1518 static bool
1519 detect_coding_utf_16 (struct coding_system *coding,
1520                       struct coding_detection_info *detect_info)
1521 {
1522   const unsigned char *src = coding->source;
1523   const unsigned char *src_end = coding->source + coding->src_bytes;
1524   bool multibytep = coding->src_multibyte;
1525   int c1, c2;
1526
1527   detect_info->checked |= CATEGORY_MASK_UTF_16;
1528   if (coding->mode & CODING_MODE_LAST_BLOCK
1529       && (coding->src_chars & 1))
1530     {
1531       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1532       return 0;
1533     }
1534
1535   TWO_MORE_BYTES (c1, c2);
1536   if ((c1 == 0xFF) && (c2 == 0xFE))
1537     {
1538       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1539                              | CATEGORY_MASK_UTF_16_AUTO);
1540       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1541                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1542                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1543     }
1544   else if ((c1 == 0xFE) && (c2 == 0xFF))
1545     {
1546       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1547                              | CATEGORY_MASK_UTF_16_AUTO);
1548       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1549                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1550                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1551     }
1552   else if (c2 < 0)
1553     {
1554       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1555       return 0;
1556     }
1557   else
1558     {
1559       /* We check the dispersion of Eth and Oth bytes where E is even and
1560          O is odd.  If both are high, we assume binary data.*/
1561       unsigned char e[256], o[256];
1562       unsigned e_num = 1, o_num = 1;
1563
1564       memset (e, 0, 256);
1565       memset (o, 0, 256);
1566       e[c1] = 1;
1567       o[c2] = 1;
1568
1569       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1570                                 |CATEGORY_MASK_UTF_16_BE
1571                                 | CATEGORY_MASK_UTF_16_LE);
1572
1573       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1574              != CATEGORY_MASK_UTF_16)
1575         {
1576           TWO_MORE_BYTES (c1, c2);
1577           if (c2 < 0)
1578             break;
1579           if (! e[c1])
1580             {
1581               e[c1] = 1;
1582               e_num++;
1583               if (e_num >= 128)
1584                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1585             }
1586           if (! o[c2])
1587             {
1588               o[c2] = 1;
1589               o_num++;
1590               if (o_num >= 128)
1591                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1592             }
1593         }
1594       return 0;
1595     }
1596
1597  no_more_source:
1598   return 1;
1599 }
1600
1601 static void
1602 decode_coding_utf_16 (struct coding_system *coding)
1603 {
1604   const unsigned char *src = coding->source + coding->consumed;
1605   const unsigned char *src_end = coding->source + coding->src_bytes;
1606   const unsigned char *src_base;
1607   int *charbuf = coding->charbuf + coding->charbuf_used;
1608   /* We may produces at most 3 chars in one loop.  */
1609   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1610   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1611   bool multibytep = coding->src_multibyte;
1612   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1613   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1614   int surrogate = CODING_UTF_16_SURROGATE (coding);
1615   bool eol_dos
1616     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1617   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1618
1619   if (bom == utf_with_bom)
1620     {
1621       int c, c1, c2;
1622
1623       src_base = src;
1624       ONE_MORE_BYTE (c1);
1625       ONE_MORE_BYTE (c2);
1626       c = (c1 << 8) | c2;
1627
1628       if (endian == utf_16_big_endian
1629           ? c != 0xFEFF : c != 0xFFFE)
1630         {
1631           /* The first two bytes are not BOM.  Treat them as bytes
1632              for a normal character.  */
1633           src = src_base;
1634         }
1635       CODING_UTF_16_BOM (coding) = utf_without_bom;
1636     }
1637   else if (bom == utf_detect_bom)
1638     {
1639       /* We have already tried to detect BOM and failed in
1640          detect_coding.  */
1641       CODING_UTF_16_BOM (coding) = utf_without_bom;
1642     }
1643
1644   while (1)
1645     {
1646       int c, c1, c2;
1647
1648       src_base = src;
1649       consumed_chars_base = consumed_chars;
1650
1651       if (charbuf >= charbuf_end)
1652         {
1653           if (byte_after_cr1 >= 0)
1654             src_base -= 2;
1655           break;
1656         }
1657
1658       if (byte_after_cr1 >= 0)
1659         c1 = byte_after_cr1, byte_after_cr1 = -1;
1660       else
1661         ONE_MORE_BYTE (c1);
1662       if (c1 < 0)
1663         {
1664           *charbuf++ = -c1;
1665           continue;
1666         }
1667       if (byte_after_cr2 >= 0)
1668         c2 = byte_after_cr2, byte_after_cr2 = -1;
1669       else
1670         ONE_MORE_BYTE (c2);
1671       if (c2 < 0)
1672         {
1673           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1674           *charbuf++ = -c2;
1675           continue;
1676         }
1677       c = (endian == utf_16_big_endian
1678            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1679
1680       if (surrogate)
1681         {
1682           if (! UTF_16_LOW_SURROGATE_P (c))
1683             {
1684               if (endian == utf_16_big_endian)
1685                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1686               else
1687                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1688               *charbuf++ = c1;
1689               *charbuf++ = c2;
1690               if (UTF_16_HIGH_SURROGATE_P (c))
1691                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1692               else
1693                 *charbuf++ = c;
1694             }
1695           else
1696             {
1697               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1698               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1699               *charbuf++ = 0x10000 + c;
1700             }
1701         }
1702       else
1703         {
1704           if (UTF_16_HIGH_SURROGATE_P (c))
1705             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1706           else
1707             {
1708               if (eol_dos && c == '\r')
1709                 {
1710                   ONE_MORE_BYTE (byte_after_cr1);
1711                   ONE_MORE_BYTE (byte_after_cr2);
1712                 }
1713               *charbuf++ = c;
1714             }
1715         }
1716     }
1717
1718  no_more_source:
1719   coding->consumed_char += consumed_chars_base;
1720   coding->consumed = src_base - coding->source;
1721   coding->charbuf_used = charbuf - coding->charbuf;
1722 }
1723
1724 static bool
1725 encode_coding_utf_16 (struct coding_system *coding)
1726 {
1727   bool multibytep = coding->dst_multibyte;
1728   int *charbuf = coding->charbuf;
1729   int *charbuf_end = charbuf + coding->charbuf_used;
1730   unsigned char *dst = coding->destination + coding->produced;
1731   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1732   int safe_room = 8;
1733   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1734   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1735   ptrdiff_t produced_chars = 0;
1736   int c;
1737
1738   if (bom != utf_without_bom)
1739     {
1740       ASSURE_DESTINATION (safe_room);
1741       if (big_endian)
1742         EMIT_TWO_BYTES (0xFE, 0xFF);
1743       else
1744         EMIT_TWO_BYTES (0xFF, 0xFE);
1745       CODING_UTF_16_BOM (coding) = utf_without_bom;
1746     }
1747
1748   while (charbuf < charbuf_end)
1749     {
1750       ASSURE_DESTINATION (safe_room);
1751       c = *charbuf++;
1752       if (c > MAX_UNICODE_CHAR)
1753         c = coding->default_char;
1754
1755       if (c < 0x10000)
1756         {
1757           if (big_endian)
1758             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1759           else
1760             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1761         }
1762       else
1763         {
1764           int c1, c2;
1765
1766           c -= 0x10000;
1767           c1 = (c >> 10) + 0xD800;
1768           c2 = (c & 0x3FF) + 0xDC00;
1769           if (big_endian)
1770             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1771           else
1772             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1773         }
1774     }
1775   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1776   coding->produced = dst - coding->destination;
1777   coding->produced_char += produced_chars;
1778   return 0;
1779 }
1780
1781 \f
1782 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1783
1784 /* Emacs' internal format for representation of multiple character
1785    sets is a kind of multi-byte encoding, i.e. characters are
1786    represented by variable-length sequences of one-byte codes.
1787
1788    ASCII characters and control characters (e.g. `tab', `newline') are
1789    represented by one-byte sequences which are their ASCII codes, in
1790    the range 0x00 through 0x7F.
1791
1792    8-bit characters of the range 0x80..0x9F are represented by
1793    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1794    code + 0x20).
1795
1796    8-bit characters of the range 0xA0..0xFF are represented by
1797    one-byte sequences which are their 8-bit code.
1798
1799    The other characters are represented by a sequence of `base
1800    leading-code', optional `extended leading-code', and one or two
1801    `position-code's.  The length of the sequence is determined by the
1802    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1803    whereas extended leading-code and position-code take the range 0xA0
1804    through 0xFF.  See `charset.h' for more details about leading-code
1805    and position-code.
1806
1807    --- CODE RANGE of Emacs' internal format ---
1808    character set        range
1809    -------------        -----
1810    ascii                0x00..0x7F
1811    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1812    eight-bit-graphic    0xA0..0xBF
1813    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1814    ---------------------------------------------
1815
   As this is the internal character representation, the format is
   usually not used externally (i.e. in a file or in data sent to a
   process).  But it is possible to have text externally in this
   format (i.e. by encoding it with the coding system `emacs-mule').
1820
1821    In that case, a sequence of one-byte codes has a slightly different
1822    form.
1823
1824    At first, all characters in eight-bit-control are represented by
1825    one-byte sequences which are their 8-bit code.
1826
1827    Next, character composition data are represented by the byte
1828    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1829    where,
1830         METHOD is 0xF2 plus one of composition method (enum
1831         composition_method),
1832
1833         BYTES is 0xA0 plus a byte length of this composition data,
1834
1835         CHARS is 0xA0 plus a number of characters composed by this
1836         data,
1837
1838         COMPONENTs are characters of multibyte form or composition
1839         rules encoded by two-byte of ASCII codes.
1840
1841    In addition, for backward compatibility, the following formats are
1842    also recognized as composition data on decoding.
1843
1844    0x80 MSEQ ...
1845    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1846
1847    Here,
        MSEQ is a multibyte form but in these special formats:
1849           ASCII: 0xA0 ASCII_CODE+0x80,
1850           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1851         RULE is a one byte code of the range 0xA0..0xF0 that
1852         represents a composition rule.
1853   */
1854
/* Table indexed by a byte value.  The code in this file uses the
   entry for the first byte of an emacs-mule sequence as the byte
   length of the whole sequence (lengths 1 through 4 are handled).
   The table is filled in elsewhere, outside this part of the file.  */
char emacs_mule_bytes[256];
1856
1857
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in 'emacs-mule'.

   CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
   When the text is judged not to be emacs-mule, the mask is added to
   DETECT_INFO->rejected and 0 is returned.  Otherwise 1 is returned
   and DETECT_INFO->found gets the mask only if at least one
   composition or multibyte sequence was seen.  */

static bool
detect_coding_emacs_mule (struct coding_system *coding,
                          struct coding_detection_info *detect_info)
{
  /* SRC_END, MULTIBYTEP and CONSUMED_CHARS are not referenced
     directly below; presumably ONE_MORE_BYTE uses them.  No explicit
     goto to `no_more_source' appears here either, so that jump
     presumably also comes from ONE_MORE_BYTE when the source is
     exhausted.  */
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int c;
  /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      /* Start of the sequence being examined; compared with SRC at
         `no_more_source' to tell whether the input ended in the
         middle of a sequence.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0)
        continue;
      if (c == 0x80)
        {
          /* Perhaps the start of composite character.  We simply skip
             it because analyzing it is too heavy for detecting.  But,
             at least, we check that the composite character
             constitutes of more than 4 bytes.  */
          const unsigned char *src_start;

        repeat:
          src_start = src;
          /* Skip the run of bytes >= 0xA0; C ends up holding the
             first byte below 0xA0 that follows the run.  */
          do
            {
              ONE_MORE_BYTE (c);
            }
          while (c >= 0xA0);

          /* Too short for a composition: leave the loop and reject.  */
          if (src - src_start <= 4)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
          /* Another composition follows immediately.  */
          if (c == 0x80)
            goto repeat;
        }

      /* C is now either the byte read at the top of the loop or the
         byte that terminated a composition.  */
      if (c < 0x80)
        {
          /* ESC, SI and SO cause rejection.  */
          if (c < 0x20
              && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
            break;
        }
      else
        {
          /* Number of bytes expected after this first byte.  */
          int more_bytes = emacs_mule_bytes[c] - 1;

          /* Every following byte must be 0xA0 or more.  */
          while (more_bytes > 0)
            {
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                {
                  src--;        /* Unread the last byte.  */
                  break;
                }
              more_bytes--;
            }
          /* The sequence was cut short (or the table entry was less
             than 1): reject.  */
          if (more_bytes != 0)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
        }
    }
  /* Reached only via `break', i.e. when something invalid was seen.  */
  detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  return 0;

 no_more_source:
  /* The input ended in the middle of a sequence and no further block
     will follow, so the sequence can never be completed.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
1942
1943
/* Parse emacs-mule multibyte sequence at SRC and return the decoded
   character.  If CMP_STATUS indicates that we must expect MSEQ or
   RULE described above, decode it and return the negative value of
   the decoded character or rule.  If an invalid byte is found, return
   -1.  If SRC is too short, return -2.

   On success, store the number of bytes read in *NBYTES and the
   number of source characters read in *NCHARS.  If ID is non-null,
   also store the charset ID of the character in *ID; this is not
   done when a RULE byte is returned.

   NOTE(review): a negated MSEQ character whose code is 1 or 2 would
   look the same as the error values -1 and -2 -- confirm that callers
   cannot run into this.  */

static int
emacs_mule_char (struct coding_system *coding, const unsigned char *src,
                 int *nbytes, int *nchars, int *id,
                 struct composition_status *cmp_status)
{
  /* SRC_END, MULTIBYTEP and CONSUMED_CHARS are presumably used by
     ONE_MORE_BYTE, which presumably is also what jumps to
     `no_more_source' when the source is exhausted.  */
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base = src;
  bool multibytep = coding->src_multibyte;
  int charset_ID;
  unsigned code;
  int c;
  ptrdiff_t consumed_chars = 0;
  /* Set when the sequence was an old-style MSEQ, so that the result
     is returned negated.  */
  bool mseq_found = 0;

  ONE_MORE_BYTE (c);
  if (c < 0)
    {
      /* ONE_MORE_BYTE produced a negative value; its negation is
         taken as the character without further decoding.  */
      c = -c;
      charset_ID = emacs_mule_charset[0];
    }
  else
    {
      if (c >= 0xA0)
        {
          /* A first byte of 0xA0 or more is accepted only inside an
             old-form composition.  */
          if (cmp_status->state != COMPOSING_NO
              && cmp_status->old_form)
            {
              if (cmp_status->state == COMPOSING_CHAR)
                {
                  /* MSEQ: turn its first byte back into the normal
                     first byte of a sequence.  */
                  if (c == 0xA0)
                    {
                      /* ASCII: 0xA0 ASCII_CODE+0x80.  */
                      ONE_MORE_BYTE (c);
                      c -= 0x80;
                      if (c < 0)
                        goto invalid_code;
                    }
                  else
                    /* other: LEADING_CODE+0x20 ...  */
                    c -= 0x20;
                  mseq_found = 1;
                }
              else
                {
                  /* Not expecting a character, so this byte is
                     returned negated as is, without decoding.  */
                  *nbytes = src - src_base;
                  *nchars = consumed_chars;
                  return -c;
                }
            }
          else
            goto invalid_code;
        }

      /* Dispatch on the sequence length given by the first byte.
         Every following byte must be 0xA0 or more; its low 7 bits
         contribute to CODE.  */
      switch (emacs_mule_bytes[c])
        {
        case 2:
          /* Leading code, one position code.  */
          if ((charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = c & 0x7F;
          break;

        case 3:
          if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
              || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
            {
              /* The charset is selected by the second byte, which is
                 followed by one position code.  */
              ONE_MORE_BYTE (c);
              if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = c & 0x7F;
            }
          else
            {
              /* Leading code, two position codes.  */
              if ((charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = (c & 0x7F) << 8;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code |= c & 0x7F;
            }
          break;

        case 4:
          /* The charset is selected by the second byte, which is
             followed by two position codes.  */
          ONE_MORE_BYTE (c);
          if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = (c & 0x7F) << 8;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code |= c & 0x7F;
          break;

        case 1:
          /* A single byte stands for itself.  */
          code = c;
          charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
          break;

        default:
          emacs_abort ();
        }
      /* SRC, SRC_BASE and SRC_END are passed along; presumably the
         macro adjusts them if decoding relocates the source text --
         verify against its definition.  */
      CODING_DECODE_CHAR (coding, src, src_base, src_end,
                          CHARSET_FROM_ID (charset_ID), code, c);
      if (c < 0)
        goto invalid_code;
    }
  *nbytes = src - src_base;
  *nchars = consumed_chars;
  if (id)
    *id = charset_ID;
  return (mseq_found ? -c : c);

 no_more_source:
  return -2;

 invalid_code:
  return -1;
}
2078
2079
2080 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2081
2082 /* Handle these composition sequence ('|': the end of header elements,
2083    BYTES and CHARS >= 0xA0):
2084
2085    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2086    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2087    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2088
2089    and these old form:
2090
2091    (4) relative composition: 0x80 | MSEQ ... MSEQ
2092    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2093
2094    When the starter 0x80 and the following header elements are found,
2095    this annotation header is produced.
2096
2097         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2098
2099    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2100    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2101
2102    Then, upon reading the following elements, these codes are produced
2103    until the composition end is found:
2104
2105    (1) CHAR ... CHAR
2106    (2) ALT ... ALT CHAR ... CHAR
2107    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2108    (4) CHAR ... CHAR
2109    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2110
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2113
2114    (1) LENGTH: unchanged, NCHARS: unchanged
2115    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2116    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2117    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2118    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2119
2120    If an error is found while composing, the annotation header is
2121    changed to the original composition header (plus filler -1s) as
2122    below:
2123
2124    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2125    (5)          [ 0x80 0xFF -1 -1- -1 ]
2126
2127    and the sequence [ -2 DECODED-RULE ] is changed to the original
2128    byte sequence as below:
2129         o the original byte sequence is B: [ B -1 ]
2130         o the original byte sequence is B1 B2: [ B1 B2 ]
2131
2132    Most of the routines are implemented by macros because many
2133    variables and labels in the caller decode_coding_emacs_mule must be
2134    accessible, and they are usually called just once (thus doesn't
2135    increase the size of compiled object).  */
2136
/* Convert C, a composition-rule byte of the Emacs 20 style format,
   into a rule stored in RULE.  C is first reduced by 0xA0 (and is
   left modified); the result must lie in 0..80 and is split into two
   base-9 digits, where a digit of 4 is replaced by 10 before both are
   combined with COMPOSITION_ENCODE_RULE.  Jump to the label
   `invalid_code' of the caller if C is out of range.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (0 <= c && c < 81))                           \
      goto invalid_code;                                \
    gref = (c / 9 == 4) ? 10 : c / 9;                   \
    nref = (c % 9 == 4) ? 10 : c % 9;                   \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2153
2154
/* Convert a two-byte composition rule of the Emacs 21 style format
   into a rule stored in RULE.  The first byte is already in C; the
   second one is read with ONE_MORE_BYTE into C.  Each byte minus 0x20
   must lie in 0..80, otherwise jump to the label `invalid_code' of
   the caller.  The first byte is checked before the second is read.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = c - 0x20;                                \
    int nref;                                           \
                                                        \
    if (! (0 <= gref && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (0 <= nref && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2172
2173
/* Start of Emacs 21 style format.  On entry C holds the METHOD byte
   (0xF2 plus the composition method).  The next two bytes at SRC are
   BYTES (0xA0 plus the byte length of this composition information)
   and CHARS (0xA0 plus the number of characters composed by this
   composition).  Jump to `invalid_code' if either byte is out of
   range; otherwise initialize CMP_STATUS and add the composition
   annotation header to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    /* A relative composition must declare exactly 4 bytes.  */        \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    /* Only a relative composition starts directly with the  */        \
    /* composed characters.  */                                         \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2205
2206
/* Begin an Emacs 20 style (old form) relative composition: reset
   CMP_STATUS for composing characters and add a composition
   annotation header with zero NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2218
2219
/* Begin an Emacs 20 style (old form) rule-base composition: reset
   CMP_STATUS for composing characters with rules and add a
   composition annotation header with zero NCHARS and NBYTES to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2231
2232
/* Handle the byte that follows a composition starter: read it into C
   and start the matching kind of composition.  A METHOD byte starts
   the Emacs 21 format, 0xFF starts an Emacs 20 rule-base composition,
   and a byte in 0xA0..0xBF starts an Emacs 20 relative composition.
   Anything else jumps to the label `invalid_code' of the caller.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *after_starter = src;                   \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (c >= 0xF2 + COMPOSITION_RELATIVE                        \
        && c <= 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS)          \
      DECODE_EMACS_MULE_21_COMPOSITION ();                      \
    else if (c == 0xFF)                                         \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();             \
    else if (c >= 0xA0 && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* The byte just read already belongs to the first  */  \
        /* component, so read it again as such.  */             \
        src = after_starter;                                    \
      }                                                         \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
2256
/* Finish the composition being decoded.  The annotation header lies
   CMP_STATUS->length elements before CHARBUF (an `int *' in the
   caller).  For the old form, store the number of characters seen in
   header element 2; for the other methods except relative
   composition, store element 2 minus the length in element 0.  Then
   mark that no composition is in progress.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *header = charbuf - cmp_status->length;                 \
                                                                \
    if (cmp_status->old_form)                                   \
      header[2] = cmp_status->nchars;                           \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      header[0] = header[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2267
2268
2269 static int
2270 emacs_mule_finish_composition (int *charbuf,
2271                                struct composition_status *cmp_status)
2272 {
2273   int idx = - cmp_status->length;
2274   int new_chars;
2275
2276   if (cmp_status->old_form && cmp_status->nchars > 0)
2277     {
2278       charbuf[idx + 2] = cmp_status->nchars;
2279       new_chars = 0;
2280       if (cmp_status->method == COMPOSITION_WITH_RULE
2281           && cmp_status->state == COMPOSING_CHAR)
2282         {
2283           /* The last rule was invalid.  */
2284           int rule = charbuf[-1] + 0xA0;
2285
2286           charbuf[-2] = BYTE8_TO_CHAR (rule);
2287           charbuf[-1] = -1;
2288           new_chars = 1;
2289         }
2290     }
2291   else
2292     {
2293       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2294
2295       if (cmp_status->method == COMPOSITION_WITH_RULE)
2296         {
2297           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2298           charbuf[idx++] = -3;
2299           charbuf[idx++] = 0;
2300           new_chars = 1;
2301         }
2302       else
2303         {
2304           int nchars = charbuf[idx + 1] + 0xA0;
2305           int nbytes = charbuf[idx + 2] + 0xA0;
2306
2307           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2308           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2309           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2310           charbuf[idx++] = -1;
2311           new_chars = 4;
2312         }
2313     }
2314   cmp_status->state = COMPOSING_NO;
2315   return new_chars;
2316 }
2317
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and add the number of characters it
   returns to CHAR_OFFSET.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    char_offset                                                         \
      += (cmp_status->state != COMPOSING_NO                             \
          ? emacs_mule_finish_composition (charbuf, cmp_status)         \
          : 0);                                                         \
  } while (0)
2323
2324
2325 static void
2326 decode_coding_emacs_mule (struct coding_system *coding)
2327 {
2328   const unsigned char *src = coding->source + coding->consumed;
2329   const unsigned char *src_end = coding->source + coding->src_bytes;
2330   const unsigned char *src_base;
2331   int *charbuf = coding->charbuf + coding->charbuf_used;
2332   /* We may produce two annotations (charset and composition) in one
2333      loop and one more charset annotation at the end.  */
2334   int *charbuf_end
2335     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2336       /* We can produce up to 2 characters in a loop.  */
2337       - 1;
2338   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2339   bool multibytep = coding->src_multibyte;
2340   ptrdiff_t char_offset = coding->produced_char;
2341   ptrdiff_t last_offset = char_offset;
2342   int last_id = charset_ascii;
2343   bool eol_dos
2344     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2345   int byte_after_cr = -1;
2346   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2347
2348   if (cmp_status->state != COMPOSING_NO)
2349     {
2350       int i;
2351
2352       if (charbuf_end - charbuf < cmp_status->length)
2353         emacs_abort ();
2354       for (i = 0; i < cmp_status->length; i++)
2355         *charbuf++ = cmp_status->carryover[i];
2356       coding->annotated = 1;
2357     }
2358
2359   while (1)
2360     {
2361       int c;
2362       int id UNINIT;
2363
2364       src_base = src;
2365       consumed_chars_base = consumed_chars;
2366
2367       if (charbuf >= charbuf_end)
2368         {
2369           if (byte_after_cr >= 0)
2370             src_base--;
2371           break;
2372         }
2373
2374       if (byte_after_cr >= 0)
2375         c = byte_after_cr, byte_after_cr = -1;
2376       else
2377         ONE_MORE_BYTE (c);
2378
2379       if (c < 0 || c == 0x80)
2380         {
2381           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2382           if (c < 0)
2383             {
2384               *charbuf++ = -c;
2385               char_offset++;
2386             }
2387           else
2388             DECODE_EMACS_MULE_COMPOSITION_START ();
2389           continue;
2390         }
2391
2392       if (c < 0x80)
2393         {
2394           if (eol_dos && c == '\r')
2395             ONE_MORE_BYTE (byte_after_cr);
2396           id = charset_ascii;
2397           if (cmp_status->state != COMPOSING_NO)
2398             {
2399               if (cmp_status->old_form)
2400                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2401               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2402                 cmp_status->ncomps--;
2403             }
2404         }
2405       else
2406         {
2407           int nchars UNINIT, nbytes UNINIT;
2408           /* emacs_mule_char can load a charset map from a file, which
2409              allocates a large structure and might cause buffer text
2410              to be relocated as result.  Thus, we need to remember the
2411              original pointer to buffer text, and fix up all related
2412              pointers after the call.  */
2413           const unsigned char *orig = coding->source;
2414           ptrdiff_t offset;
2415
2416           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2417                                cmp_status);
2418           offset = coding->source - orig;
2419           if (offset)
2420             {
2421               src += offset;
2422               src_base += offset;
2423               src_end += offset;
2424             }
2425           if (c < 0)
2426             {
2427               if (c == -1)
2428                 goto invalid_code;
2429               if (c == -2)
2430                 break;
2431             }
2432           src = src_base + nbytes;
2433           consumed_chars = consumed_chars_base + nchars;
2434           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2435             cmp_status->ncomps -= nchars;
2436         }
2437
2438       /* Now if C >= 0, we found a normally encoded character, if C <
2439          0, we found an old-style composition component character or
2440          rule.  */
2441
2442       if (cmp_status->state == COMPOSING_NO)
2443         {
2444           if (last_id != id)
2445             {
2446               if (last_id != charset_ascii)
2447                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2448                                   last_id);
2449               last_id = id;
2450               last_offset = char_offset;
2451             }
2452           *charbuf++ = c;
2453           char_offset++;
2454         }
2455       else if (cmp_status->state == COMPOSING_CHAR)
2456         {
2457           if (cmp_status->old_form)
2458             {
2459               if (c >= 0)
2460                 {
2461                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2462                   *charbuf++ = c;
2463                   char_offset++;
2464                 }
2465               else
2466                 {
2467                   *charbuf++ = -c;
2468                   cmp_status->nchars++;
2469                   cmp_status->length++;
2470                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2471                     EMACS_MULE_COMPOSITION_END ();
2472                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2473                     cmp_status->state = COMPOSING_RULE;
2474                 }
2475             }
2476           else
2477             {
2478               *charbuf++ = c;
2479               cmp_status->length++;
2480               cmp_status->nchars--;
2481               if (cmp_status->nchars == 0)
2482                 EMACS_MULE_COMPOSITION_END ();
2483             }
2484         }
2485       else if (cmp_status->state == COMPOSING_RULE)
2486         {
2487           int rule;
2488
2489           if (c >= 0)
2490             {
2491               EMACS_MULE_COMPOSITION_END ();
2492               *charbuf++ = c;
2493               char_offset++;
2494             }
2495           else
2496             {
2497               c = -c;
2498               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2499               if (rule < 0)
2500                 goto invalid_code;
2501               *charbuf++ = -2;
2502               *charbuf++ = rule;
2503               cmp_status->length += 2;
2504               cmp_status->state = COMPOSING_CHAR;
2505             }
2506         }
2507       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2508         {
2509           *charbuf++ = c;
2510           cmp_status->length++;
2511           if (cmp_status->ncomps == 0)
2512             cmp_status->state = COMPOSING_CHAR;
2513           else if (cmp_status->ncomps > 0)
2514             {
2515               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2516                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2517             }
2518           else
2519             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2520         }
2521       else                      /* COMPOSING_COMPONENT_RULE */
2522         {
2523           int rule;
2524
2525           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2526           if (rule < 0)
2527             goto invalid_code;
2528           *charbuf++ = -2;
2529           *charbuf++ = rule;
2530           cmp_status->length += 2;
2531           cmp_status->ncomps--;
2532           if (cmp_status->ncomps > 0)
2533             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2534           else
2535             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2536         }
2537       continue;
2538
2539     invalid_code:
2540       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2541       src = src_base;
2542       consumed_chars = consumed_chars_base;
2543       ONE_MORE_BYTE (c);
2544       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2545       char_offset++;
2546     }
2547
2548  no_more_source:
2549   if (cmp_status->state != COMPOSING_NO)
2550     {
2551       if (coding->mode & CODING_MODE_LAST_BLOCK)
2552         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2553       else
2554         {
2555           int i;
2556
2557           charbuf -= cmp_status->length;
2558           for (i = 0; i < cmp_status->length; i++)
2559             cmp_status->carryover[i] = charbuf[i];
2560         }
2561     }
2562   if (last_id != charset_ascii)
2563     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2564   coding->consumed_char += consumed_chars_base;
2565   coding->consumed = src_base - coding->source;
2566   coding->charbuf_used = charbuf - coding->charbuf;
2567 }
2568
2569
/* Store in CODES (an lvalue array of at least two bytes) the leading
   code(s) selected by the emacs-mule charset id ID:

     ID <  0xA0          CODES = { ID,   0  }
     0xA0 <= ID < 0xE0   CODES = { 0x9A, ID }
     0xE0 <= ID < 0xF0   CODES = { 0x9B, ID }
     0xF0 <= ID < 0xF5   CODES = { 0x9C, ID }
     otherwise           CODES = { 0x9D, ID }

   A zero in CODES[1] therefore means "no second leading code".

   The expansion is a single statement WITHOUT a trailing semicolon, so
   the caller's own `;' terminates it and the macro can be used as the
   body of an `if' that has an `else'.  ID and CODES are evaluated more
   than once; do not pass expressions with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2583
2584
/* Encode the elements of CODING->charbuf (CODING->charbuf_used of
   them) in emacs-mule format, appending the bytes to
   CODING->destination starting at offset CODING->produced.

   A negative charbuf element introduces an annotation; a non-negative
   one is a character.  ASCII characters and raw eight-bit bytes are
   emitted as a single byte.  Any other character is emitted as one or
   two leading codes derived from its charset's emacs-mule id, followed
   by its code point with the high bit of each byte set (one byte for a
   charset of dimension 1, two bytes otherwise).  A character that no
   charset in the list can encode is replaced by CODING->default_char.

   On reaching the end of charbuf, record CODING_RESULT_SUCCESS, add
   the local PRODUCED_CHARS count to CODING->produced_char, set
   CODING->produced to the number of bytes now in the destination, and
   return 0 (false).

   NOTE(review): MULTIBYTEP, DST, DST_END and PRODUCED_CHARS are
   presumably read or updated by the ASSURE_DESTINATION and EMIT_*
   macros, which may also leave the function early; confirm against
   their definitions.  */
2585 static bool
2586 encode_coding_emacs_mule (struct coding_system *coding)
2587 {
2588   bool multibytep = coding->dst_multibyte;
2589   int *charbuf = coding->charbuf;
2590   int *charbuf_end = charbuf + coding->charbuf_used;
2591   unsigned char *dst = coding->destination + coding->produced;
2592   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before each charbuf element.
          The code below issues at most four EMIT_ONE_BYTE calls per
          character (two leading codes plus two code bytes).  */
2593   int safe_room = 8;
2594   ptrdiff_t produced_chars = 0;
2595   Lisp_Object attrs, charset_list;
2596   int c;
       /* Charset id taken from the most recent charset annotation, or -1
          when there is no usable preference.  */
2597   int preferred_charset_id = -1;
2598
2599   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against Vemacs_mule_charset_list, and store that
          list back into ATTRS if a different one was there.  */
2600   if (! EQ (charset_list, Vemacs_mule_charset_list))
2601     {
2602       charset_list = Vemacs_mule_charset_list;
2603       ASET (attrs, coding_attr_charset_list, charset_list);
2604     }
2605
2606   while (charbuf < charbuf_end)
2607     {
2608       ASSURE_DESTINATION (safe_room);
2609       c = *charbuf++;
2610
2611       if (c < 0)
2612         {
2613           /* Handle an annotation.  */
               /* CHARBUF now points at the element following the negative
                  header C; that element selects the annotation kind.  */
2614           switch (*charbuf)
2615             {
2616             case CODING_ANNOTATE_COMPOSITION_MASK:
2617               /* Not yet implemented.  */
2618               break;
2619             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): the skip below (`charbuf += -c - 1')
                      treats -C as the total length of the annotation
                      including its header.  Confirm against
                      ADD_CHARSET_DATA that the charset id really sits at
                      offset 3 from the mask element and not at offset 2.  */
2620               preferred_charset_id = charbuf[3];
                   /* Drop the preference if that charset is not in the
                      list we encode against.  */
2621               if (preferred_charset_id >= 0
2622                   && NILP (Fmemq (make_number (preferred_charset_id),
2623                                   charset_list)))
2624                 preferred_charset_id = -1;
2625               break;
2626             default:
2627               emacs_abort ();
2628             }
               /* Skip the remaining -C - 1 elements of the annotation.  */
2629           charbuf += -c - 1;
2630           continue;
2631         }
2632
2633       if (ASCII_CHAR_P (c))
2634         EMIT_ONE_ASCII_BYTE (c);
2635       else if (CHAR_BYTE8_P (c))
2636         {
               /* A raw eight-bit byte character: emit the byte itself.  */
2637           c = CHAR_TO_BYTE8 (c);
2638           EMIT_ONE_BYTE (c);
2639         }
2640       else
2641         {
2642           struct charset *charset;
2643           unsigned code;
2644           int dimension;
2645           int emacs_mule_id;
2646           unsigned char leading_codes[2];
2647
               /* Try the preferred charset first, if any; otherwise (or
                  if it cannot encode C) search CHARSET_LIST.  */
2648           if (preferred_charset_id >= 0)
2649             {
2650               bool result;
2651
2652               charset = CHARSET_FROM_ID (preferred_charset_id);
2653               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2654               if (result)
2655                 code = ENCODE_CHAR (charset, c);
2656               else
2657                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2658                                      &code, charset);
2659             }
2660           else
2661             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2662                                  &code, charset);
2663           if (! charset)
2664             {
                   /* No charset can encode C: substitute the coding
                      system's default character.  */
2665               c = coding->default_char;
2666               if (ASCII_CHAR_P (c))
2667                 {
2668                   EMIT_ONE_ASCII_BYTE (c);
2669                   continue;
2670                 }
                   /* NOTE(review): if this second lookup also fails,
                      CHARSET presumably stays NULL and is dereferenced
                      just below; confirm CODING_CHAR_CHARSET semantics.  */
2671               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2672                                    &code, charset);
2673             }
2674           dimension = CHARSET_DIMENSION (charset);
2675           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2676           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2677           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is none.  */
2678           if (leading_codes[1])
2679             EMIT_ONE_BYTE (leading_codes[1]);
               /* Emit the code point with the high bit of each byte set.  */
2680           if (dimension == 1)
2681             EMIT_ONE_BYTE (code | 0x80);
2682           else
2683             {
2684               code |= 0x8080;
2685               EMIT_ONE_BYTE (code >> 8);
2686               EMIT_ONE_BYTE (code & 0xFF);
2687             }
2688         }
2689     }
2690   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2691   coding->produced_char += produced_chars;
2692   coding->produced = dst - coding->destination;
2693   return 0;
2694 }
2695
2696 \f
2697 /*** 7. ISO2022 handlers ***/
2698
2699 /* The following note describes the coding system ISO2022 briefly.
2700    Since the intention of this note is to help understand the
2701    functions in this file, some parts are NOT ACCURATE or are OVERLY
2702    SIMPLIFIED.  For thorough understanding, please refer to the
2703    original document of ISO2022.  This is equivalent to the standard
2704    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2705
2706    ISO2022 provides many mechanisms to encode several character sets
2707    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2708    is encoded using bytes less than 128.  This may make the encoded
2709    text a little bit longer, but the text passes more easily through
2710    several types of gateway, some of which strip off the MSB (Most
2711    Significant Bit).
2712
2713    There are two kinds of character sets: control character sets and
2714    graphic character sets.  The former contain control characters such
2715    as `newline' and `escape' to provide control functions (control
2716    functions are also provided by escape sequences).  The latter
2717    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2718    two control character sets and many graphic character sets.
2719
2720    Graphic character sets are classified into one of the following
2721    four classes, according to the number of bytes (DIMENSION) and
2722    number of characters in one dimension (CHARS) of the set:
2723    - DIMENSION1_CHARS94
2724    - DIMENSION1_CHARS96
2725    - DIMENSION2_CHARS94
2726    - DIMENSION2_CHARS96
2727
2728    In addition, each character set is assigned an identification tag,
2729    unique for each set, called the "final character" (denoted as <F>
2730    hereafter).  The <F> of each character set is decided by ECMA(*)
2731    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2732    (0x30..0x3F are for private use only).
2733
2734    Note (*): ECMA = European Computer Manufacturers Association
2735
2736    Here are examples of graphic character sets [NAME(<F>)]:
2737         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2738         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2739         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2740         o DIMENSION2_CHARS96 -- none for the moment
2741
2742    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2743         C0 [0x00..0x1F] -- control character plane 0
2744         GL [0x20..0x7F] -- graphic character plane 0
2745         C1 [0x80..0x9F] -- control character plane 1
2746         GR [0xA0..0xFF] -- graphic character plane 1
2747
2748    A control character set is directly designated and invoked to C0 or
2749    C1 by an escape sequence.  The most common case is that:
2750    - ISO646's  control character set is designated/invoked to C0, and
2751    - ISO6429's control character set is designated/invoked to C1,
2752    and usually these designations/invocations are omitted in encoded
2753    text.  In a 7-bit environment, only C0 can be used, and a control
2754    character for C1 is encoded by an appropriate escape sequence to
2755    fit into the environment.  All control characters for C1 are
2756    defined to have corresponding escape sequences.
2757
2758    A graphic character set is at first designated to one of four
2759    graphic registers (G0 through G3), then these graphic registers are
2760    invoked to GL or GR.  These designations and invocations can be
2761    done independently.  The most common case is that G0 is invoked to
2762    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2763    these invocations and designations are omitted in encoded text.
2764    In a 7-bit environment, only GL can be used.
2765
2766    When a graphic character set of CHARS94 is invoked to GL, codes
2767    0x20 and 0x7F of the GL area work as control characters SPACE and
2768    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2769    be used.
2770
2771    There are two ways of invocation: locking-shift and single-shift.
2772    With locking-shift, the invocation lasts until the next different
2773    invocation, whereas with single-shift, the invocation affects the
2774    following character only and doesn't affect the locking-shift
2775    state.  Invocations are done by the following control characters or
2776    escape sequences:
2777
2778    ----------------------------------------------------------------------
2779    abbrev  function                  cntrl escape seq   description
2780    ----------------------------------------------------------------------
2781    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2782    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2783    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2784    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2785    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2786    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2787    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2788    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2789    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2790    ----------------------------------------------------------------------
2791    (*) These are not used by any known coding system.
2792
2793    Control characters for these functions are defined by macros
2794    ISO_CODE_XXX in `coding.h'.
2795
2796    Designations are done by the following escape sequences:
2797    ----------------------------------------------------------------------
2798    escape sequence      description
2799    ----------------------------------------------------------------------
2800    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2801    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2802    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2803    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2804    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2805    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2806    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2807    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2808    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2809    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2810    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2811    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2812    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2813    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2814    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2815    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2816    ----------------------------------------------------------------------
2817
2818    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2819    of dimension 1, chars 94, and final character <F>, etc...
2820
2821    Note (*): Although these designations are not allowed in ISO2022,
2822    Emacs accepts them on decoding, and produces them on encoding
2823    CHARS96 character sets in a coding system which is characterized as
2824    7-bit environment, non-locking-shift, and non-single-shift.
2825
2826    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2827    '(' must be omitted.  We refer to this as "short-form" hereafter.
2828
2829    Now you may notice that there are a lot of ways of encoding the
2830    same multilingual text in ISO2022.  Actually, there exist many
2831    coding systems such as Compound Text (used in X11's inter-client
2832    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2833    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2834    localized platforms), and all of these are variants of ISO2022.
2835
2836    In addition to the above, Emacs handles two more kinds of escape
2837    sequences: ISO6429's direction specification and Emacs' private
2838    sequence for specifying character composition.
2839
2840    ISO6429's direction specification takes the following form:
2841         o CSI ']'      -- end of the current direction
2842         o CSI '0' ']'  -- end of the current direction
2843         o CSI '1' ']'  -- start of left-to-right text
2844         o CSI '2' ']'  -- start of right-to-left text
2845    The control character CSI (0x9B: control sequence introducer) is
2846    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2847
2848    Character composition specification takes the following form:
2849         o ESC '0' -- start relative composition
2850         o ESC '1' -- end composition
2851         o ESC '2' -- start rule-base composition (*)
2852         o ESC '3' -- start relative composition with alternate chars  (**)
2853         o ESC '4' -- start rule-base composition with alternate chars  (**)
2854   Since these are not standard escape sequences of any ISO standard,
2855   the use of them with these meanings is restricted to Emacs only.
2856
2857   (*) This form is used only in Emacs 20.7 and older versions,
2858   but newer versions can safely decode it.
2859   (**) This form is used only in Emacs 21.1 and newer versions,
2860   and older versions can't decode it.
2861
2862   Here's a list of example usages of these composition escape
2863   sequences (categorized by `enum composition_method').
2864
2865   COMPOSITION_RELATIVE:
2866         ESC 0 CHAR [ CHAR ] ESC 1
2867   COMPOSITION_WITH_RULE:
2868         ESC 2 CHAR [ RULE CHAR ] ESC 1
2869   COMPOSITION_WITH_ALTCHARS:
2870         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2871   COMPOSITION_WITH_RULE_ALTCHARS:
2872         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2873
/* Table of 256 `enum iso_code_class_type' entries, presumably indexed
   by byte value (TODO confirm).  It is neither filled nor read in this
   part of the file.  */
2874 static enum iso_code_class_type iso_code_class[256];
2875
/* Return true if the charset id ID has a usable entry in the
   safe-charsets table of CODING (a pointer to a structure with
   `max_charset_id' and `safe_charsets' members): ID must lie in
   0..CODING->max_charset_id, and its byte in CODING->safe_charsets
   must not be 255, the value setup_iso_safe_charsets fills the table
   with before assigning registers.

   A negative ID yields false without touching the table, so callers
   may pass a table lookup result that might be negative.  ID and
   CODING are evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  (0 <= (id)                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2879
2880 static void
2881 setup_iso_safe_charsets (Lisp_Object attrs)
2882 {
2883   Lisp_Object charset_list, safe_charsets;
2884   Lisp_Object request;
2885   Lisp_Object reg_usage;
2886   Lisp_Object tail;
2887   EMACS_INT reg94, reg96;
2888   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2889   int max_charset_id;
2890
2891   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2892   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2893       && ! EQ (charset_list, Viso_2022_charset_list))
2894     {
2895       charset_list = Viso_2022_charset_list;
2896       ASET (attrs, coding_attr_charset_list, charset_list);
2897       ASET (attrs, coding_attr_safe_charsets, Qnil);
2898     }
2899
2900   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2901     return;
2902
2903   max_charset_id = 0;
2904   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2905     {
2906       int id = XINT (XCAR (tail));
2907       if (max_charset_id < id)
2908         max_charset_id = id;
2909     }
2910
2911   safe_charsets = make_uninit_string (max_charset_id + 1);
2912   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2913   request = AREF (attrs, coding_attr_iso_request);
2914   reg_usage = AREF (attrs, coding_attr_iso_usage);
2915   reg94 = XINT (XCAR (reg_usage));
2916   reg96 = XINT (XCDR (reg_usage));
2917
2918   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2919     {
2920       Lisp_Object id;
2921       Lisp_Object reg;
2922       struct charset *charset;
2923
2924       id = XCAR (tail);
2925       charset = CHARSET_FROM_ID (XINT (id));
2926       reg = Fcdr (Fassq (id, request));
2927       if (! NILP (reg))
2928         SSET (safe_charsets, XINT (id), XINT (reg));
2929       else if (charset->iso_chars_96)
2930         {
2931           if (reg96 < 4)
2932             SSET (safe_charsets, XINT (id), reg96);
2933         }
2934       else
2935         {
2936           if (reg94 < 4)
2937             SSET (safe_charsets, XINT (id), reg94);
2938         }
2939     }
2940   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2941 }
2942
2943
2944 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2945    Return true if a text is encoded in one of ISO-2022 based coding
2946    systems.

        In detail: mark CATEGORY_MASK_ISO as checked in DETECT_INFO, then
        scan the source bytes of CODING (skipping the first
        CODING->head_ascii bytes), accumulating two local masks of ISO
        categories, FOUND and REJECTED.

        If every ISO category becomes rejected, add CATEGORY_MASK_ISO to
        DETECT_INFO->rejected and return false.  Otherwise control reaches
        the `no_more_source' label, where REJECTED and the part of FOUND
        that was not rejected are merged into DETECT_INFO and true is
        returned.

        NOTE(review): nothing in this function jumps to `no_more_source'
        explicitly; ONE_MORE_BYTE presumably does so when the source is
        exhausted.  Confirm against its definition.  */
2947
2948 static bool
2949 detect_coding_iso_2022 (struct coding_system *coding,
2950                         struct coding_detection_info *detect_info)
2951 {
2952   const unsigned char *src = coding->source, *src_base = src;
2953   const unsigned char *src_end = coding->source + coding->src_bytes;
2954   bool multibytep = coding->src_multibyte;
       /* True while the byte(s) after a single shift are being read.  */
2955   bool single_shifting = 0;
2956   int id;
2957   int c, c1;
2958   ptrdiff_t consumed_chars = 0;
2959   int i;
       /* Masks of ISO categories ruled out / positively indicated so far.  */
2960   int rejected = 0;
2961   int found = 0;
       /* Negative outside a composition; otherwise the number of
          components counted since the composition-start escape.  */
2962   int composition_count = -1;
2963
2964   detect_info->checked |= CATEGORY_MASK_ISO;
2965
       /* Refresh the cached safe-charsets table of every ISO category
          whose id is non-negative.  */
2966   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2967     {
2968       struct coding_system *this = &(coding_categories[i]);
2969       Lisp_Object attrs, val;
2970
2971       if (this->id < 0)
2972         continue;
2973       attrs = CODING_ID_ATTRS (this->id);
2974       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2975           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2976         setup_iso_safe_charsets (attrs);
2977       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2978       this->max_charset_id = SCHARS (val) - 1;
2979       this->safe_charsets = SDATA (val);
2980     }
2981
2982   /* A coding system of this category is always ASCII compatible.  */
2983   src += coding->head_ascii;
2984
2985   while (rejected != CATEGORY_MASK_ISO)
2986     {
2987       src_base = src;
2988       ONE_MORE_BYTE (c);
2989       switch (c)
2990         {
2991         case ISO_CODE_ESC:
2992           if (inhibit_iso_escape_detection)
2993             break;
2994           single_shifting = 0;
2995           ONE_MORE_BYTE (c);
2996           if (c == 'N' || c == 'O')
2997             {
2998               /* ESC <Fe> for SS2 or SS3.  */
2999               single_shifting = 1;
3000               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3001             }
3002           else if (c == '1')
3003             {
3004               /* End of composition.  */
                   /* It is valid only if a composition was started and did
                      not collect more than MAX_COMPOSITION_COMPONENTS.  */
3005               if (composition_count < 0
3006                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3007                 /* Invalid */
3008                 break;
3009               composition_count = -1;
3010               found |= CATEGORY_MASK_ISO;
3011             }
3012           else if (c >= '0' && c <= '4')
3013             {
3014               /* ESC <Fp> for start of composition ('1', the end of
                      composition, was handled above).  Start counting
                      components.  */
3015               composition_count = 0;
3016             }
3017           else
3018             {
3019               if (c >= '(' && c <= '/')
3020                 {
3021                   /* Designation sequence for a charset of dimension 1.  */
3022                   ONE_MORE_BYTE (c1);
3023                   if (c1 < ' ' || c1 >= 0x80
3024                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3025                     {
3026                       /* Invalid designation sequence.  Just ignore.  */
3027                       if (c1 >= 0x80)
3028                         rejected |= (CATEGORY_MASK_ISO_7BIT
3029                                      | CATEGORY_MASK_ISO_7_ELSE);
3030                       break;
3031                     }
3032                 }
3033               else if (c == '$')
3034                 {
3035                   /* Designation sequence for a charset of dimension 2.  */
3036                   ONE_MORE_BYTE (c);
3037                   if (c >= '@' && c <= 'B')
3038                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other table lookups in
                            this function, this result is not checked for
                            being negative before SAFE_CHARSET_P uses it
                            below.  */
3039                     id = iso_charset_table[1][0][c];
3040                   else if (c >= '(' && c <= '/')
3041                     {
3042                       ONE_MORE_BYTE (c1);
3043                       if (c1 < ' ' || c1 >= 0x80
3044                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3045                         {
3046                           /* Invalid designation sequence.  Just ignore.  */
3047                           if (c1 >= 0x80)
3048                             rejected |= (CATEGORY_MASK_ISO_7BIT
3049                                          | CATEGORY_MASK_ISO_7_ELSE);
3050                           break;
3051                         }
3052                     }
3053                   else
3054                     {
3055                       /* Invalid designation sequence.  Just ignore it.  */
3056                       if (c >= 0x80)
3057                         rejected |= (CATEGORY_MASK_ISO_7BIT
3058                                      | CATEGORY_MASK_ISO_7_ELSE);
3059                       break;
3060                     }
3061                 }
3062               else
3063                 {
3064                   /* Invalid escape sequence.  Just ignore it.  */
3065                   if (c >= 0x80)
3066                     rejected |= (CATEGORY_MASK_ISO_7BIT
3067                                  | CATEGORY_MASK_ISO_7_ELSE);
3068                   break;
3069                 }
3070
3071               /* We found a valid designation sequence for CHARSET.  */
                   /* That rules out the 8-bit categories.  Each of the
                      remaining categories is found or rejected according
                      to whether ID is in its safe-charsets table.  */
3072               rejected |= CATEGORY_MASK_ISO_8BIT;
3073               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3074                                   id))
3075                 found |= CATEGORY_MASK_ISO_7;
3076               else
3077                 rejected |= CATEGORY_MASK_ISO_7;
3078               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3079                                   id))
3080                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3081               else
3082                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3083               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3084                                   id))
3085                 found |= CATEGORY_MASK_ISO_7_ELSE;
3086               else
3087                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3088               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3089                                   id))
3090                 found |= CATEGORY_MASK_ISO_8_ELSE;
3091               else
3092                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3093             }
3094           break;
3095
3096         case ISO_CODE_SO:
3097         case ISO_CODE_SI:
3098           /* Locking shift out/in.  */
3099           if (inhibit_iso_escape_detection)
3100             break;
3101           single_shifting = 0;
3102           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3103           break;
3104
3105         case ISO_CODE_CSI:
3106           /* Control sequence introducer.  */
3107           single_shifting = 0;
3108           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3109           found |= CATEGORY_MASK_ISO_8_ELSE;
3110           goto check_extra_latin;
3111
3112         case ISO_CODE_SS2:
3113         case ISO_CODE_SS3:
3114           /* Single shift.   */
3115           if (inhibit_iso_escape_detection)
3116             break;
3117           single_shifting = 0;
3118           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
               /* The byte counts as a single shift only for those of the
                  ISO_8_1 / ISO_8_2 categories that have single shift
                  enabled.  If neither has, treat it as a possible
                  extra-Latin byte.  */
3119           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3120               & CODING_ISO_FLAG_SINGLE_SHIFT)
3121             {
3122               found |= CATEGORY_MASK_ISO_8_1;
3123               single_shifting = 1;
3124             }
3125           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3126               & CODING_ISO_FLAG_SINGLE_SHIFT)
3127             {
3128               found |= CATEGORY_MASK_ISO_8_2;
3129               single_shifting = 1;
3130             }
3131           if (single_shifting)
3132             break;
3133           goto check_extra_latin;
3134
3135         default:
               /* NOTE(review): a negative C presumably comes from
                  ONE_MORE_BYTE on multibyte source; it is skipped without
                  affecting the masks.  Confirm against the macro.  */
3136           if (c < 0)
3137             continue;
3138           if (c < 0x80)
3139             {
                   /* A 7-bit byte counts as one composition component.  */
3140               if (composition_count >= 0)
3141                 composition_count++;
3142               single_shifting = 0;
3143               break;
3144             }
3145           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3146           if (c >= 0xA0)
3147             {
3148               found |= CATEGORY_MASK_ISO_8_1;
3149               /* Check the length of succeeding codes of the range
3150                  0xA0..0xFF.  If the byte length is even, we include
3151                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3152                  only when we are not single shifting.  */
3153               if (! single_shifting
3154                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3155                 {
                       /* LEN counts the current byte plus the following
                          bytes that are >= 0xA0.  The first byte below
                          0xA0 is pushed back by resetting SRC.  */
3156                   ptrdiff_t len = 1;
3157                   while (src < src_end)
3158                     {
3159                       src_base = src;
3160                       ONE_MORE_BYTE (c);
3161                       if (c < 0xA0)
3162                         {
3163                           src = src_base;
3164                           break;
3165                         }
3166                       len++;
3167                     }
3168
                       /* An odd-length run rejects ISO_8_2 only when it
                          stopped before the end of the source; presumably
                          a run cut off by the end of the source may
                          continue in a later chunk.  */
3169                   if (len & 1 && src < src_end)
3170                     {
3171                       rejected |= CATEGORY_MASK_ISO_8_2;
3172                       if (composition_count >= 0)
3173                         composition_count += len;
3174                     }
3175                   else
3176                     {
3177                       found |= CATEGORY_MASK_ISO_8_2;
3178                       if (composition_count >= 0)
3179                         composition_count += len / 2;
3180                     }
3181                 }
3182               break;
3183             }
               /* Reached by falling through with C in 0x80..0x9F, and by
                  goto from the CSI and single-shift cases above.  Such a
                  byte is acceptable only if Vlatin_extra_code_table is a
                  vector with a non-nil entry for it; otherwise every ISO
                  category is rejected.  */
3184         check_extra_latin:
3185           if (! VECTORP (Vlatin_extra_code_table)
3186               || NILP (AREF (Vlatin_extra_code_table, c)))
3187             {
3188               rejected = CATEGORY_MASK_ISO;
3189               break;
3190             }
3191           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3192               & CODING_ISO_FLAG_LATIN_EXTRA)
3193             found |= CATEGORY_MASK_ISO_8_1;
3194           else
3195             rejected |= CATEGORY_MASK_ISO_8_1;
3196           rejected |= CATEGORY_MASK_ISO_8_2;
3197           break;
3198         }
3199     }
       /* Every ISO category has been rejected.  */
3200   detect_info->rejected |= CATEGORY_MASK_ISO;
3201   return 0;
3202
3203  no_more_source:
       /* Report what was rejected, and what was found and not rejected.  */
3204   detect_info->rejected |= rejected;
3205   detect_info->found |= (found & ~rejected);
3206   return 1;
3207 }
3208
3209
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   FINAL is designated to graphic register REG.  CHARS_96 must be an
   lvalue: it is set to -1 when the escape sequence should be kept,
   i.e. when the designation is rejected (REG is then marked with -2)
   or when ASCII is designated to a register whose previous
   designation had been rejected.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id = -1;                                                        \
                                                                        \
    /* Look FINAL up only when it is in the range '0'..127.  */         \
    if (final >= '0' && final < 128)                                    \
      id = ISO_CHARSET_TABLE (dim, chars_96, final);                    \
    if (id < 0 || !SAFE_CHARSET_P (coding, id))                         \
      {                                                                 \
        /* No such charset, or it is not safe for this coding.  */      \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int prev = CODING_ISO_DESIGNATION (coding, reg);                \
                                                                        \
        if (id == charset_jisx0201_roman)                               \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              id = charset_ascii;                                       \
          }                                                             \
        else if (id == charset_jisx0208_1978)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              id = charset_jisx0208;                                    \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = id;                      \
        /* An ASCII designation that follows a rejected designation    \
           to the same register must be kept as well.  */               \
        if (prev == -2 && id == charset_ascii)                          \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
3242
3243
/* Handle these composition sequences (ALT: alternate char):
3245
3246    (1) relative composition: ESC 0 CHAR ... ESC 1
3247    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3248    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3249    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3250
3251    When the start sequence (ESC 0/2/3/4) is found, this annotation
3252    header is produced.
3253
3254         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3255
3256    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3257    produced until the end sequence (ESC 1) is found:
3258
3259    (1) CHAR ... CHAR
3260    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3261    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3262    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3263
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3266
3267    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3268    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3269    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3270    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3271
   If an error is found while composing, the annotation header is
   changed to:

        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3276
3277    and the sequence [ -2 DECODED-RULE ] is changed to the original
3278    byte sequence as below:
3279         o the original byte sequence is B: [ B -1 ]
3280         o the original byte sequence is B1 B2: [ B1 B2 ]
3281    and the sequence [ -1 -1 ] is changed to the original byte
3282    sequence:
3283         [ ESC '0' ]
3284 */
3285
/* Decode the composition rule whose first byte is in C1, reading one
   more byte from the source when C1 introduces a new-format rule, and
   store the encoded composition rule in RULE (which must be an
   lvalue).  Jump to invalid_code if the rule is invalid.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second;                                                     \
                                                                        \
        ONE_MORE_BYTE (second);                                         \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second - 32))   \
          goto invalid_code;                                            \
        /* Adding 0x100 distinguishes it from the old format.  */       \
        rule = 0x100 + COMPOSITION_ENCODE_RULE (rule - 81, second - 32); \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
                                                                        \
        /* Reference point 4 of the old format is stored as 10.  */     \
        rule = COMPOSITION_ENCODE_RULE ((gref == 4 ? 10 : gref),        \
                                        (nref == 4 ? 10 : nref));       \
      }                                                                 \
  } while (0)
3314
/* Turn the decoded composition rule RULE back into its byte form and
   store it in the two slots charbuf[idx] and charbuf[idx + 1] of the
   enclosing scope.  An old-format rule (RULE < 0x100) becomes a single
   byte followed by -1 and adds 1 to new_chars; a new-format rule
   becomes two bytes and adds 2 to new_chars.

   RULE is evaluated exactly once, before either slot is written, so
   it may be any expression, including one that reads the slots being
   overwritten (e.g. charbuf[idx + 1]).  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int encoded_rule = (rule);                                  \
    int gref = (encoded_rule % 0x100) / 12;                     \
    int nref = (encoded_rule % 0x100) % 12;                     \
                                                                \
    if (encoded_rule < 0x100)   /* old format */                \
      {                                                         \
        /* Reference point 10 is written as 4 in this format.  */ \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                                /* new format */        \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3334
3335 /* Finish the current composition as invalid.  */
3336
3337 static int
3338 finish_composition (int *charbuf, struct composition_status *cmp_status)
3339 {
3340   int idx = - cmp_status->length;
3341   int new_chars;
3342
3343   /* Recover the original ESC sequence */
3344   charbuf[idx++] = ISO_CODE_ESC;
3345   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3346                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3347                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3348                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3349                     : '4');
3350   charbuf[idx++] = -2;
3351   charbuf[idx++] = 0;
3352   charbuf[idx++] = -1;
3353   new_chars = cmp_status->nchars;
3354   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3355     for (; idx < 0; idx++)
3356       {
3357         int elt = charbuf[idx];
3358
3359         if (elt == -2)
3360           {
3361             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3362             idx++;
3363           }
3364         else if (elt == -1)
3365           {
3366             charbuf[idx++] = ISO_CODE_ESC;
3367             charbuf[idx] = '0';
3368             new_chars += 2;
3369           }
3370       }
3371   cmp_status->state = COMPOSING_NO;
3372   return new_chars;
3373 }
3374
/* Do nothing unless a composition is in progress; otherwise finish it
   as invalid and advance char_offset by the value that
   finish_composition returns.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3381
/* Handle composition start sequence ESC C1, where C1 is '0', '2',
   '3', or '4'.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   ESC 0 met while the components of an ESC 3 composition (state
   COMPOSING_COMPONENT_CHAR) or of an ESC 4 composition (state
   COMPOSING_COMPONENT_RULE) are being read does not start a new
   composition: [ -1 -1 ] is stored and the state becomes
   COMPOSING_CHAR.

   In every other case a composition still in progress is finished as
   invalid, and a new one is started by producing this annotation
   header of MAX_ANNOTATION_LENGTH elements:

   [ -LENGTH CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (c1 != '0'                                                          \
        || ! ((cmp_status->state == COMPOSING_COMPONENT_CHAR               \
               && cmp_status->method == COMPOSITION_WITH_ALTCHARS)         \
              || (cmp_status->state == COMPOSING_COMPONENT_RULE            \
                  && (cmp_status->method                                   \
                      == COMPOSITION_WITH_RULE_ALTCHARS))))                \
      {                                                                    \
        /* Start a new composition.  */                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        switch (c1)                                                        \
          {                                                                \
          case '0':                                                        \
            cmp_status->method = COMPOSITION_RELATIVE;                     \
            break;                                                         \
          case '2':                                                        \
            cmp_status->method = COMPOSITION_WITH_RULE;                    \
            break;                                                         \
          case '3':                                                        \
            cmp_status->method = COMPOSITION_WITH_ALTCHARS;                \
            break;                                                         \
          default:                                                         \
            cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;           \
            break;                                                         \
          }                                                                \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* End of the alternate chars; composed chars follow.  */          \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
  } while (0)
3422
3423
/* Handle composition end sequence ESC 1.

   The composition is finished as invalid, with a jump to
   invalid_code, when no composed char has been stored or when the
   state does not fit the method (for COMPOSITION_WITH_RULE the state
   must not be COMPOSING_CHAR; for the other methods it must be).
   Otherwise the annotation header is completed: for the two altchar
   methods the (negative) LENGTH slot is extended by the space the
   components took, NCHARS is filled in, and char_offset advances by
   the number of composed chars.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *header;                                                        \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state == COMPOSING_CHAR)                       \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    /* The header is CMP_STATUS->length elements behind CHARBUF.  */    \
    header = charbuf - cmp_status->length;                              \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header[0] -= cmp_status->ncomps + 2;                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header[0] -= cmp_status->ncomps * 3;                              \
    header[2] = cmp_status->nchars;                                     \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3443
/* Append the pair [ -2 RULE ] to charbuf for the composition rule
   RULE, add 2 to cmp_status->length, and step cmp_status->state back
   by one (presumably from a RULE state to the matching CHAR state;
   this depends on the order of the state enumeration).  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
3453
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: the length grows by one, and C is counted in
   nchars when the state is COMPOSING_CHAR and in ncomps otherwise.
   When a rule is expected next (always for COMPOSITION_WITH_RULE, and
   after a component char for COMPOSITION_WITH_RULE_ALTCHARS), the
   state is advanced by one.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length++;                                               \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_RULE:                                       \
        cmp_status->state++;                                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        if (cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
          cmp_status->state++;                                          \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
  } while (0)
3470
3471
3472 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3473
3474 static void
3475 decode_coding_iso_2022 (struct coding_system *coding)
3476 {
3477   const unsigned char *src = coding->source + coding->consumed;
3478   const unsigned char *src_end = coding->source + coding->src_bytes;
3479   const unsigned char *src_base;
3480   int *charbuf = coding->charbuf + coding->charbuf_used;
3481   /* We may produce two annotations (charset and composition) in one
3482      loop and one more charset annotation at the end.  */
3483   int *charbuf_end
3484     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3485   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3486   bool multibytep = coding->src_multibyte;
3487   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3488   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3489   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3490   int charset_id_2, charset_id_3;
3491   struct charset *charset;
3492   int c;
3493   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3494   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3495   ptrdiff_t char_offset = coding->produced_char;
3496   ptrdiff_t last_offset = char_offset;
3497   int last_id = charset_ascii;
3498   bool eol_dos
3499     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3500   int byte_after_cr = -1;
3501   int i;
3502
3503   setup_iso_safe_charsets (attrs);
3504   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3505
3506   if (cmp_status->state != COMPOSING_NO)
3507     {
3508       if (charbuf_end - charbuf < cmp_status->length)
3509         emacs_abort ();
3510       for (i = 0; i < cmp_status->length; i++)
3511         *charbuf++ = cmp_status->carryover[i];
3512       coding->annotated = 1;
3513     }
3514
3515   while (1)
3516     {
3517       int c1, c2, c3;
3518
3519       src_base = src;
3520       consumed_chars_base = consumed_chars;
3521
3522       if (charbuf >= charbuf_end)
3523         {
3524           if (byte_after_cr >= 0)
3525             src_base--;
3526           break;
3527         }
3528
3529       if (byte_after_cr >= 0)
3530         c1 = byte_after_cr, byte_after_cr = -1;
3531       else
3532         ONE_MORE_BYTE (c1);
3533       if (c1 < 0)
3534         goto invalid_code;
3535
3536       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3537         {
3538           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3539           char_offset++;
3540           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3541           continue;
3542         }
3543
3544       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3545         {
3546           if (c1 == ISO_CODE_ESC)
3547             {
3548               if (src + 1 >= src_end)
3549                 goto no_more_source;
3550               *charbuf++ = ISO_CODE_ESC;
3551               char_offset++;
3552               if (src[0] == '%' && src[1] == '@')
3553                 {
3554                   src += 2;
3555                   consumed_chars += 2;
3556                   char_offset += 2;
3557                   /* We are sure charbuf can contain two more chars. */
3558                   *charbuf++ = '%';
3559                   *charbuf++ = '@';
3560                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3561                 }
3562             }
3563           else
3564             {
3565               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3566               char_offset++;
3567             }
3568           continue;
3569         }
3570
3571       if ((cmp_status->state == COMPOSING_RULE
3572            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3573           && c1 != ISO_CODE_ESC)
3574         {
3575           int rule;
3576
3577           DECODE_COMPOSITION_RULE (rule);
3578           STORE_COMPOSITION_RULE (rule);
3579           continue;
3580         }
3581
3582       /* We produce at most one character.  */
3583       switch (iso_code_class [c1])
3584         {
3585         case ISO_0x20_or_0x7F:
3586           if (charset_id_0 < 0
3587               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3588             /* This is SPACE or DEL.  */
3589             charset = CHARSET_FROM_ID (charset_ascii);
3590           else
3591             charset = CHARSET_FROM_ID (charset_id_0);
3592           break;
3593
3594         case ISO_graphic_plane_0:
3595           if (charset_id_0 < 0)
3596             charset = CHARSET_FROM_ID (charset_ascii);
3597           else
3598             charset = CHARSET_FROM_ID (charset_id_0);
3599           break;
3600
3601         case ISO_0xA0_or_0xFF:
3602           if (charset_id_1 < 0
3603               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3604               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3605             goto invalid_code;
3606           /* This is a graphic character, we fall down ... */
3607           FALLTHROUGH;
3608         case ISO_graphic_plane_1:
3609           if (charset_id_1 < 0)
3610             goto invalid_code;
3611           charset = CHARSET_FROM_ID (charset_id_1);
3612           break;
3613
3614         case ISO_control_0:
3615           if (eol_dos && c1 == '\r')
3616             ONE_MORE_BYTE (byte_after_cr);
3617           MAYBE_FINISH_COMPOSITION ();
3618           charset = CHARSET_FROM_ID (charset_ascii);
3619           break;
3620
3621         case ISO_control_1:
3622           goto invalid_code;
3623
3624         case ISO_shift_out:
3625           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3626               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3627             goto invalid_code;
3628           CODING_ISO_INVOCATION (coding, 0) = 1;
3629           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3630           continue;
3631
3632         case ISO_shift_in:
3633           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3634             goto invalid_code;
3635           CODING_ISO_INVOCATION (coding, 0) = 0;
3636           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3637           continue;
3638
3639         case ISO_single_shift_2_7:
3640           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3641             goto invalid_code;
3642           FALLTHROUGH;
3643         case ISO_single_shift_2:
3644           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3645             goto invalid_code;
3646           /* SS2 is handled as an escape sequence of ESC 'N' */
3647           c1 = 'N';
3648           goto label_escape_sequence;
3649
3650         case ISO_single_shift_3:
3651           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3652             goto invalid_code;
          /* SS3 is handled as an escape sequence of ESC 'O' */
3654           c1 = 'O';
3655           goto label_escape_sequence;
3656
3657         case ISO_control_sequence_introducer:
3658           /* CSI is handled as an escape sequence of ESC '[' ...  */
3659           c1 = '[';
3660           goto label_escape_sequence;
3661
3662         case ISO_escape:
3663           ONE_MORE_BYTE (c1);
3664         label_escape_sequence:
3665           /* Escape sequences handled here are invocation,
3666              designation, direction specification, and character
3667              composition specification.  */
3668           switch (c1)
3669             {
3670             case '&':           /* revision of following character set */
3671               ONE_MORE_BYTE (c1);
3672               if (!(c1 >= '@' && c1 <= '~'))
3673                 goto invalid_code;
3674               ONE_MORE_BYTE (c1);
3675               if (c1 != ISO_CODE_ESC)
3676                 goto invalid_code;
3677               ONE_MORE_BYTE (c1);
3678               goto label_escape_sequence;
3679
3680             case '$':           /* designation of 2-byte character set */
3681               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3682                 goto invalid_code;
3683               {
3684                 int reg, chars96;
3685
3686                 ONE_MORE_BYTE (c1);
3687                 if (c1 >= '@' && c1 <= 'B')
3688                   {     /* designation of JISX0208.1978, GB2312.1980,
3689                            or JISX0208.1980 */
3690                     reg = 0, chars96 = 0;
3691                   }
3692                 else if (c1 >= 0x28 && c1 <= 0x2B)
3693                   { /* designation of DIMENSION2_CHARS94 character set */
3694                     reg = c1 - 0x28, chars96 = 0;
3695                     ONE_MORE_BYTE (c1);
3696                   }
3697                 else if (c1 >= 0x2C && c1 <= 0x2F)
3698                   { /* designation of DIMENSION2_CHARS96 character set */
3699                     reg = c1 - 0x2C, chars96 = 1;
3700                     ONE_MORE_BYTE (c1);
3701                   }
3702                 else
3703                   goto invalid_code;
3704                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3705                 /* We must update these variables now.  */
3706                 if (reg == 0)
3707                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3708                 else if (reg == 1)
3709                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3710                 if (chars96 < 0)
3711                   goto invalid_code;
3712               }
3713               continue;
3714
3715             case 'n':           /* invocation of locking-shift-2 */
3716               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3717                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3718                 goto invalid_code;
3719               CODING_ISO_INVOCATION (coding, 0) = 2;
3720               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3721               continue;
3722
3723             case 'o':           /* invocation of locking-shift-3 */
3724               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3725                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3726                 goto invalid_code;
3727               CODING_ISO_INVOCATION (coding, 0) = 3;
3728               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3729               continue;
3730
3731             case 'N':           /* invocation of single-shift-2 */
3732               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3733                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3734                 goto invalid_code;
3735               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3736               if (charset_id_2 < 0)
3737                 charset = CHARSET_FROM_ID (charset_ascii);
3738               else
3739                 charset = CHARSET_FROM_ID (charset_id_2);
3740               ONE_MORE_BYTE (c1);
3741               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3742                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3743                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3744                           ? c1 >= 0x80 : c1 < 0x80)))
3745                 goto invalid_code;
3746               break;
3747
3748             case 'O':           /* invocation of single-shift-3 */
3749               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3750                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3751                 goto invalid_code;
3752               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3753               if (charset_id_3 < 0)
3754                 charset = CHARSET_FROM_ID (charset_ascii);
3755               else
3756                 charset = CHARSET_FROM_ID (charset_id_3);
3757               ONE_MORE_BYTE (c1);
3758               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3759                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3760                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3761                           ? c1 >= 0x80 : c1 < 0x80)))
3762                 goto invalid_code;
3763               break;
3764
3765             case '0': case '2': case '3': case '4': /* start composition */
3766               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3767                 goto invalid_code;
3768               if (last_id != charset_ascii)
3769                 {
3770                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3771                   last_id = charset_ascii;
3772                   last_offset = char_offset;
3773                 }
3774               DECODE_COMPOSITION_START (c1);
3775               continue;
3776
3777             case '1':           /* end composition */
3778               if (cmp_status->state == COMPOSING_NO)
3779                 goto invalid_code;
3780               DECODE_COMPOSITION_END ();
3781               continue;
3782
3783             case '[':           /* specification of direction */
3784               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3785                 goto invalid_code;
3786               /* For the moment, nested direction is not supported.
3787                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3788                  left-to-right, and nonzero means right-to-left.  */
3789               ONE_MORE_BYTE (c1);
3790               switch (c1)
3791                 {
3792                 case ']':       /* end of the current direction */
3793                   coding->mode &= ~CODING_MODE_DIRECTION;
3794                   break;
3795
3796                 case '0':       /* end of the current direction */
3797                 case '1':       /* start of left-to-right direction */
3798                   ONE_MORE_BYTE (c1);
3799                   if (c1 == ']')
3800                     coding->mode &= ~CODING_MODE_DIRECTION;
3801                   else
3802                     goto invalid_code;
3803                   break;
3804
3805                 case '2':       /* start of right-to-left direction */
3806                   ONE_MORE_BYTE (c1);
3807                   if (c1 == ']')
3808                     coding->mode |= CODING_MODE_DIRECTION;
3809                   else
3810                     goto invalid_code;
3811                   break;
3812
3813                 default:
3814                   goto invalid_code;
3815                 }
3816               continue;
3817
3818             case '%':
3819               ONE_MORE_BYTE (c1);
3820               if (c1 == '/')
3821                 {
3822                   /* CTEXT extended segment:
3823                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3824                      We keep these bytes as is for the moment.
3825                      They may be decoded by post-read-conversion.  */
3826                   int dim, M, L;
3827                   int size;
3828
3829                   ONE_MORE_BYTE (dim);
3830                   if (dim < '0' || dim > '4')
3831                     goto invalid_code;
3832                   ONE_MORE_BYTE (M);
3833                   if (M < 128)
3834                     goto invalid_code;
3835                   ONE_MORE_BYTE (L);
3836                   if (L < 128)
3837                     goto invalid_code;
3838                   size = ((M - 128) * 128) + (L - 128);
3839                   if (charbuf + 6 > charbuf_end)
3840                     goto break_loop;
3841                   *charbuf++ = ISO_CODE_ESC;
3842                   *charbuf++ = '%';
3843                   *charbuf++ = '/';
3844                   *charbuf++ = dim;
3845                   *charbuf++ = BYTE8_TO_CHAR (M);
3846                   *charbuf++ = BYTE8_TO_CHAR (L);
3847                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3848                 }
3849               else if (c1 == 'G')
3850                 {
3851                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3852                      ESC % G --UTF-8-BYTES-- ESC % @
3853                      We keep these bytes as is for the moment.
3854                      They may be decoded by post-read-conversion.  */
3855                   if (charbuf + 3 > charbuf_end)
3856                     goto break_loop;
3857                   *charbuf++ = ISO_CODE_ESC;
3858                   *charbuf++ = '%';
3859                   *charbuf++ = 'G';
3860                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3861                 }
3862               else
3863                 goto invalid_code;
3864               continue;
3865               break;
3866
3867             default:
3868               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3869                 goto invalid_code;
3870               {
3871                 int reg, chars96;
3872
3873                 if (c1 >= 0x28 && c1 <= 0x2B)
3874                   { /* designation of DIMENSION1_CHARS94 character set */
3875                     reg = c1 - 0x28, chars96 = 0;
3876                     ONE_MORE_BYTE (c1);
3877                   }
3878                 else if (c1 >= 0x2C && c1 <= 0x2F)
3879                   { /* designation of DIMENSION1_CHARS96 character set */
3880                     reg = c1 - 0x2C, chars96 = 1;
3881                     ONE_MORE_BYTE (c1);
3882                   }
3883                 else
3884                   goto invalid_code;
3885                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3886                 /* We must update these variables now.  */
3887                 if (reg == 0)
3888                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3889                 else if (reg == 1)
3890                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3891                 if (chars96 < 0)
3892                   goto invalid_code;
3893               }
3894               continue;
3895             }
3896           break;
3897
3898         default:
3899           emacs_abort ();
3900         }
3901
3902       if (cmp_status->state == COMPOSING_NO
3903           && charset->id != charset_ascii
3904           && last_id != charset->id)
3905         {
3906           if (last_id != charset_ascii)
3907             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3908           last_id = charset->id;
3909           last_offset = char_offset;
3910         }
3911
3912       /* Now we know CHARSET and 1st position code C1 of a character.
3913          Produce a decoded character while getting 2nd and 3rd
3914          position codes C2, C3 if necessary.  */
3915       if (CHARSET_DIMENSION (charset) > 1)
3916         {
3917           ONE_MORE_BYTE (c2);
3918           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3919               || ((c1 & 0x80) != (c2 & 0x80)))
3920             /* C2 is not in a valid range.  */
3921             goto invalid_code;
3922           if (CHARSET_DIMENSION (charset) == 2)
3923             c1 = (c1 << 8) | c2;
3924           else
3925             {
3926               ONE_MORE_BYTE (c3);
3927               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3928                   || ((c1 & 0x80) != (c3 & 0x80)))
3929                 /* C3 is not in a valid range.  */
3930                 goto invalid_code;
3931               c1 = (c1 << 16) | (c2 << 8) | c2;
3932             }
3933         }
3934       c1 &= 0x7F7F7F;
3935       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3936       if (c < 0)
3937         {
3938           MAYBE_FINISH_COMPOSITION ();
3939           for (; src_base < src; src_base++, char_offset++)
3940             {
3941               if (ASCII_CHAR_P (*src_base))
3942                 *charbuf++ = *src_base;
3943               else
3944                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3945             }
3946         }
3947       else if (cmp_status->state == COMPOSING_NO)
3948         {
3949           *charbuf++ = c;
3950           char_offset++;
3951         }
3952       else if ((cmp_status->state == COMPOSING_CHAR
3953                 ? cmp_status->nchars
3954                 : cmp_status->ncomps)
3955                >= MAX_COMPOSITION_COMPONENTS)
3956         {
3957           /* Too long composition.  */
3958           MAYBE_FINISH_COMPOSITION ();
3959           *charbuf++ = c;
3960           char_offset++;
3961         }
3962       else
3963         STORE_COMPOSITION_CHAR (c);
3964       continue;
3965
3966     invalid_code:
3967       MAYBE_FINISH_COMPOSITION ();
3968       src = src_base;
3969       consumed_chars = consumed_chars_base;
3970       ONE_MORE_BYTE (c);
3971       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3972       char_offset++;
3973       /* Reset the invocation and designation status to the safest
3974          one; i.e. designate ASCII to the graphic register 0, and
3975          invoke that register to the graphic plane 0.  This typically
3976          helps the case that a designation sequence for ASCII "ESC (
3977          B" is somehow broken (e.g. broken by a newline).  */
3978       CODING_ISO_INVOCATION (coding, 0) = 0;
3979       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3980       charset_id_0 = charset_ascii;
3981       continue;
3982
3983     break_loop:
3984       break;
3985     }
3986
3987  no_more_source:
3988   if (cmp_status->state != COMPOSING_NO)
3989     {
3990       if (coding->mode & CODING_MODE_LAST_BLOCK)
3991         MAYBE_FINISH_COMPOSITION ();
3992       else
3993         {
3994           charbuf -= cmp_status->length;
3995           for (i = 0; i < cmp_status->length; i++)
3996             cmp_status->carryover[i] = charbuf[i];
3997         }
3998     }
3999   else if (last_id != charset_ascii)
4000     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4001   coding->consumed_char += consumed_chars_base;
4002   coding->consumed = src_base - coding->source;
4003   coding->charbuf_used = charbuf - coding->charbuf;
4004 }
4005
4006
4007 /* ISO2022 encoding stuff.  */
4008
4009 /*
4010    It is not enough to say just "ISO2022" on encoding, we have to
4011    specify more details.  In Emacs, each coding system of ISO2022
4012    variant has the following specifications:
4013         1. Initial designation to G0 thru G3.
4014         2. Allows short-form designation?
4015         3. ASCII should be designated to G0 before control characters?
4016         4. ASCII should be designated to G0 at end of line?
4017         5. 7-bit environment or 8-bit environment?
4018         6. Use locking-shift?
4019         7. Use Single-shift?
4020    And the following two are only for Japanese:
4021         8. Use ASCII in place of JIS0201-1976-Roman?
4022         9. Use JISX0208-1983 in place of JISX0208-1978?
4023    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4024    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4025    details.
4026 */
4027
4028 /* Produce codes (escape sequence) for designating CHARSET to graphic
4029    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4030    '@', 'A', or 'B' and the coding system CODING allows, produce
4031    designation sequence of short-form.

        The bytes are produced in this order:
          1. "ESC & F" with F = '@' + revision, only when CODING has
             CODING_ISO_FLAG_REVISION set and CHARSET_ISO_REVISION (CHARSET)
             is not negative.
          2. ESC.
          3. When CHARSET_DIMENSION (CHARSET) is 1: one intermediate byte,
             taken at index REG from "()*+" for a 94-character set or from
             ",-./" for a 96-character set.
             Otherwise: '$', followed by the intermediate byte chosen the
             same way.  For a 94-character set that intermediate byte is
             left out (the short form) unless CODING_ISO_FLAG_LONG_FORM is
             set, REG is not 0, or the final char is outside '@'..'B'.
          4. The final char of CHARSET.

        Finally CHARSET is recorded in CODING as the charset designated to
        REG.  REG indexes the four-byte intermediate tables, so it has to
        be in the range 0..3.  The macro expands to EMIT_* calls and hence
        depends on whatever variables those macros use in the expanding
        scope.  */
4032
4033 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4034   do {                                                                  \
4035     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4036     const char *intermediate_char_94 = "()*+";                          \
4037     const char *intermediate_char_96 = ",-./";                          \
4038     int revision = -1;                                                  \
4039                                                                         \
4040     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4041       revision = CHARSET_ISO_REVISION (charset);                        \
4042                                                                         \
4043     if (revision >= 0)                                                  \
4044       {                                                                 \
4045         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4046         EMIT_ONE_BYTE ('@' + revision);                                 \
4047       }                                                                 \
4048     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4049     if (CHARSET_DIMENSION (charset) == 1)                               \
4050       {                                                                 \
4051         int b;                                                          \
4052         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4053           b = intermediate_char_94[reg];                                \
4054         else                                                            \
4055           b = intermediate_char_96[reg];                                \
4056         EMIT_ONE_ASCII_BYTE (b);                                        \
4057       }                                                                 \
4058     else                                                                \
4059       {                                                                 \
4060         EMIT_ONE_ASCII_BYTE ('$');                                      \
4061         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4062           {                                                             \
4063             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4064                 || reg != 0                                             \
4065                 || final_char < '@' || final_char > 'B')                \
4066               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4067           }                                                             \
4068         else                                                            \
4069           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4070       }                                                                 \
4071     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4072                                                                         \
4073     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4074   } while (0)
4075
4076
4077 /* The following two macros produce codes (control character or escape
4078    sequence) for ISO2022 single-shift functions (single-shift-2 and
4079    single-shift-3).

        Each one emits the two-byte escape sequence (ESC 'N' or ESC 'O')
        when CODING_ISO_FLAG_SEVEN_BITS is set in the flags of the variable
        coding of the expanding scope, and the single control byte
        (ISO_CODE_SS2 or ISO_CODE_SS3) otherwise.  It then sets
        CODING_ISO_SINGLE_SHIFTING (coding) to 1 so that the code emitting
        the next character knows a single shift is pending.  */
4080
4081 #define ENCODE_SINGLE_SHIFT_2                                           \
4082   do {                                                                  \
4083     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4084       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4085     else                                                                \
4086       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4087     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4088   } while (0)
4089
4090 /* Single-shift-3: emit ESC 'O' when CODING_ISO_FLAG_SEVEN_BITS is set
        in the flags of coding, otherwise the single byte ISO_CODE_SS3;
        then mark a single shift as pending in coding.  */
4091 #define ENCODE_SINGLE_SHIFT_3                                           \
4092   do {                                                                  \
4093     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4094       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4095     else                                                                \
4096       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4097     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4098   } while (0)
4099
4100
4101 /* The following four macros produce codes (control character or
4102    escape sequence) for ISO2022 locking-shift functions (shift-in,
4103    shift-out, locking-shift-2, and locking-shift-3).

        Each one emits its shift code and then stores, in
        CODING_ISO_INVOCATION (coding, 0), the number of the graphic
        register that is now invoked to graphic plane 0.  The variable
        coding is taken from the expanding scope.

        ENCODE_SHIFT_IN emits the single byte ISO_CODE_SI and records
        graphic register 0.  */
4104
4105 #define ENCODE_SHIFT_IN                                 \
4106   do {                                                  \
4107     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4108     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4109   } while (0)
4110
4111 /* Shift-out: emit the single byte ISO_CODE_SO and record that graphic
        register 1 is invoked to graphic plane 0.  */
4112 #define ENCODE_SHIFT_OUT                                \
4113   do {                                                  \
4114     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4115     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4116   } while (0)
4117
4118 /* Locking-shift-2: emit the escape sequence ESC 'n' and record that
        graphic register 2 is invoked to graphic plane 0.  */
4119 #define ENCODE_LOCKING_SHIFT_2                          \
4120   do {                                                  \
4121     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4122     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4123   } while (0)
4124
4125 /* Locking-shift-3 (LS3).  ISO/IEC 2022 assigns ESC 'o' to LS3; ESC 'n'
        is locking-shift-2, so emitting 'n' here would make a decoder
        invoke graphic register 2 while CODING records register 3.  Emit
        ESC 'o' and record that graphic register 3 is invoked to graphic
        plane 0.  The variable coding is taken from the expanding scope.  */
4126 #define ENCODE_LOCKING_SHIFT_3                          \
4127   do {                                                  \
4128     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4129     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4130   } while (0)
4131
4132
4133 /* Produce codes for a DIMENSION1 character whose character set is
4134    CHARSET and whose position-code is C1.  Designation and invocation
4135    sequences are also produced in advance if necessary.

        The body is a do ... while (1) loop: every branch that emits the
        character leaves the loop with break, while the last branch calls
        encode_invocation_designation and then loops to try again.

        - If CODING_ISO_FLAG_USE_ROMAN is set and CHARSET is ASCII, CHARSET
          is replaced by the charset whose id is charset_jisx0201_roman
          (so CHARSET must be an assignable lvalue).
        - If a single shift is pending, C1 is emitted with the 8th bit
          cleared when CODING_ISO_FLAG_SEVEN_BITS is set and with the 8th
          bit set otherwise, and the pending state is cleared.
        - If CHARSET is the charset invoked to graphic plane 0, C1 is
          emitted with the 8th bit cleared; if it is the one invoked to
          graphic plane 1, with the 8th bit set.

        The variables coding, dst and produced_chars are taken from the
        expanding scope.  */
4136
4137 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4138   do {                                                                  \
4139     int id = CHARSET_ID (charset);                                      \
4140                                                                         \
4141     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4142         && id == charset_ascii)                                         \
4143       {                                                                 \
4144         id = charset_jisx0201_roman;                                    \
4145         charset = CHARSET_FROM_ID (id);                                 \
4146       }                                                                 \
4147                                                                         \
4148     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4149       {                                                                 \
4150         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4151           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4152         else                                                            \
4153           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4154         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4155         break;                                                          \
4156       }                                                                 \
4157     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4158       {                                                                 \
4159         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4160         break;                                                          \
4161       }                                                                 \
4162     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4163       {                                                                 \
4164         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4165         break;                                                          \
4166       }                                                                 \
4167     else                                                                \
4168       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4169          must invoke it, or, at first, designate it to some graphic     \
4170          register.  Then repeat the loop to actually produce the        \
4171          character.  */                                                 \
4172       dst = encode_invocation_designation (charset, coding, dst,        \
4173                                            &produced_chars);            \
4174   } while (1)
4175
4176
4177 /* Produce codes for a DIMENSION2 character whose character set is
4178    CHARSET and whose position-codes are C1 and C2.  Designation and
4179    invocation codes are also produced in advance if necessary.

        The body is a do ... while (1) loop: every branch that emits the
        two bytes leaves the loop with break, while the last branch calls
        encode_invocation_designation and then loops to try again.

        - If CODING_ISO_FLAG_USE_OLDJIS is set and CHARSET is the one whose
          id is charset_jisx0208, CHARSET is replaced by the charset whose
          id is charset_jisx0208_1978 (so CHARSET must be an assignable
          lvalue).
        - If a single shift is pending, C1 and C2 are emitted with the 8th
          bit cleared when CODING_ISO_FLAG_SEVEN_BITS is set and with the
          8th bit set otherwise, and the pending state is cleared.
        - If CHARSET is the charset invoked to graphic plane 0, both bytes
          are emitted with the 8th bit cleared; if it is the one invoked
          to graphic plane 1, with the 8th bit set.

        The variables coding, dst and produced_chars are taken from the
        expanding scope.  */
4180
4181 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4182   do {                                                                  \
4183     int id = CHARSET_ID (charset);                                      \
4184                                                                         \
4185     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4186         && id == charset_jisx0208)                                      \
4187       {                                                                 \
4188         id = charset_jisx0208_1978;                                     \
4189         charset = CHARSET_FROM_ID (id);                                 \
4190       }                                                                 \
4191                                                                         \
4192     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4193       {                                                                 \
4194         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4195           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4196         else                                                            \
4197           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4198         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4199         break;                                                          \
4200       }                                                                 \
4201     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4202       {                                                                 \
4203         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4204         break;                                                          \
4205       }                                                                 \
4206     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4207       {                                                                 \
4208         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4209         break;                                                          \
4210       }                                                                 \
4211     else                                                                \
4212       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4213          must invoke it, or, at first, designate it to some graphic     \
4214          register.  Then repeat the loop to actually produce the        \
4215          character.  */                                                 \
4216       dst = encode_invocation_designation (charset, coding, dst,        \
4217                                            &produced_chars);            \
4218   } while (1)
4219
4220 /* Produce codes for character C of character set CHARSET.
        CODING_ENCODE_CHAR stores the code point of C in CHARSET into the
        local variable code.  A CHARSET of dimension 1 is then handed to
        ENCODE_ISO_CHARACTER_DIMENSION1 with that code; any other
        dimension is handed to ENCODE_ISO_CHARACTER_DIMENSION2 with
        code >> 8 as the first and code & 0xFF as the second
        position-code.  CHARSET must be an assignable lvalue because
        those two macros may assign to it.  The variables coding, dst and
        dst_end are taken from the expanding scope.  */
4221 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4222   do {                                                                     \
4223     unsigned code;                                                         \
4224     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4225                                                                            \
4226     if (CHARSET_DIMENSION (charset) == 1)                                  \
4227       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4228     else                                                                   \
4229       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4230   } while (0)
4231
4232
4233 /* Produce designation and invocation codes at a place pointed by DST
4234    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4235    Return new DST.

        *P_NCHARS is copied into the local produced_chars on entry and
        stored back from it on exit.  Neither produced_chars nor
        multibytep is used directly by the statements below; presumably
        the EMIT_* macros (defined elsewhere in this file) refer to them.

        The work is done in two steps:
          1. Look for a graphic register to which CHARSET is already
             designated.  If there is none, designate CHARSET to the
             register it requests in CODING, or to register 0 when it
             requests none.
          2. If that register is invoked to neither graphic plane 0 nor
             graphic plane 1, emit the shift function that invokes it.  */
4236
4237 static unsigned char *
4238 encode_invocation_designation (struct charset *charset,
4239                                struct coding_system *coding,
4240                                unsigned char *dst, ptrdiff_t *p_nchars)
4241 {
4242   bool multibytep = coding->dst_multibyte;
4243   ptrdiff_t produced_chars = *p_nchars;
4244   int reg;                      /* graphic register number */
4245   int id = CHARSET_ID (charset);
4246
4247   /* At first, check designations.  */
4248   for (reg = 0; reg < 4; reg++)
4249     if (id == CODING_ISO_DESIGNATION (coding, reg))
4250       break;
4251
       /* REG is 4 here exactly when the loop above found no register.  */
4252   if (reg >= 4)
4253     {
4254       /* CHARSET is not yet designated to any graphic registers.  */
4255       /* At first check the requested designation.  */
4256       reg = CODING_ISO_REQUEST (coding, id);
4257       if (reg < 0)
4258         /* Since CHARSET requests no special designation, designate it
4259            to graphic register 0.  */
4260         reg = 0;
4261
4262       ENCODE_DESIGNATION (charset, reg, coding);
4263     }
4264
4265   if (CODING_ISO_INVOCATION (coding, 0) != reg
4266       && CODING_ISO_INVOCATION (coding, 1) != reg)
4267     {
4268       /* Since the graphic register REG is not invoked to any graphic
4269          planes, invoke it to graphic plane 0.  */
4270       switch (reg)
4271         {
4272         case 0:                 /* graphic register 0 */
4273           ENCODE_SHIFT_IN;
4274           break;
4275
4276         case 1:                 /* graphic register 1 */
4277           ENCODE_SHIFT_OUT;
4278           break;
4279
             /* For registers 2 and 3 the coding system flag
                CODING_ISO_FLAG_SINGLE_SHIFT selects a single shift
                instead of a locking shift.  */
4280         case 2:                 /* graphic register 2 */
4281           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4282             ENCODE_SINGLE_SHIFT_2;
4283           else
4284             ENCODE_LOCKING_SHIFT_2;
4285           break;
4286
4287         case 3:                 /* graphic register 3 */
4288           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4289             ENCODE_SINGLE_SHIFT_3;
4290           else
4291             ENCODE_LOCKING_SHIFT_3;
4292           break;
4293
4294         default:
4295           break;
4296         }
4297     }
4298
4299   *p_nchars = produced_chars;
4300   return dst;
4301 }
4302
4303
4304 /* Produce codes for designation and invocation to reset the graphic
4305    planes and registers to initial state.

        First, if graphic plane 0 does not invoke graphic register 0, a
        shift-in is emitted.  Then, for each of the four graphic registers
        whose initial charset in coding is not negative and differs from
        the charset currently designated to it, the initial charset is
        designated again.  Registers without an initial charset are left
        alone.  The variable coding is taken from the expanding scope.  */
4306 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4307   do {                                                                  \
4308     int reg;                                                            \
4309     struct charset *charset;                                            \
4310                                                                         \
4311     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4312       ENCODE_SHIFT_IN;                                                  \
4313     for (reg = 0; reg < 4; reg++)                                       \
4314       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4315           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4316               != CODING_ISO_INITIAL (coding, reg)))                     \
4317         {                                                               \
4318           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4319           ENCODE_DESIGNATION (charset, reg, coding);                    \
4320         }                                                               \
4321   } while (0)
4322
4323
4324 /* Produce designation sequences of charsets in the line started from
4325    CHARBUF to a place pointed by DST, and return the number of
4326    produced bytes.  DST should not directly point a buffer text area
4327    which may be relocated by char_charset call.
4328
4329    If the current block ends before any end-of-line, we may fail to
4330    find all the necessary designations.

        The characters from CHARBUF up to (not including) the first
        newline, or up to CHARBUF_END, are scanned.  For each one the
        charset is looked up in the charset list of CODING (or in
        Viso_2022_charset_list when that list is the symbol Qiso_2022),
        and the graphic register requested for that charset is recorded
        unless an earlier character already claimed the register.  The
        scan also stops once four registers have been claimed.
        Afterwards a designation sequence is emitted for every claimed
        register whose current designation in CODING differs.

        NOTE(review): the size of DST is not checked here, so the caller
        has to supply a buffer that is large enough for all the sequences
        produced -- confirm against the caller.
        NOTE(review): the value returned by char_charset is passed to
        CHARSET_ID unchecked; confirm that it cannot be NULL for the
        charset list used here.  */
4331
4332 static ptrdiff_t
4333 encode_designation_at_bol (struct coding_system *coding,
4334                            int *charbuf, int *charbuf_end,
4335                            unsigned char *dst)
4336 {
4337   unsigned char *orig = dst;
4338   struct charset *charset;
4339   /* Table of charsets to be designated to each graphic register.  */
4340   int r[4];
4341   int c, found = 0, reg;
       /* produced_chars and multibytep are not used directly below;
          presumably the EMIT_* macros expanded from ENCODE_DESIGNATION
          refer to them.  */
4342   ptrdiff_t produced_chars = 0;
4343   bool multibytep = coding->dst_multibyte;
4344   Lisp_Object attrs;
4345   Lisp_Object charset_list;
4346
4347   attrs = CODING_ID_ATTRS (coding->id);
4348   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4349   if (EQ (charset_list, Qiso_2022))
4350     charset_list = Viso_2022_charset_list;
4351
       /* -1 marks a register for which no charset was found yet.  */
4352   for (reg = 0; reg < 4; reg++)
4353     r[reg] = -1;
4354
4355   while (charbuf < charbuf_end && found < 4)
4356     {
4357       int id;
4358
4359       c = *charbuf++;
4360       if (c == '\n')
4361         break;
4362       charset = char_charset (c, charset_list, NULL);
4363       id = CHARSET_ID (charset);
4364       reg = CODING_ISO_REQUEST (coding, id);
           /* Only the first charset seen for a register is kept.  */
4365       if (reg >= 0 && r[reg] < 0)
4366         {
4367           found++;
4368           r[reg] = id;
4369         }
4370     }
4371
4372   if (found)
4373     {
4374       for (reg = 0; reg < 4; reg++)
4375         if (r[reg] >= 0
4376             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4377           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4378     }
4379
4380   return dst - orig;
4381 }
4382
4383 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the elements of CODING->charbuf (CODING->charbuf_used of
        them) as ISO 2022 bytes, writing them after the CODING->produced
        bytes already present in CODING->destination.  Negative elements
        introduce annotations; the other elements are characters.

        On the normal exit path the conversion result is recorded as
        CODING_RESULT_SUCCESS, CODING->produced_char and CODING->produced
        are updated, the beginning-of-line designation state is saved in
        CODING, and 0 is returned.  ASSURE_DESTINATION is defined
        elsewhere; it presumably makes sure that at least SAFE_ROOM bytes
        are available at DST and may leave the loop when they are not.  */
4384
4385 static bool
4386 encode_coding_iso_2022 (struct coding_system *coding)
4387 {
4388   bool multibytep = coding->dst_multibyte;
4389   int *charbuf = coding->charbuf;
4390   int *charbuf_end = charbuf + coding->charbuf_used;
4391   unsigned char *dst = coding->destination + coding->produced;
4392   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4393   int safe_room = 16;
       /* True when designation sequences have to be produced before the
          next character: the coding system designates at beginning of
          line and CODING says we are at one.  */
4394   bool bol_designation
4395     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4396        && CODING_ISO_BOL (coding));
4397   ptrdiff_t produced_chars = 0;
4398   Lisp_Object attrs, eol_type, charset_list;
4399   bool ascii_compatible;
4400   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
4401   int preferred_charset_id = -1;
4402
4403   CODING_GET_INFO (coding, attrs, charset_list);
       /* An EOL type that is a vector is treated as Qunix here.  */
4404   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4405   if (VECTORP (eol_type))
4406     eol_type = Qunix;
4407
4408   setup_iso_safe_charsets (attrs);
4409   /* Charset list may have been changed.  */
4410   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4411   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4412
       /* ASCII characters can be emitted as is only when the coding
          system is marked ASCII compatible and uses neither designation
          nor locking shift.  */
4413   ascii_compatible
4414     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4415        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4416                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4417
4418   while (charbuf < charbuf_end)
4419     {
4420       ASSURE_DESTINATION (safe_room);
4421
4422       if (bol_designation)
4423         {
4424           /* We have to produce designation sequences if any now.  */
4425           unsigned char desig_buf[16];
4426           ptrdiff_t nbytes;
4427           ptrdiff_t offset;
4428
               /* The sequences are built in DESIG_BUF rather than at DST
                  because the destination may move while charset maps are
                  loaded; in that case DST and DST_END are adjusted by the
                  offset reported by coding_change_destination.  */
4429           charset_map_loaded = 0;
4430           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4431                                               desig_buf);
4432           if (charset_map_loaded
4433               && (offset = coding_change_destination (coding)))
4434             {
4435               dst += offset;
4436               dst_end += offset;
4437             }
4438           memcpy (dst, desig_buf, nbytes);
4439           dst += nbytes;
4440           /* We are sure that designation sequences are all ASCII bytes.  */
4441           produced_chars += nbytes;
4442           bol_designation = 0;
4443           ASSURE_DESTINATION (safe_room);
4444         }
4445
4446       c = *charbuf++;
4447
4448       if (c < 0)
4449         {
4450           /* Handle an annotation.  */
               /* C is negative; the element after it holds the annotation
                  type.  For a charset annotation the charset id is found
                  at charbuf[2]; it is ignored unless it is a member of
                  CHARSET_LIST.  */
4451           switch (*charbuf)
4452             {
4453             case CODING_ANNOTATE_COMPOSITION_MASK:
4454               /* Not yet implemented.  */
4455               break;
4456             case CODING_ANNOTATE_CHARSET_MASK:
4457               preferred_charset_id = charbuf[2];
4458               if (preferred_charset_id >= 0
4459                   && NILP (Fmemq (make_number (preferred_charset_id),
4460                                   charset_list)))
4461                 preferred_charset_id = -1;
4462               break;
4463             default:
4464               emacs_abort ();
4465             }
               /* Skip the rest of the annotation: together with the
                  element already read, -C elements are consumed.  */
4466           charbuf += -c - 1;
4467           continue;
4468         }
4469
4470       /* Now encode the character C.  */
4471       if (c < 0x20 || c == 0x7F)
4472         {
               /* Control character.  A newline, or a carriage return when
                  the EOL type is Qmac, ends the line.  */
4473           if (c == '\n'
4474               || (c == '\r' && EQ (eol_type, Qmac)))
4475             {
4476               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4477                 ENCODE_RESET_PLANE_AND_REGISTER ();
4478               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4479                 {
4480                   int i;
4481
                       /* Only the recorded designations are reset to the
                          initial ones; nothing is emitted for this.  */
4482                   for (i = 0; i < 4; i++)
4483                     CODING_ISO_DESIGNATION (coding, i)
4484                       = CODING_ISO_INITIAL (coding, i);
4485                 }
4486               bol_designation = ((CODING_ISO_FLAGS (coding)
4487                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4488                                  != 0);
4489             }
4490           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4491             ENCODE_RESET_PLANE_AND_REGISTER ();
4492           EMIT_ONE_ASCII_BYTE (c);
4493         }
4494       else if (ASCII_CHAR_P (c))
4495         {
4496           if (ascii_compatible)
4497             EMIT_ONE_ASCII_BYTE (c);
4498           else
4499             {
4500               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4501               ENCODE_ISO_CHARACTER (charset, c);
4502             }
4503         }
4504       else if (CHAR_BYTE8_P (c))
4505         {
               /* A raw 8-bit byte is emitted as that byte.  */
4506           c = CHAR_TO_BYTE8 (c);
4507           EMIT_ONE_BYTE (c);
4508         }
4509       else
4510         {
4511           struct charset *charset;
4512
               /* Try the charset of the latest charset annotation first;
                  fall back to a search of CHARSET_LIST when it does not
                  contain C.  */
4513           if (preferred_charset_id >= 0)
4514             {
4515               bool result;
4516
4517               charset = CHARSET_FROM_ID (preferred_charset_id);
4518               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4519               if (! result)
4520                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4521                                      NULL, charset);
4522             }
4523           else
4524             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4525                                  NULL, charset);
4526           if (!charset)
4527             {
                   /* No charset can encode C.  In safe-encoding mode emit
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION as an ASCII
                      character; otherwise substitute the default
                      character of CODING.  */
4528               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4529                 {
4530                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4531                   charset = CHARSET_FROM_ID (charset_ascii);
4532                 }
4533               else
4534                 {
4535                   c = coding->default_char;
4536                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4537                                        charset_list, NULL, charset);
4538                 }
4539             }
4540           ENCODE_ISO_CHARACTER (charset, c);
4541         }
4542     }
4543
       /* At the end of the last block, return to the initial state if
          the coding system resets at end of line.  */
4544   if (coding->mode & CODING_MODE_LAST_BLOCK
4545       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4546     {
4547       ASSURE_DESTINATION (safe_room);
4548       ENCODE_RESET_PLANE_AND_REGISTER ();
4549     }
4550   record_conversion_result (coding, CODING_RESULT_SUCCESS);
       /* Remember for the next call whether designation sequences are
          still pending at a beginning of line.  */
4551   CODING_ISO_BOL (coding) = bol_designation;
4552   coding->produced_char += produced_chars;
4553   coding->produced = dst - coding->destination;
4554   return 0;
4555 }
4556
4557 \f
4558 /*** 8,9. SJIS and BIG5 handlers ***/
4559
4560 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4561    quite widely.  So, for the moment, Emacs supports them in the bare
4562    C code.  But, in the future, they may be supported only by CCL.  */
4563
4564 /* SJIS is a coding system encoding three character sets: ASCII, right
4565    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4566    as is.  A character of charset katakana-jisx0201 is encoded by
4567    "position-code + 0x80".  A character of charset japanese-jisx0208
4568    is encoded in two bytes, but its two position-codes are divided and
4569    shifted so that they fit in the ranges below.
4570
4571    --- CODE RANGE of SJIS ---
4572    (character set)      (range)
4573    ASCII                0x00 .. 0x7F
4574    KATAKANA-JISX0201    0xA0 .. 0xDF
4575    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4576             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4577    -------------------------------
4578
4579 */
4580
4581 /* BIG5 is a coding system encoding two character sets: ASCII and
4582    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4583    character set and is encoded in two bytes.
4584
4585    --- CODE RANGE of BIG5 ---
4586    (character set)      (range)
4587    ASCII                0x00 .. 0x7F
4588    Big5 (1st byte)      0xA1 .. 0xFE
4589         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4590    --------------------------
4591
4592   */
4593
4594 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4595    Return true if a text is encoded in SJIS.

        The scan starts after the leading ASCII run (coding->head_ascii)
        and accepts three kinds of input: bytes below 0x80, single bytes
        in 0xA0..0xDF, and two-byte codes whose lead byte is in 0x81..0x9F
        or 0xE0..MAX_FIRST_BYTE_OF_2_BYTE_CODE and whose trail byte is in
        0x40..0xFC except 0x7F.  Any other byte adds CATEGORY_MASK_SJIS to
        DETECT_INFO->rejected and makes the function return 0.  When the
        source runs out, the function returns 1 after merging FOUND into
        DETECT_INFO->found, except that a last block ending part-way
        through a code is rejected as well.  */
4596
4597 static bool
4598 detect_coding_sjis (struct coding_system *coding,
4599                     struct coding_detection_info *detect_info)
4600 {
4601   const unsigned char *src = coding->source, *src_base;
4602   const unsigned char *src_end = coding->source + coding->src_bytes;
4603   bool multibytep = coding->src_multibyte;
4604   ptrdiff_t consumed_chars = 0;
4605   int found = 0;
4606   int c;
4607   Lisp_Object attrs, charset_list;
4608   int max_first_byte_of_2_byte_code;
4609
4610   CODING_GET_INFO (coding, attrs, charset_list);
       /* A charset list with more than three entries widens the second
          lead-byte range from 0xE0..0xEF to 0xE0..0xFC.  */
4611   max_first_byte_of_2_byte_code
4612     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4613
4614   detect_info->checked |= CATEGORY_MASK_SJIS;
4615   /* A coding system of this category is always ASCII compatible.  */
4616   src += coding->head_ascii;
4617
4618   while (1)
4619     {
           /* Remember where this code starts so that the exit path can
              tell whether the input stopped in the middle of a code.  */
4620       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; the
              no_more_source label below has no visible goto, so the macro
              is presumably what jumps there when SRC reaches SRC_END.  */
4621       ONE_MORE_BYTE (c);
4622       if (c < 0x80)
4623         continue;
4624       if ((c >= 0x81 && c <= 0x9F)
4625           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4626         {
               /* Lead byte of a two-byte code: validate the trail byte.  */
4627           ONE_MORE_BYTE (c);
4628           if (c < 0x40 || c == 0x7F || c > 0xFC)
4629             break;
4630           found = CATEGORY_MASK_SJIS;
4631         }
4632       else if (c >= 0xA0 && c < 0xE0)
             /* Single-byte code in the 0xA0..0xDF range.  */
4633         found = CATEGORY_MASK_SJIS;
4634       else
4635         break;
4636     }
       /* Only reached through `break': an invalid byte was seen.  */
4637   detect_info->rejected |= CATEGORY_MASK_SJIS;
4638   return 0;
4639
4640  no_more_source:
       /* SRC_BASE < SRC means bytes were consumed after the start of the
          current code; if no further input will arrive, that truncated
          code makes the text invalid.  */
4641   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4642     {
4643       detect_info->rejected |= CATEGORY_MASK_SJIS;
4644       return 0;
4645     }
4646   detect_info->found |= found;
4647   return 1;
4648 }
4649
4650 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4651    Return true if a text is encoded in BIG5.

        The scan starts after the leading ASCII run (coding->head_ascii).
        Bytes below 0x80 are accepted as is.  A byte of 0xA1 or above must
        be followed by a trail byte that is at least 0x40 and outside
        0x7F..0xA0.  Any other byte, or an invalid trail byte, adds
        CATEGORY_MASK_BIG5 to DETECT_INFO->rejected and makes the function
        return 0.  When the source runs out, the function returns 1 after
        merging FOUND into DETECT_INFO->found, except that a last block
        ending part-way through a code is rejected as well.  */
4652
4653 static bool
4654 detect_coding_big5 (struct coding_system *coding,
4655                     struct coding_detection_info *detect_info)
4656 {
4657   const unsigned char *src = coding->source, *src_base;
4658   const unsigned char *src_end = coding->source + coding->src_bytes;
4659   bool multibytep = coding->src_multibyte;
4660   ptrdiff_t consumed_chars = 0;
4661   int found = 0;
4662   int c;
4663
4664   detect_info->checked |= CATEGORY_MASK_BIG5;
4665   /* A coding system of this category is always ASCII compatible.  */
4666   src += coding->head_ascii;
4667
4668   while (1)
4669     {
           /* Remember where this code starts so that the exit path can
              tell whether the input stopped in the middle of a code.  */
4670       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; the
              no_more_source label below has no visible goto, so the macro
              is presumably what jumps there when SRC reaches SRC_END.  */
4671       ONE_MORE_BYTE (c);
4672       if (c < 0x80)
4673         continue;
4674       if (c >= 0xA1)
4675         {
4676           ONE_MORE_BYTE (c);
               /* An invalid trail byte must reject the category just like
                  an invalid lead byte does.  Returning directly here would
                  leave CATEGORY_MASK_BIG5 out of DETECT_INFO->rejected, so
                  leave the loop through the common rejection path.  */
4677           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4678             break;
4679           found = CATEGORY_MASK_BIG5;
4680         }
4681       else
4682         break;
4683     }
       /* Only reached through `break': an invalid byte was seen.  */
4684   detect_info->rejected |= CATEGORY_MASK_BIG5;
4685   return 0;
4686
4687  no_more_source:
       /* SRC_BASE < SRC means bytes were consumed after the start of the
          current code; if no further input will arrive, that truncated
          code makes the text invalid.  */
4688   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4689     {
4690       detect_info->rejected |= CATEGORY_MASK_BIG5;
4691       return 0;
4692     }
4693   detect_info->found |= found;
4694   return 1;
4695 }
4696
4697 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SJIS bytes from CODING->source, starting at
        CODING->consumed, into character codes appended to
        CODING->charbuf.  The coding system's charset list supplies, in
        order, the charsets used for bytes below 0x80, for single bytes
        0xA1..0xDF, for two-byte codes with lead byte 0x81..0x9F or
        0xE0..0xEF, and (optionally) for two-byte codes with lead byte
        0xF0..0xFC.  A byte that does not start a valid code is stored on
        its own through BYTE8_TO_CHAR.  On return CODING->consumed_char,
        CODING->consumed and CODING->charbuf_used are updated.  */
4698
4699 static void
4700 decode_coding_sjis (struct coding_system *coding)
4701 {
4702   const unsigned char *src = coding->source + coding->consumed;
4703   const unsigned char *src_end = coding->source + coding->src_bytes;
4704   const unsigned char *src_base;
4705   int *charbuf = coding->charbuf + coding->charbuf_used;
4706   /* We may produce one charset annotation in one loop and one more at
4707      the end.  */
4708   int *charbuf_end
4709     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4710   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4711   bool multibytep = coding->src_multibyte;
4712   struct charset *charset_roman, *charset_kanji, *charset_kana;
4713   struct charset *charset_kanji2;
4714   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters; LAST_OFFSET and LAST_ID
          describe the run of same-charset characters still waiting for
          its charset annotation.  */
4715   ptrdiff_t char_offset = coding->produced_char;
4716   ptrdiff_t last_offset = char_offset;
4717   int last_id = charset_ascii;
4718   bool eol_dos
4719     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte fetched ahead after a '\r' when EOL_DOS is set, or -1 when
          no such byte is pending.  */
4720   int byte_after_cr = -1;
4721
4722   CODING_GET_INFO (coding, attrs, charset_list);
4723
       /* The first three list entries are required; the fourth one is
          optional and leaves CHARSET_KANJI2 null when absent.  */
4724   val = charset_list;
4725   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4726   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4727   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4728   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4729
4730   while (1)
4731     {
4732       int c, c1;
4733       struct charset *charset;
4734
           /* Checkpoint: everything before SRC_BASE has been fully
              decoded, and invalid_code rewinds to this point.  */
4735       src_base = src;
4736       consumed_chars_base = consumed_chars;
4737
4738       if (charbuf >= charbuf_end)
4739         {
               /* The pending byte after '\r' was already read from the
                  source; step back so that it is not counted as
                  consumed.  */
4740           if (byte_after_cr >= 0)
4741             src_base--;
4742           break;
4743         }
4744
4745       if (byte_after_cr >= 0)
4746         c = byte_after_cr, byte_after_cr = -1;
4747       else
4748         ONE_MORE_BYTE (c);
4749       if (c < 0)
4750         goto invalid_code;
4751       if (c < 0x80)
4752         {
4753           if (eol_dos && c == '\r')
4754             ONE_MORE_BYTE (byte_after_cr);
4755           charset = charset_roman;
4756         }
4757       else if (c == 0x80 || c == 0xA0)
4758         goto invalid_code;
4759       else if (c >= 0xA1 && c <= 0xDF)
4760         {
4761           /* SJIS -> JISX0201-Kana */
4762           c &= 0x7F;
4763           charset = charset_kana;
4764         }
4765       else if (c <= 0xEF)
4766         {
4767           /* SJIS -> JISX0208 */
4768           ONE_MORE_BYTE (c1);
4769           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4770             goto invalid_code;
4771           c = (c << 8) | c1;
4772           SJIS_TO_JIS (c);
4773           charset = charset_kanji;
4774         }
4775       else if (c <= 0xFC && charset_kanji2)
4776         {
4777           /* SJIS -> JISX0213-2 */
4778           ONE_MORE_BYTE (c1);
4779           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4780             goto invalid_code;
4781           c = (c << 8) | c1;
4782           SJIS_TO_JIS2 (c);
4783           charset = charset_kanji2;
4784         }
4785       else
4786         goto invalid_code;
           /* On a switch to a different non-ASCII charset, emit the
              annotation for the run that just ended (if that run was
              non-ASCII) and start a new run here.  */
4787       if (charset->id != charset_ascii
4788           && last_id != charset->id)
4789         {
4790           if (last_id != charset_ascii)
4791             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4792           last_id = charset->id;
4793           last_offset = char_offset;
4794         }
4795       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4796       *charbuf++ = c;
4797       char_offset++;
4798       continue;
4799
4800     invalid_code:
           /* Rewind to the checkpoint and pass exactly one byte through:
              a negative value from ONE_MORE_BYTE is stored negated, any
              other byte is stored through BYTE8_TO_CHAR.  */
4801       src = src_base;
4802       consumed_chars = consumed_chars_base;
4803       ONE_MORE_BYTE (c);
4804       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4805       char_offset++;
4806     }
4807
       /* NOTE(review): no visible goto targets this label; presumably
          ONE_MORE_BYTE (defined elsewhere) jumps here when the source is
          exhausted.  The `break' above also falls through to it.  */
4808  no_more_source:
4809   if (last_id != charset_ascii)
4810     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report progress only up to the last checkpoint, so a partially
          read code is not counted as consumed.  */
4811   coding->consumed_char += consumed_chars_base;
4812   coding->consumed = src_base - coding->source;
4813   coding->charbuf_used = charbuf - coding->charbuf;
4814 }
4815
/* Decode BIG5 bytes from CODING->source, starting at CODING->consumed,
   into character codes appended to CODING->charbuf.  The first entry
   of the coding system's charset list is used for bytes below 0x80,
   the second for two-byte codes whose lead byte is in 0xA1..0xFE and
   whose trail byte is in 0x40..0x7E or 0xA1..0xFE.  A byte that does
   not start a valid code is stored on its own through BYTE8_TO_CHAR.
   On return CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.  */
4816 static void
4817 decode_coding_big5 (struct coding_system *coding)
4818 {
4819   const unsigned char *src = coding->source + coding->consumed;
4820   const unsigned char *src_end = coding->source + coding->src_bytes;
4821   const unsigned char *src_base;
4822   int *charbuf = coding->charbuf + coding->charbuf_used;
4823   /* We may produce one charset annotation in one loop and one more at
4824      the end.  */
4825   int *charbuf_end
4826     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4827   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4828   bool multibytep = coding->src_multibyte;
4829   struct charset *charset_roman, *charset_big5;
4830   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters; LAST_OFFSET and LAST_ID
          describe the run of same-charset characters still waiting for
          its charset annotation.  */
4831   ptrdiff_t char_offset = coding->produced_char;
4832   ptrdiff_t last_offset = char_offset;
4833   int last_id = charset_ascii;
4834   bool eol_dos
4835     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte fetched ahead after a '\r' when EOL_DOS is set, or -1 when
          no such byte is pending.  */
4836   int byte_after_cr = -1;
4837
4838   CODING_GET_INFO (coding, attrs, charset_list);
4839   val = charset_list;
4840   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4841   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4842
4843   while (1)
4844     {
4845       int c, c1;
4846       struct charset *charset;
4847
           /* Checkpoint: everything before SRC_BASE has been fully
              decoded, and invalid_code rewinds to this point.  */
4848       src_base = src;
4849       consumed_chars_base = consumed_chars;
4850
4851       if (charbuf >= charbuf_end)
4852         {
               /* The pending byte after '\r' was already read from the
                  source; step back so that it is not counted as
                  consumed.  */
4853           if (byte_after_cr >= 0)
4854             src_base--;
4855           break;
4856         }
4857
4858       if (byte_after_cr >= 0)
4859         c = byte_after_cr, byte_after_cr = -1;
4860       else
4861         ONE_MORE_BYTE (c);
4862
4863       if (c < 0)
4864         goto invalid_code;
4865       if (c < 0x80)
4866         {
4867           if (eol_dos && c == '\r')
4868             ONE_MORE_BYTE (byte_after_cr);
4869           charset = charset_roman;
4870         }
4871       else
4872         {
4873           /* BIG5 -> Big5 */
4874           if (c < 0xA1 || c > 0xFE)
4875             goto invalid_code;
4876           ONE_MORE_BYTE (c1);
4877           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4878             goto invalid_code;
               /* Combine lead and trail byte into one 16-bit code.  */
4879           c = c << 8 | c1;
4880           charset = charset_big5;
4881         }
           /* On a switch to a different non-ASCII charset, emit the
              annotation for the run that just ended (if that run was
              non-ASCII) and start a new run here.  */
4882       if (charset->id != charset_ascii
4883           && last_id != charset->id)
4884         {
4885           if (last_id != charset_ascii)
4886             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4887           last_id = charset->id;
4888           last_offset = char_offset;
4889         }
4890       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4891       *charbuf++ = c;
4892       char_offset++;
4893       continue;
4894
4895     invalid_code:
           /* Rewind to the checkpoint and pass exactly one byte through:
              a negative value from ONE_MORE_BYTE is stored negated, any
              other byte is stored through BYTE8_TO_CHAR.  */
4896       src = src_base;
4897       consumed_chars = consumed_chars_base;
4898       ONE_MORE_BYTE (c);
4899       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4900       char_offset++;
4901     }
4902
       /* NOTE(review): no visible goto targets this label; presumably
          ONE_MORE_BYTE (defined elsewhere) jumps here when the source is
          exhausted.  The `break' above also falls through to it.  */
4903  no_more_source:
4904   if (last_id != charset_ascii)
4905     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report progress only up to the last checkpoint, so a partially
          read code is not counted as consumed.  */
4906   coding->consumed_char += consumed_chars_base;
4907   coding->consumed = src_base - coding->source;
4908   coding->charbuf_used = charbuf - coding->charbuf;
4909 }
4910
4911 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the characters in CODING->charbuf as SJIS and append the
        bytes to CODING->destination.  The second, third and optional
        fourth entries of the coding system's charset list are used as
        the kana, kanji and second kanji charsets.  A kanji code is
        written as two bytes after JIS_TO_SJIS, a kana code as one byte
        with the 0x80 bit set, and a second-kanji code as two bytes after
        JIS_TO_SJIS2 when its high byte is one of the values tested below.
        A code of any other charset, and a second-kanji code with any
        other high byte, is written as the single byte CODE & 0x7F.
        Always returns 0.  */
4917
4918 static bool
4919 encode_coding_sjis (struct coding_system *coding)
4920 {
4921   bool multibytep = coding->dst_multibyte;
4922   int *charbuf = coding->charbuf;
4923   int *charbuf_end = charbuf + coding->charbuf_used;
4924   unsigned char *dst = coding->destination + coding->produced;
4925   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4926   int safe_room = 4;
4927   ptrdiff_t produced_chars = 0;
4928   Lisp_Object attrs, charset_list, val;
4929   bool ascii_compatible;
4930   struct charset *charset_kanji, *charset_kana;
4931   struct charset *charset_kanji2;
4932   int c;
4933
4934   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first list entry; the following ones are kana, kanji
          and, optionally, the second kanji charset.  */
4935   val = XCDR (charset_list);
4936   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4937   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4938   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4939
4940   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4941
4942   while (charbuf < charbuf_end)
4943     {
4944       ASSURE_DESTINATION (safe_room);
4945       c = *charbuf++;
4946       /* Now encode the character C.  */
4947       if (ASCII_CHAR_P (c) && ascii_compatible)
4948         EMIT_ONE_ASCII_BYTE (c);
4949       else if (CHAR_BYTE8_P (c))
4950         {
               /* Raw-byte character: write the byte it stands for.  */
4951           c = CHAR_TO_BYTE8 (c);
4952           EMIT_ONE_BYTE (c);
4953         }
4954       else
4955         {
4956           unsigned code;
4957           struct charset *charset;
               /* NOTE(review): CODING_CHAR_CHARSET is defined elsewhere;
                  from its use here it sets CHARSET (null when no charset
                  of the list applies) and CODE for C.  */
4958           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4959                                &code, charset);
4960
4961           if (!charset)
4962             {
4963               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4964                 {
4965                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4966                   charset = CHARSET_FROM_ID (charset_ascii);
4967                 }
4968               else
4969                 {
                       /* Fall back to the coding system's default
                          character and look it up again.  */
4970                   c = coding->default_char;
4971                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4972                                        charset_list, &code, charset);
4973                 }
4974             }
4975           if (code == CHARSET_INVALID_CODE (charset))
4976             emacs_abort ();
4977           if (charset == charset_kanji)
4978             {
4979               int c1, c2;
4980               JIS_TO_SJIS (code);
4981               c1 = code >> 8, c2 = code & 0xFF;
4982               EMIT_TWO_BYTES (c1, c2);
4983             }
4984           else if (charset == charset_kana)
4985             EMIT_ONE_BYTE (code | 0x80);
4986           else if (charset_kanji2 && charset == charset_kanji2)
4987             {
4988               int c1, c2;
4989
                   /* Only codes whose high byte is 0x21, 0x23..0x25,
                      0x28, 0x2C..0x2F or at least 0x6E are converted
                      with JIS_TO_SJIS2.  */
4990               c1 = code >> 8;
4991               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4992                   || c1 == 0x28
4993                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4994                 {
4995                   JIS_TO_SJIS2 (code);
4996                   c1 = code >> 8, c2 = code & 0xFF;
4997                   EMIT_TWO_BYTES (c1, c2);
4998                 }
4999               else
5000                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5001             }
5002           else
5003             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5004         }
5005     }
5006   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5007   coding->produced_char += produced_chars;
5008   coding->produced = dst - coding->destination;
5009   return 0;
5010 }
5011
/* Encode the characters in CODING->charbuf as BIG5 and append the
   bytes to CODING->destination.  The second entry of the coding
   system's charset list is the Big5 charset; a code of that charset
   is written as two bytes (high byte, then low byte).  A code of any
   other charset is written as the single byte CODE & 0x7F.  Always
   returns 0.  */
5012 static bool
5013 encode_coding_big5 (struct coding_system *coding)
5014 {
5015   bool multibytep = coding->dst_multibyte;
5016   int *charbuf = coding->charbuf;
5017   int *charbuf_end = charbuf + coding->charbuf_used;
5018   unsigned char *dst = coding->destination + coding->produced;
5019   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
5020   int safe_room = 4;
5021   ptrdiff_t produced_chars = 0;
5022   Lisp_Object attrs, charset_list, val;
5023   bool ascii_compatible;
5024   struct charset *charset_big5;
5025   int c;
5026
5027   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first list entry; the second one is the Big5 charset.  */
5028   val = XCDR (charset_list);
5029   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5030   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5031
5032   while (charbuf < charbuf_end)
5033     {
5034       ASSURE_DESTINATION (safe_room);
5035       c = *charbuf++;
5036       /* Now encode the character C.  */
5037       if (ASCII_CHAR_P (c) && ascii_compatible)
5038         EMIT_ONE_ASCII_BYTE (c);
5039       else if (CHAR_BYTE8_P (c))
5040         {
               /* Raw-byte character: write the byte it stands for.  */
5041           c = CHAR_TO_BYTE8 (c);
5042           EMIT_ONE_BYTE (c);
5043         }
5044       else
5045         {
5046           unsigned code;
5047           struct charset *charset;
               /* NOTE(review): CODING_CHAR_CHARSET is defined elsewhere;
                  from its use here it sets CHARSET (null when no charset
                  of the list applies) and CODE for C.  */
5048           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5049                                &code, charset);
5050
5051           if (! charset)
5052             {
5053               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5054                 {
5055                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5056                   charset = CHARSET_FROM_ID (charset_ascii);
5057                 }
5058               else
5059                 {
                       /* Fall back to the coding system's default
                          character and look it up again.  */
5060                   c = coding->default_char;
5061                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5062                                        charset_list, &code, charset);
5063                 }
5064             }
5065           if (code == CHARSET_INVALID_CODE (charset))
5066             emacs_abort ();
5067           if (charset == charset_big5)
5068             {
5069               int c1, c2;
5070
5071               c1 = code >> 8, c2 = code & 0xFF;
5072               EMIT_TWO_BYTES (c1, c2);
5073             }
5074           else
5075             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5076         }
5077     }
5078   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5079   coding->produced_char += produced_chars;
5080   coding->produced = dst - coding->destination;
5081   return 0;
5082 }
5083
5084 \f
5085 /*** 10. CCL handlers ***/
5086
5087 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5088    Return true if a text is encoded in a coding system of which
5089    encoder/decoder are written in CCL program.

        Every source byte is looked up in the VALIDS table of the
        coding_category_ccl entry of coding_categories.  A byte whose
        entry is zero (or a negative value from ONE_MORE_BYTE) adds
        CATEGORY_MASK_CCL to DETECT_INFO->rejected and makes the function
        return 0.  A byte whose entry is greater than 1 marks the category
        as found.  When the source runs out, the function returns 1 after
        merging FOUND into DETECT_INFO->found.  */
5090
5091 static bool
5092 detect_coding_ccl (struct coding_system *coding,
5093                    struct coding_detection_info *detect_info)
5094 {
5095   const unsigned char *src = coding->source, *src_base;
5096   const unsigned char *src_end = coding->source + coding->src_bytes;
5097   bool multibytep = coding->src_multibyte;
5098   ptrdiff_t consumed_chars = 0;
5099   int found = 0;
5100   unsigned char *valids;
       /* Saved now because CODING is redirected below.  */
5101   ptrdiff_t head_ascii = coding->head_ascii;
5102   Lisp_Object attrs;
5103
5104   detect_info->checked |= CATEGORY_MASK_CCL;
5105
       /* From here on CODING refers to the CCL category's coding system,
          not to the caller's argument; the source pointers above were
          taken from the argument beforehand.  */
5106   coding = &coding_categories[coding_category_ccl];
5107   valids = CODING_CCL_VALIDS (coding);
5108   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII run is skipped only for an ASCII-compatible
          coding system.  */
5109   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5110     src += head_ascii;
5111
5112   while (1)
5113     {
5114       int c;
5115
5116       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; the
              no_more_source label below has no visible goto, so the macro
              is presumably what jumps there when SRC reaches SRC_END.  */
5117       ONE_MORE_BYTE (c);
5118       if (c < 0 || ! valids[c])
5119         break;
5120       if ((valids[c] > 1))
5121         found = CATEGORY_MASK_CCL;
5122     }
       /* Only reached through `break': an invalid byte was seen.  */
5123   detect_info->rejected |= CATEGORY_MASK_CCL;
5124   return 0;
5125
5126  no_more_source:
5127   detect_info->found |= found;
5128   return 1;
5129 }
5130
/* Decode the source of CODING by running its CCL program.  The source
   is fed to ccl_driver in chunks of at most 1024 elements, and the
   output is written directly into CODING->charbuf.  The final CCL
   status is translated into a conversion result, and
   CODING->consumed_char, CODING->consumed and CODING->charbuf_used are
   updated.  */
5131 static void
5132 decode_coding_ccl (struct coding_system *coding)
5133 {
5134   const unsigned char *src = coding->source + coding->consumed;
5135   const unsigned char *src_end = coding->source + coding->src_bytes;
5136   int *charbuf = coding->charbuf + coding->charbuf_used;
5137   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5138   ptrdiff_t consumed_chars = 0;
5139   bool multibytep = coding->src_multibyte;
5140   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* Input chunk handed to ccl_driver.  For a multibyte source,
          SOURCE_BYTEIDX[i] is the byte offset from SRC of element i, with
          one extra entry holding the offset just past the chunk.  */
5141   int source_charbuf[1024];
5142   int source_byteidx[1025];
5143   Lisp_Object attrs, charset_list;
5144
5145   CODING_GET_INFO (coding, attrs, charset_list);
5146
5147   while (1)
5148     {
5149       const unsigned char *p = src;
5150       ptrdiff_t offset;
5151       int i = 0;
5152
5153       if (multibytep)
5154         {
5155           while (i < 1024 && p < src_end)
5156             {
5157               source_byteidx[i] = p - src;
5158               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5159             }
5160           source_byteidx[i] = p - src;
5161         }
5162       else
5163         while (i < 1024 && p < src_end)
5164           source_charbuf[i++] = *p++;
5165
           /* Tell the CCL program that no more input follows once the
              whole source of the last block has been fetched.  */
5166       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5167         ccl->last_block = true;
5168       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5169       charset_map_loaded = 0;
5170       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5171                   charset_list);
           /* If a charset map was loaded and coding_change_source reports
              a nonzero offset, shift every pointer into the source by
              that offset.  */
5172       if (charset_map_loaded
5173           && (offset = coding_change_source (coding)))
5174         {
5175           p += offset;
5176           src += offset;
5177           src_end += offset;
5178         }
5179       charbuf += ccl->produced;
           /* CCL->consumed counts chunk elements; for a multibyte source
              it is mapped back to a byte count.  */
5180       if (multibytep)
5181         src += source_byteidx[ccl->consumed];
5182       else
5183         src += ccl->consumed;
5184       consumed_chars += ccl->consumed;
           /* Go round again only when input remains and the program
              stopped because it wanted more source.  */
5185       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5186         break;
5187     }
5188
5189   switch (ccl->status)
5190     {
5191     case CCL_STAT_SUSPEND_BY_SRC:
5192       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5193       break;
5194     case CCL_STAT_SUSPEND_BY_DST:
5195       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5196       break;
5197     case CCL_STAT_QUIT:
5198     case CCL_STAT_INVALID_CMD:
5199       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5200       break;
5201     default:
5202       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5203       break;
5204     }
5205   coding->consumed_char += consumed_chars;
5206   coding->consumed = src - coding->source;
5207   coding->charbuf_used = charbuf - coding->charbuf;
5208 }
5209
/* Encode the characters in CODING->charbuf by running the CCL program
   of CODING.  Each ccl_driver call writes at most 1024 values into a
   local buffer; the low 8 bits of each value are then appended to
   CODING->destination.  The final CCL status is translated into a
   conversion result.  Always returns 0.  */
5210 static bool
5211 encode_coding_ccl (struct coding_system *coding)
5212 {
5213   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5214   bool multibytep = coding->dst_multibyte;
5215   int *charbuf = coding->charbuf;
5216   int *charbuf_end = charbuf + coding->charbuf_used;
5217   unsigned char *dst = coding->destination + coding->produced;
5218   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5219   int destination_charbuf[1024];
5220   ptrdiff_t produced_chars = 0;
5221   int i;
5222   Lisp_Object attrs, charset_list;
5223
5224   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program that no more input follows once all source
          characters of the last block have been consumed.  */
5225   if (coding->consumed_char == coding->src_chars
5226       && coding->mode & CODING_MODE_LAST_BLOCK)
5227     ccl->last_block = true;
5228
5229   do
5230     {
5231       ptrdiff_t offset;
5232
5233       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5234       charset_map_loaded = 0;
5235       ccl_driver (ccl, charbuf, destination_charbuf,
5236                   charbuf_end - charbuf, 1024, charset_list);
           /* If a charset map was loaded and coding_change_destination
              reports a nonzero offset, shift DST by that offset.  */
5237       if (charset_map_loaded
5238           && (offset = coding_change_destination (coding)))
5239         dst += offset;
5240       if (multibytep)
5241         {
               /* Twice the produced count is reserved here.
                  NOTE(review): presumably because EMIT_ONE_BYTE may write
                  two bytes per value into a multibyte destination --
                  confirm against the macro.  */
5242           ASSURE_DESTINATION (ccl->produced * 2);
5243           for (i = 0; i < ccl->produced; i++)
5244             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5245         }
5246       else
5247         {
5248           ASSURE_DESTINATION (ccl->produced);
5249           for (i = 0; i < ccl->produced; i++)
5250             *dst++ = destination_charbuf[i] & 0xFF;
5251           produced_chars += ccl->produced;
5252         }
5253       charbuf += ccl->consumed;
           /* Stop at once when the program quit or hit an invalid
              command; otherwise continue while input remains.  */
5254       if (ccl->status == CCL_STAT_QUIT
5255           || ccl->status == CCL_STAT_INVALID_CMD)
5256         break;
5257     }
5258   while (charbuf < charbuf_end);
5259
5260   switch (ccl->status)
5261     {
5262     case CCL_STAT_SUSPEND_BY_SRC:
5263       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5264       break;
5265     case CCL_STAT_SUSPEND_BY_DST:
5266       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5267       break;
5268     case CCL_STAT_QUIT:
5269     case CCL_STAT_INVALID_CMD:
5270       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5271       break;
5272     default:
5273       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5274       break;
5275     }
5276
5277   coding->produced_char += produced_chars;
5278   coding->produced = dst - coding->destination;
5279   return 0;
5280 }
5281
5282 \f
5283 /*** 10, 11. no-conversion handlers ***/
5284
5285 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        No byte conversion is done here: the whole source is marked as
        consumed and CODING->chars_at_source is set.  The only exception
        is a source that ends in '\r' while DOS-style EOL conversion is
        in effect: that last byte is left unconsumed and the result is
        CODING_RESULT_INSUFFICIENT_SRC, since the '\r' may be the first
        half of a "\r\n" pair.  An empty source is reported as a plain
        success.  */
5286
5287 static void
5288 decode_coding_raw_text (struct coding_system *coding)
5289 {
5290   bool eol_dos
5291     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5292
5293   coding->chars_at_source = 1;
5294   coding->consumed_char = coding->src_chars;
5295   coding->consumed = coding->src_bytes;
       /* Check for an empty source first: with SRC_BYTES equal to zero
          the index below would be -1, i.e. a read before the start of the
          source.  */
5296   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5297     {
5298       coding->consumed_char--;
5299       coding->consumed--;
5300       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5301     }
5302   else
5303     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5304 }
5305
/* Encode the characters in CODING->charbuf as raw text and append the
   bytes to CODING->destination.  There are four cases, selected by
   whether the destination and the source are multibyte.  In the
   multibyte-source cases an ASCII character is written as is, a
   raw-byte character as the byte it stands for, and any other
   character as the byte sequence produced by CHAR_STRING_ADVANCE.  In
   the unibyte-source cases each value is written as one byte.  Always
   returns 0.  */
5306 static bool
5307 encode_coding_raw_text (struct coding_system *coding)
5308 {
5309   bool multibytep = coding->dst_multibyte;
5310   int *charbuf = coding->charbuf;
5311   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5312   unsigned char *dst = coding->destination + coding->produced;
5313   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5314   ptrdiff_t produced_chars = 0;
5315   int c;
5316
5317   if (multibytep)
5318     {
           /* Multibyte destination: every byte goes through the EMIT_*
              macros.  */
5319       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5320
5321       if (coding->src_multibyte)
5322         while (charbuf < charbuf_end)
5323           {
5324             ASSURE_DESTINATION (safe_room);
5325             c = *charbuf++;
5326             if (ASCII_CHAR_P (c))
5327               EMIT_ONE_ASCII_BYTE (c);
5328             else if (CHAR_BYTE8_P (c))
5329               {
5330                 c = CHAR_TO_BYTE8 (c);
5331                 EMIT_ONE_BYTE (c);
5332               }
5333             else
5334               {
                     /* Build the character's byte sequence in STR, then
                        emit it one byte at a time.  */
5335                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5336
5337                 CHAR_STRING_ADVANCE (c, p1);
5338                 do
5339                   {
5340                     EMIT_ONE_BYTE (*p0);
5341                     p0++;
5342                   }
5343                 while (p0 < p1);
5344               }
5345           }
5346       else
5347         while (charbuf < charbuf_end)
5348           {
5349             ASSURE_DESTINATION (safe_room);
5350             c = *charbuf++;
5351             EMIT_ONE_BYTE (c);
5352           }
5353     }
5354   else
5355     {
           /* Unibyte destination: bytes are stored directly through DST,
              and PRODUCED_CHARS is computed afterwards from how far DST
              advanced.  */
5356       if (coding->src_multibyte)
5357         {
5358           int safe_room = MAX_MULTIBYTE_LENGTH;
5359
5360           while (charbuf < charbuf_end)
5361             {
5362               ASSURE_DESTINATION (safe_room);
5363               c = *charbuf++;
5364               if (ASCII_CHAR_P (c))
5365                 *dst++ = c;
5366               else if (CHAR_BYTE8_P (c))
5367                 *dst++ = CHAR_TO_BYTE8 (c);
5368               else
5369                 CHAR_STRING_ADVANCE (c, dst);
5370             }
5371         }
5372       else
5373         {
               /* Room for the whole input is requested once, then the
                  values are copied one byte each.  */
5374           ASSURE_DESTINATION (charbuf_end - charbuf);
5375           while (charbuf < charbuf_end && dst < dst_end)
5376             *dst++ = *charbuf++;
5377         }
5378       produced_chars = dst - (coding->destination + coding->produced);
5379     }
5380   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5381   coding->produced_char += produced_chars;
5382   coding->produced = dst - coding->destination;
5383   return 0;
5384 }
5385
5386 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5387    Return true if a text is encoded in a charset-based coding system.

        Each code's first byte is looked up in the coding_attr_charset_valids
        vector of the coding_category_charset entry of coding_categories.
        A nil entry rejects the category.  An integer entry names one
        charset, and a list entry names several candidate charsets; in
        both cases the following bytes of the code are checked against the
        charset's code_space bounds.  On rejection CATEGORY_MASK_CHARSET
        is added to DETECT_INFO->rejected and 0 is returned.  When the
        source runs out at a code boundary, the function returns 1 after
        merging FOUND into DETECT_INFO->found.  */
5388
5389 static bool
5390 detect_coding_charset (struct coding_system *coding,
5391                        struct coding_detection_info *detect_info)
5392 {
5393   const unsigned char *src = coding->source, *src_base;
5394   const unsigned char *src_end = coding->source + coding->src_bytes;
5395   bool multibytep = coding->src_multibyte;
5396   ptrdiff_t consumed_chars = 0;
5397   Lisp_Object attrs, valids, name;
5398   int found = 0;
       /* Saved now because CODING is redirected below.  */
5399   ptrdiff_t head_ascii = coding->head_ascii;
5400   bool check_latin_extra = 0;
5401
5402   detect_info->checked |= CATEGORY_MASK_CHARSET;
5403
       /* From here on CODING refers to the charset category's coding
          system, not to the caller's argument; the source pointers above
          were taken from the argument beforehand.  */
5404   coding = &coding_categories[coding_category_charset];
5405   attrs = CODING_ID_ATTRS (coding->id);
5406   valids = AREF (attrs, coding_attr_charset_valids);
5407   name = CODING_ID_NAME (coding->id);
       /* Coding systems named "iso-8859-*" or "iso-latin-*" get an extra
          check on bytes 0x80..0x9F (see the loop below).  */
5408   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5409                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5410       || strncmp (SSDATA (SYMBOL_NAME (name)),
5411                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5412     check_latin_extra = 1;
5413
       /* The leading ASCII run is skipped only for an ASCII-compatible
          coding system.  */
5414   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5415     src += head_ascii;
5416
5417   while (1)
5418     {
5419       int c;
5420       Lisp_Object val;
5421       struct charset *charset;
5422       int dim, idx;
5423
5424       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; the
              no_more_source label below has no visible goto, so the macro
              is presumably what jumps there when SRC reaches SRC_END.  */
5425       ONE_MORE_BYTE (c);
5426       if (c < 0)
5427         continue;
5428       val = AREF (valids, c);
5429       if (NILP (val))
5430         break;
5431       if (c >= 0x80)
5432         {
               /* For the Latin coding systems a byte in 0x80..0x9F is
                  accepted only if Vlatin_extra_code_table is a vector
                  with a non-nil entry for it.  */
5433           if (c < 0xA0
5434               && check_latin_extra
5435               && (!VECTORP (Vlatin_extra_code_table)
5436                   || NILP (AREF (Vlatin_extra_code_table, c))))
5437             break;
5438           found = CATEGORY_MASK_CHARSET;
5439         }
5440       if (INTEGERP (val))
5441         {
               /* A single charset: each remaining byte of the code must
                  lie within that charset's bounds for its position.  */
5442           charset = CHARSET_FROM_ID (XFASTINT (val));
5443           dim = CHARSET_DIMENSION (charset);
5444           for (idx = 1; idx < dim; idx++)
5445             {
5446               if (src == src_end)
5447                 goto too_short;
5448               ONE_MORE_BYTE (c);
5449               if (c < charset->code_space[(dim - 1 - idx) * 4]
5450                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5451                 break;
5452             }
               /* The inner loop stopped early: a byte was out of range.  */
5453           if (idx < dim)
5454             break;
5455         }
5456       else
5457         {
               /* Several candidate charsets: try each in turn.  IDX is
                  not reset between candidates.  VAL is set to nil once a
                  candidate matches, so a VAL that is still a cons after
                  the loop means no candidate matched.  */
5458           idx = 1;
5459           for (; CONSP (val); val = XCDR (val))
5460             {
5461               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5462               dim = CHARSET_DIMENSION (charset);
5463               while (idx < dim)
5464                 {
5465                   if (src == src_end)
5466                     goto too_short;
5467                   ONE_MORE_BYTE (c);
5468                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5469                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5470                     break;
5471                   idx++;
5472                 }
5473               if (idx == dim)
5474                 {
5475                   val = Qnil;
5476                   break;
5477                 }
5478             }
5479           if (CONSP (val))
5480             break;
5481         }
5482     }
       /* Reached both by `goto too_short' (the source ended in the middle
          of a multibyte code) and by any `break' out of the main loop.  */
5483  too_short:
5484   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5485   return 0;
5486
5487  no_more_source:
5488   detect_info->found |= found;
5489   return 1;
5490 }
5491
/* Decode the source bytes of CODING (starting at coding->consumed) for
   a charset-type coding system and append the resulting character
   codes to coding->charbuf.

   The first byte of each character indexes the `charset_valids'
   attribute of the coding system.  The entry is either a single
   charset ID (an integer) or a list of candidate charset IDs; any
   other value marks the byte as invalid.  Additional bytes are read
   according to the dimension of the charset.  A byte sequence that
   cannot be decoded is consumed one byte at a time and stored as a
   raw byte.

   On exit, coding->consumed_char, coding->consumed and
   coding->charbuf_used are updated to describe what was processed.

   NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
   jumps to `no_more_source' when the input is exhausted and uses the
   locals SRC, SRC_END, CONSUMED_CHARS and MULTIBYTEP -- confirm
   against the macro definition.  */
static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the character currently being decoded; used to rewind
     SRC on invalid input and to report how much was consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  /* Not referenced directly below; presumably read by ONE_MORE_BYTE.  */
  bool multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
     LAST_ID remember where the current run of a non-ASCII charset
     started and which charset it is.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_DOS is set, or -1 if none is
     pending.  */
  int byte_after_cr = -1;

  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The output buffer is full.  A pending look-ahead byte has
             already advanced SRC, so step SRC_BASE back one byte to
             leave that byte unconsumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Use the byte that was read ahead in the previous
             iteration.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          /* With DOS end-of-line, read one byte ahead after a CR; it
             is handled by the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* Exactly one charset can start with this byte.  Collect
             the remaining bytes of the code point, most significant
             byte first.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE carry over between candidates, so only
                 the extra bytes needed by a larger dimension are
                 read here.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* A non-negative C means this candidate decoded CODE;
                 otherwise try the next charset in the list.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* The charset changed: close the previous non-ASCII run
             with an annotation and start a new run here.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the failed character and emit just its
         first byte: a negated negative value, an ASCII character
         as-is, or any other byte as a raw-byte character.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
    }

 no_more_source:
  /* Close the final non-ASCII run, then record how much input was
     consumed (up to the last completely decoded character) and how
     much of CHARBUF is in use.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5618
5619 static bool
5620 encode_coding_charset (struct coding_system *coding)
5621 {
5622   bool multibytep = coding->dst_multibyte;
5623   int *charbuf = coding->charbuf;
5624   int *charbuf_end = charbuf + coding->charbuf_used;
5625   unsigned char *dst = coding->destination + coding->produced;
5626   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5627   int safe_room = MAX_MULTIBYTE_LENGTH;
5628   ptrdiff_t produced_chars = 0;
5629   Lisp_Object attrs, charset_list;
5630   bool ascii_compatible;
5631   int c;
5632
5633   CODING_GET_INFO (coding, attrs, charset_list);
5634   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5635
5636   while (charbuf < charbuf_end)
5637     {
5638       struct charset *charset;
5639       unsigned code;
5640
5641       ASSURE_DESTINATION (safe_room);
5642       c = *charbuf++;
5643       if (ascii_compatible && ASCII_CHAR_P (c))
5644         EMIT_ONE_ASCII_BYTE (c);
5645       else if (CHAR_BYTE8_P (c))
5646         {
5647           c = CHAR_TO_BYTE8 (c);
5648           EMIT_ONE_BYTE (c);
5649         }
5650       else
5651         {
5652           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5653                                &code, charset);
5654
5655           if (charset)
5656             {
5657               if (CHARSET_DIMENSION (charset) == 1)
5658                 EMIT_ONE_BYTE (code);
5659               else if (CHARSET_DIMENSION (charset) == 2)
5660                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5661               else if (CHARSET_DIMENSION (charset) == 3)
5662                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5663               else
5664                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5665                                  (code >> 8) & 0xFF, code & 0xFF);
5666             }
5667           else
5668             {
5669               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5670                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5671               else
5672                 c = coding->default_char;
5673               EMIT_ONE_BYTE (c);
5674             }
5675         }
5676     }
5677
5678   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5679   coding->produced_char += produced_chars;
5680   coding->produced = dst - coding->destination;
5681   return 0;
5682 }
5683
5684 \f
/*** 10. C library functions ***/
5686
/* Setup coding context CODING from information about CODING_SYSTEM.
   If CODING_SYSTEM is nil, Qundecided is used in its place.  If
   CODING_SYSTEM is invalid, signal an error.

   This fills in coding->id, the common flags, the safe-charset table,
   the default character, and the detector/decoder/encoder function
   pointers, plus the type-specific state selected by the `type'
   attribute of the coding system.  */

void
setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object eol_type;
  Lisp_Object coding_type;
  Lisp_Object val;

  if (NILP (coding_system))
    coding_system = Qundecided;

  CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);

  attrs = CODING_ID_ATTRS (coding->id);
  /* When EOL conversion is inhibited, behave as if the coding system
     specified Unix-style end-of-line.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);

  coding->mode = 0;
  /* Derive the initial flags from the end-of-line type alone: a
     vector requests decoding and detection, any type other than Qunix
     requests decoding and encoding, and Qunix requests nothing.  */
  if (VECTORP (eol_type))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_DETECTION_MASK);
  else if (! EQ (eol_type, Qunix))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_ENCODING_MASK);
  else
    coding->common_flags = 0;
  /* Non-nil post-read, pre-write and for-unibyte attributes each add
     their own flag.  */
  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
    coding->common_flags |= CODING_FOR_UNIBYTE_MASK;

  /* The safe-charsets attribute is a string; the largest charset ID
     it covers is its length minus one.  */
  val = CODING_ATTR_SAFE_CHARSETS (attrs);
  coding->max_charset_id = SCHARS (val) - 1;
  coding->safe_charsets = SDATA (val);
  coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
  coding->carryover_bytes = 0;
  coding->raw_destination = 0;

  /* Install the handlers and the type-specific state.  */
  coding_type = CODING_ATTR_TYPE (attrs);
  if (EQ (coding_type, Qundecided))
    {
      /* No detector of its own; the raw-text handlers are installed
         and detection is requested.  */
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
      coding->spec.undecided.inhibit_nbd
        = (encode_inhibit_flag
           (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
      coding->spec.undecided.inhibit_ied
        = (encode_inhibit_flag
           (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
      coding->spec.undecided.prefer_utf_8
        = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      int i;
      int flags = XINT (AREF (attrs, coding_attr_iso_flags));

      /* Invoke graphic register 0 to plane 0.  */
      CODING_ISO_INVOCATION (coding, 0) = 0;
      /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
      CODING_ISO_INVOCATION (coding, 1)
        = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
      /* Setup the initial status of designation.  */
      for (i = 0; i < 4; i++)
        CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
      /* Not single shifting initially.  */
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;
      /* Beginning of buffer should also be regarded as bol. */
      CODING_ISO_BOL (coding) = 1;
      coding->detector = detect_coding_iso_2022;
      coding->decoder = decode_coding_iso_2022;
      coding->encoder = encode_coding_iso_2022;
      if (flags & CODING_ISO_FLAG_SAFE)
        coding->mode |= CODING_MODE_SAFE_ENCODING;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
      if (flags & CODING_ISO_FLAG_COMPOSITION)
        coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
      if (flags & CODING_ISO_FLAG_DESIGNATION)
        coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
      if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
        {
          /* Refresh the safe-charsets attribute and reload the
             values cached from it above.  */
          setup_iso_safe_charsets (attrs);
          val = CODING_ATTR_SAFE_CHARSETS (attrs);
          coding->max_charset_id = SCHARS (val) - 1;
          coding->safe_charsets = SDATA (val);
        }
      CODING_ISO_FLAGS (coding) = flags;
      CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
      CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
      CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
      CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
    }
  else if (EQ (coding_type, Qcharset))
    {
      coding->detector = detect_coding_charset;
      coding->decoder = decode_coding_charset;
      coding->encoder = encode_coding_charset;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_8))
    {
      /* The BOM attribute selects the mode: a cons means detect, t
         means with BOM, anything else means without BOM.  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                   : EQ (val, Qt) ? utf_with_bom
                                   : utf_without_bom);
      coding->detector = detect_coding_utf_8;
      coding->decoder = decode_coding_utf_8;
      coding->encoder = encode_coding_utf_8;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      /* Same BOM handling as for UTF-8 above.  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                    : EQ (val, Qt) ? utf_with_bom
                                    : utf_without_bom);
      /* Any endian attribute other than Qbig selects little endian.  */
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                       : utf_16_little_endian);
      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qccl))
    {
      coding->detector = detect_coding_ccl;
      coding->decoder = decode_coding_ccl;
      coding->encoder = encode_coding_ccl;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      coding->detector = detect_coding_emacs_mule;
      coding->decoder = decode_coding_emacs_mule;
      coding->encoder = encode_coding_emacs_mule;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
        {
          /* Build a fresh safe-charsets table from
             Vemacs_mule_charset_list: every slot starts as 255 and
             the slot of each listed charset is set to 0.  */
          Lisp_Object tail, safe_charsets;
          int max_charset_id = 0;

          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            if (max_charset_id < XFASTINT (XCAR (tail)))
              max_charset_id = XFASTINT (XCAR (tail));
          safe_charsets = make_uninit_string (max_charset_id + 1);
          memset (SDATA (safe_charsets), 255, max_charset_id + 1);
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
          coding->max_charset_id = max_charset_id;
          coding->safe_charsets = SDATA (safe_charsets);
        }
      coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
      coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
    }
  else if (EQ (coding_type, Qshift_jis))
    {
      coding->detector = detect_coding_sjis;
      coding->decoder = decode_coding_sjis;
      coding->encoder = encode_coding_sjis;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qbig5))
    {
      coding->detector = detect_coding_big5;
      coding->decoder = decode_coding_big5;
      coding->encoder = encode_coding_big5;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else                          /* EQ (coding_type, Qraw_text) */
    {
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      /* Conversion is requested only when the end-of-line type is not
         Qunix; encoding is requested only when that type is also not
         a vector.  */
      if (! EQ (eol_type, Qunix))
        {
          coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
          if (! VECTORP (eol_type))
            coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
        }

    }

  return;
}
5897
5898 /* Return a list of charsets supported by CODING.  */
5899
5900 Lisp_Object
5901 coding_charset_list (struct coding_system *coding)
5902 {
5903   Lisp_Object attrs, charset_list;
5904
5905   CODING_GET_INFO (coding, attrs, charset_list);
5906   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5907     {
5908       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5909
5910       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5911         charset_list = Viso_2022_charset_list;
5912     }
5913   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5914     {
5915       charset_list = Vemacs_mule_charset_list;
5916     }
5917   return charset_list;
5918 }
5919
5920
5921 /* Return a list of charsets supported by CODING-SYSTEM.  */
5922
5923 Lisp_Object
5924 coding_system_charset_list (Lisp_Object coding_system)
5925 {
5926   ptrdiff_t id;
5927   Lisp_Object attrs, charset_list;
5928
5929   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5930   attrs = CODING_ID_ATTRS (id);
5931
5932   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5933     {
5934       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5935
5936       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5937         charset_list = Viso_2022_charset_list;
5938       else
5939         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5940     }
5941   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5942     {
5943       charset_list = Vemacs_mule_charset_list;
5944     }
5945   else
5946     {
5947       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5948     }
5949   return charset_list;
5950 }
5951
5952
5953 /* Return raw-text or one of its subsidiaries that has the same
5954    eol_type as CODING-SYSTEM.  */
5955
5956 Lisp_Object
5957 raw_text_coding_system (Lisp_Object coding_system)
5958 {
5959   Lisp_Object spec, attrs;
5960   Lisp_Object eol_type, raw_text_eol_type;
5961
5962   if (NILP (coding_system))
5963     return Qraw_text;
5964   spec = CODING_SYSTEM_SPEC (coding_system);
5965   attrs = AREF (spec, 0);
5966
5967   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5968     return coding_system;
5969
5970   eol_type = AREF (spec, 2);
5971   if (VECTORP (eol_type))
5972     return Qraw_text;
5973   spec = CODING_SYSTEM_SPEC (Qraw_text);
5974   raw_text_eol_type = AREF (spec, 2);
5975   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5976           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5977           : AREF (raw_text_eol_type, 2));
5978 }
5979
5980 /* Return true if CODING corresponds to raw-text coding-system.  */
5981
5982 bool
5983 raw_text_coding_system_p (struct coding_system *coding)
5984 {
5985   return (coding->decoder == decode_coding_raw_text
5986           && coding->encoder == encode_coding_raw_text) ? true : false;
5987 }
5988
5989
5990 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5991    the subsidiary that has the same eol-spec as PARENT (if it is not
5992    nil and specifies end-of-line format) or the system's setting
5993    (system_eol_type).  */
5994
5995 Lisp_Object
5996 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5997 {
5998   Lisp_Object spec, eol_type;
5999
6000   if (NILP (coding_system))
6001     coding_system = Qraw_text;
6002   else
6003     CHECK_CODING_SYSTEM (coding_system);
6004   spec = CODING_SYSTEM_SPEC (coding_system);
6005   eol_type = AREF (spec, 2);
6006   if (VECTORP (eol_type))
6007     {
6008       Lisp_Object parent_eol_type;
6009
6010       if (! NILP (parent))
6011         {
6012           Lisp_Object parent_spec;
6013
6014           CHECK_CODING_SYSTEM (parent);
6015           parent_spec = CODING_SYSTEM_SPEC (parent);
6016           parent_eol_type = AREF (parent_spec, 2);
6017           if (VECTORP (parent_eol_type))
6018             parent_eol_type = system_eol_type;
6019         }
6020       else
6021         parent_eol_type = system_eol_type;
6022       if (EQ (parent_eol_type, Qunix))
6023         coding_system = AREF (eol_type, 0);
6024       else if (EQ (parent_eol_type, Qdos))
6025         coding_system = AREF (eol_type, 1);
6026       else if (EQ (parent_eol_type, Qmac))
6027         coding_system = AREF (eol_type, 2);
6028     }
6029   return coding_system;
6030 }
6031
6032
6033 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6034    decided for writing to a process.  If not, complement them, and
6035    return a new coding system.  */
6036
6037 Lisp_Object
6038 complement_process_encoding_system (Lisp_Object coding_system)
6039 {
6040   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6041   Lisp_Object spec, attrs;
6042   int i;
6043
6044   for (i = 0; i < 3; i++)
6045     {
6046       if (i == 1)
6047         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6048       else if (i == 2)
6049         coding_system = preferred_coding_system ();
6050       spec = CODING_SYSTEM_SPEC (coding_system);
6051       if (NILP (spec))
6052         continue;
6053       attrs = AREF (spec, 0);
6054       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6055         coding_base = CODING_ATTR_BASE_NAME (attrs);
6056       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6057         eol_base = coding_system;
6058       if (! NILP (coding_base) && ! NILP (eol_base))
6059         break;
6060     }
6061
6062   if (i > 0)
6063     /* The original CODING_SYSTEM didn't specify text-conversion or
6064        eol-conversion.  Be sure that we return a fully complemented
6065        coding system.  */
6066     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6067   return coding_system;
6068 }
6069
6070
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it is impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the categories listed below:
6076
6077    o coding-category-emacs-mule
6078
6079         The category for a coding system which has the same code range
6080         as Emacs' internal format.  Assigned the coding-system (Lisp
6081         symbol) `emacs-mule' by default.
6082
6083    o coding-category-sjis
6084
6085         The category for a coding system which has the same code range
6086         as SJIS.  Assigned the coding-system (Lisp
6087         symbol) `japanese-shift-jis' by default.
6088
6089    o coding-category-iso-7
6090
6091         The category for a coding system which has the same code range
6092         as ISO2022 of 7-bit environment.  This doesn't use any locking
6093         shift and single shift functions.  This can encode/decode all
6094         charsets.  Assigned the coding-system (Lisp symbol)
6095         `iso-2022-7bit' by default.
6096
6097    o coding-category-iso-7-tight
6098
6099         Same as coding-category-iso-7 except that this can
6100         encode/decode only the specified charsets.
6101
6102    o coding-category-iso-8-1
6103
6104         The category for a coding system which has the same code range
6105         as ISO2022 of 8-bit environment and graphic plane 1 used only
6106         for DIMENSION1 charset.  This doesn't use any locking shift
6107         and single shift functions.  Assigned the coding-system (Lisp
6108         symbol) `iso-latin-1' by default.
6109
6110    o coding-category-iso-8-2
6111
6112         The category for a coding system which has the same code range
6113         as ISO2022 of 8-bit environment and graphic plane 1 used only
6114         for DIMENSION2 charset.  This doesn't use any locking shift
6115         and single shift functions.  Assigned the coding-system (Lisp
6116         symbol) `japanese-iso-8bit' by default.
6117
6118    o coding-category-iso-7-else
6119
6120         The category for a coding system which has the same code range
6121         as ISO2022 of 7-bit environment but uses locking shift or
6122         single shift functions.  Assigned the coding-system (Lisp
6123         symbol) `iso-2022-7bit-lock' by default.
6124
6125    o coding-category-iso-8-else
6126
6127         The category for a coding system which has the same code range
6128         as ISO2022 of 8-bit environment but uses locking shift or
6129         single shift functions.  Assigned the coding-system (Lisp
6130         symbol) `iso-2022-8bit-ss2' by default.
6131
6132    o coding-category-big5
6133
6134         The category for a coding system which has the same code range
6135         as BIG5.  Assigned the coding-system (Lisp symbol)
6136         `cn-big5' by default.
6137
6138    o coding-category-utf-8
6139
6140         The category for a coding system which has the same code range
6141         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6142         symbol) `utf-8' by default.
6143
6144    o coding-category-utf-16-be
6145
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in big-endian byte
        order at the head.  Assigned the coding-system (Lisp symbol)
        `utf-16-be' by default.
6150
6151    o coding-category-utf-16-le
6152
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in little-endian
        byte order at the head.  Assigned the coding-system (Lisp
        symbol) `utf-16-le' by default.
6157
6158    o coding-category-ccl
6159
6160         The category for a coding system of which encoder/decoder is
6161         written in CCL programs.  The default value is nil, i.e., no
6162         coding system is assigned.
6163
6164    o coding-category-binary
6165
6166         The category for a coding system not categorized in any of the
6167         above.  Assigned the coding-system (Lisp symbol)
6168         `no-conversion' by default.
6169
   Each of them is a Lisp symbol whose value is an actual coding
   system (also a Lisp symbol) assigned by the user.  What Emacs
   actually does is detect the category of a coding system and then
   use the coding system assigned to that category.  If Emacs cannot
   narrow the candidates down to a single category, it selects the
   category of the highest priority.  The priorities of categories
   are also specified by the user in the Lisp variable
   `coding-category-list'.
6177
6178 */
6179
6180 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6181                                            int eol_seen);
6182
6183
6184 /* Return the number of ASCII characters at the head of the source.
6185    By side effects, set coding->head_ascii and update
6186    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6187    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6188    reliable only when all the source bytes are ASCII.  */
6189
6190 static ptrdiff_t
6191 check_ascii (struct coding_system *coding)
6192 {
6193   const unsigned char *src, *end;
6194   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6195   int eol_seen = coding->eol_seen;
6196
6197   coding_set_source (coding);
6198   src = coding->source;
6199   end = src + coding->src_bytes;
6200
6201   if (inhibit_eol_conversion
6202       || SYMBOLP (eol_type))
6203     {
6204       /* We don't have to check EOL format.  */
6205       while (src < end && !( *src & 0x80))
6206         {
6207           if (*src++ == '\n')
6208             eol_seen |= EOL_SEEN_LF;
6209         }
6210     }
6211   else
6212     {
6213       end--;                /* We look ahead one byte for "CR LF".  */
6214       while (src < end)
6215         {
6216           int c = *src;
6217
6218           if (c & 0x80)
6219             break;
6220           src++;
6221           if (c == '\r')
6222             {
6223               if (*src == '\n')
6224                 {
6225                   eol_seen |= EOL_SEEN_CRLF;
6226                   src++;
6227                 }
6228               else
6229                 eol_seen |= EOL_SEEN_CR;
6230             }
6231           else if (c == '\n')
6232             eol_seen |= EOL_SEEN_LF;
6233         }
6234       if (src == end)
6235         {
6236           int c = *src;
6237
6238           /* All bytes but the last one C are ASCII.  */
6239           if (! (c & 0x80))
6240             {
6241               if (c == '\r')
6242                 eol_seen |= EOL_SEEN_CR;
6243               else if (c  == '\n')
6244                 eol_seen |= EOL_SEEN_LF;
6245               src++;
6246             }
6247         }
6248     }
6249   coding->head_ascii = src - coding->source;
6250   coding->eol_seen = eol_seen;
6251   return (coding->head_ascii);
6252 }
6253
6254
6255 /* Return the number of characters at the source if all the bytes are
6256    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6257    effects, update coding->eol_seen.  The value of coding->eol_seen is
6258    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6259    the value is reliable only when all the source bytes are valid
6260    UTF-8.  */
6261
6262 static ptrdiff_t
6263 check_utf_8 (struct coding_system *coding)
6264 {
6265   const unsigned char *src, *end;
6266   int eol_seen;
6267   ptrdiff_t nchars = coding->head_ascii;
6268
6269   if (coding->head_ascii < 0)
6270     check_ascii (coding);
6271   else
6272     coding_set_source (coding);
6273   src = coding->source + coding->head_ascii;
6274   /* We look ahead one byte for CR LF.  */
6275   end = coding->source + coding->src_bytes - 1;
6276   eol_seen = coding->eol_seen;
6277   while (src < end)
6278     {
6279       int c = *src;
6280
6281       if (UTF_8_1_OCTET_P (*src))
6282         {
6283           src++;
6284           if (c < 0x20)
6285             {
6286               if (c == '\r')
6287                 {
6288                   if (*src == '\n')
6289                     {
6290                       eol_seen |= EOL_SEEN_CRLF;
6291                       src++;
6292                       nchars++;
6293                     }
6294                   else
6295                     eol_seen |= EOL_SEEN_CR;
6296                 }
6297               else if (c == '\n')
6298                 eol_seen |= EOL_SEEN_LF;
6299             }
6300         }
6301       else if (UTF_8_2_OCTET_LEADING_P (c))
6302         {
6303           if (c < 0xC2          /* overlong sequence */
6304               || src + 1 >= end
6305               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6306             return -1;
6307           src += 2;
6308         }
6309       else if (UTF_8_3_OCTET_LEADING_P (c))
6310         {
6311           if (src + 2 >= end
6312               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6313                     && UTF_8_EXTRA_OCTET_P (src[2])))
6314             return -1;
6315           c = (((c & 0xF) << 12)
6316                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6317           if (c < 0x800                       /* overlong sequence */
6318               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6319             return -1;
6320           src += 3;
6321         }
6322       else if (UTF_8_4_OCTET_LEADING_P (c))
6323         {
6324           if (src + 3 >= end
6325               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6326                     && UTF_8_EXTRA_OCTET_P (src[2])
6327                     && UTF_8_EXTRA_OCTET_P (src[3])))
6328             return -1;
6329           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6330                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6331           if (c < 0x10000       /* overlong sequence */
6332               || c >= 0x110000) /* non-Unicode character  */
6333             return -1;
6334           src += 4;
6335         }
6336       else
6337         return -1;
6338       nchars++;
6339     }
6340
6341   if (src == end)
6342     {
6343       if (! UTF_8_1_OCTET_P (*src))
6344         return -1;
6345       nchars++;
6346       if (*src == '\r')
6347         eol_seen |= EOL_SEEN_CR;
6348       else if (*src  == '\n')
6349         eol_seen |= EOL_SEEN_LF;
6350     }
6351   coding->eol_seen = eol_seen;
6352   return nchars;
6353 }
6354
6355
6356 /* Return whether STRING is a valid UTF-8 string.  STRING must be a
6357    unibyte string.  */
6358
6359 bool
6360 utf8_string_p (Lisp_Object string)
6361 {
6362   eassert (!STRING_MULTIBYTE (string));
6363   struct coding_system coding;
6364   setup_coding_system (Qutf_8_unix, &coding);
6365   /* We initialize only the fields that check_utf_8 accesses.  */
6366   coding.head_ascii = -1;
6367   coding.src_pos = 0;
6368   coding.src_pos_byte = 0;
6369   coding.src_chars = SCHARS (string);
6370   coding.src_bytes = SBYTES (string);
6371   coding.src_object = string;
6372   coding.eol_seen = EOL_SEEN_NONE;
6373   return check_utf_8 (&coding) != -1;
6374 }
6375
6376
6377 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6378    SOURCE is encoded.  If CATEGORY is one of
6379    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6380    two-byte, else they are encoded by one-byte.
6381
6382    Return one of EOL_SEEN_XXX.  */
6383
6384 #define MAX_EOL_CHECK_COUNT 3
6385
6386 static int
6387 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6388             enum coding_category category)
6389 {
6390   const unsigned char *src = source, *src_end = src + src_bytes;
6391   unsigned char c;
6392   int total  = 0;
6393   int eol_seen = EOL_SEEN_NONE;
6394
6395   if ((1 << category) & CATEGORY_MASK_UTF_16)
6396     {
6397       bool msb = category == (coding_category_utf_16_le
6398                               | coding_category_utf_16_le_nosig);
6399       bool lsb = !msb;
6400
6401       while (src + 1 < src_end)
6402         {
6403           c = src[lsb];
6404           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6405             {
6406               int this_eol;
6407
6408               if (c == '\n')
6409                 this_eol = EOL_SEEN_LF;
6410               else if (src + 3 >= src_end
6411                        || src[msb + 2] != 0
6412                        || src[lsb + 2] != '\n')
6413                 this_eol = EOL_SEEN_CR;
6414               else
6415                 {
6416                   this_eol = EOL_SEEN_CRLF;
6417                   src += 2;
6418                 }
6419
6420               if (eol_seen == EOL_SEEN_NONE)
6421                 /* This is the first end-of-line.  */
6422                 eol_seen = this_eol;
6423               else if (eol_seen != this_eol)
6424                 {
6425                   /* The found type is different from what found before.
6426                      Allow for stray ^M characters in DOS EOL files.  */
6427                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6428                       || (eol_seen == EOL_SEEN_CRLF
6429                           && this_eol == EOL_SEEN_CR))
6430                     eol_seen = EOL_SEEN_CRLF;
6431                   else
6432                     {
6433                       eol_seen = EOL_SEEN_LF;
6434                       break;
6435                     }
6436                 }
6437               if (++total == MAX_EOL_CHECK_COUNT)
6438                 break;
6439             }
6440           src += 2;
6441         }
6442     }
6443   else
6444     while (src < src_end)
6445       {
6446         c = *src++;
6447         if (c == '\n' || c == '\r')
6448           {
6449             int this_eol;
6450
6451             if (c == '\n')
6452               this_eol = EOL_SEEN_LF;
6453             else if (src >= src_end || *src != '\n')
6454               this_eol = EOL_SEEN_CR;
6455             else
6456               this_eol = EOL_SEEN_CRLF, src++;
6457
6458             if (eol_seen == EOL_SEEN_NONE)
6459               /* This is the first end-of-line.  */
6460               eol_seen = this_eol;
6461             else if (eol_seen != this_eol)
6462               {
6463                 /* The found type is different from what found before.
6464                    Allow for stray ^M characters in DOS EOL files.  */
6465                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6466                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6467                   eol_seen = EOL_SEEN_CRLF;
6468                 else
6469                   {
6470                     eol_seen = EOL_SEEN_LF;
6471                     break;
6472                   }
6473               }
6474             if (++total == MAX_EOL_CHECK_COUNT)
6475               break;
6476           }
6477       }
6478   return eol_seen;
6479 }
6480
6481
6482 static Lisp_Object
6483 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6484 {
6485   Lisp_Object eol_type;
6486
6487   eol_type = CODING_ID_EOL_TYPE (coding->id);
6488   if (! VECTORP (eol_type))
6489     /* Already adjusted.  */
6490     return eol_type;
6491   if (eol_seen & EOL_SEEN_LF)
6492     {
6493       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6494       eol_type = Qunix;
6495     }
6496   else if (eol_seen & EOL_SEEN_CRLF)
6497     {
6498       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6499       eol_type = Qdos;
6500     }
6501   else if (eol_seen & EOL_SEEN_CR)
6502     {
6503       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6504       eol_type = Qmac;
6505     }
6506   return eol_type;
6507 }
6508
6509 /* Detect how a text specified in CODING is encoded.  If a coding
6510    system is detected, update fields of CODING by the detected coding
6511    system.  */
6512
6513 static void
6514 detect_coding (struct coding_system *coding)
6515 {
6516   const unsigned char *src, *src_end;
6517   unsigned int saved_mode = coding->mode;
6518   Lisp_Object found = Qnil;
6519   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6520
6521   coding->consumed = coding->consumed_char = 0;
6522   coding->produced = coding->produced_char = 0;
6523   coding_set_source (coding);
6524
6525   src_end = coding->source + coding->src_bytes;
6526
6527   coding->eol_seen = EOL_SEEN_NONE;
6528   /* If we have not yet decided the text encoding type, detect it
6529      now.  */
6530   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6531     {
6532       int c, i;
6533       struct coding_detection_info detect_info;
6534       bool null_byte_found = 0, eight_bit_found = 0;
6535       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6536                                        inhibit_null_byte_detection);
6537       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6538                                        inhibit_iso_escape_detection);
6539       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6540
6541       coding->head_ascii = 0;
6542       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6543       for (src = coding->source; src < src_end; src++)
6544         {
6545           c = *src;
6546           if (c & 0x80)
6547             {
6548               eight_bit_found = 1;
6549               if (null_byte_found)
6550                 break;
6551             }
6552           else if (c < 0x20)
6553             {
6554               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6555                   && ! inhibit_ied
6556                   && ! detect_info.checked)
6557                 {
6558                   if (detect_coding_iso_2022 (coding, &detect_info))
6559                     {
6560                       /* We have scanned the whole data.  */
6561                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6562                         {
6563                           /* We didn't find an 8-bit code.  We may
6564                              have found a null-byte, but it's very
6565                              rare that a binary file conforms to
6566                              ISO-2022.  */
6567                           src = src_end;
6568                           coding->head_ascii = src - coding->source;
6569                         }
6570                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6571                       break;
6572                     }
6573                 }
6574               else if (! c && !inhibit_nbd)
6575                 {
6576                   null_byte_found = 1;
6577                   if (eight_bit_found)
6578                     break;
6579                 }
6580               else if (! disable_ascii_optimization
6581                        && ! inhibit_eol_conversion)
6582                 {
6583                   if (c == '\r')
6584                     {
6585                       if (src < src_end && src[1] == '\n')
6586                         {
6587                           coding->eol_seen |= EOL_SEEN_CRLF;
6588                           src++;
6589                           if (! eight_bit_found)
6590                             coding->head_ascii++;
6591                         }
6592                       else
6593                         coding->eol_seen |= EOL_SEEN_CR;
6594                     }
6595                   else if (c == '\n')
6596                     {
6597                       coding->eol_seen |= EOL_SEEN_LF;
6598                     }
6599                 }
6600
6601               if (! eight_bit_found)
6602                 coding->head_ascii++;
6603             }
6604           else if (! eight_bit_found)
6605             coding->head_ascii++;
6606         }
6607
6608       if (null_byte_found || eight_bit_found
6609           || coding->head_ascii < coding->src_bytes
6610           || detect_info.found)
6611         {
6612           enum coding_category category;
6613           struct coding_system *this;
6614
6615           if (coding->head_ascii == coding->src_bytes)
6616             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6617             for (i = 0; i < coding_category_raw_text; i++)
6618               {
6619                 category = coding_priorities[i];
6620                 this = coding_categories + category;
6621                 if (detect_info.found & (1 << category))
6622                   break;
6623               }
6624           else
6625             {
6626               if (null_byte_found)
6627                 {
6628                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6629                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6630                 }
6631               else if (prefer_utf_8
6632                        && detect_coding_utf_8 (coding, &detect_info))
6633                 {
6634                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6635                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6636                 }
6637               for (i = 0; i < coding_category_raw_text; i++)
6638                 {
6639                   category = coding_priorities[i];
6640                   this = coding_categories + category;
6641                   /* Some of this->detector (e.g. detect_coding_sjis)
6642                      require this information.  */
6643                   coding->id = this->id;
6644                   if (this->id < 0)
6645                     {
6646                       /* No coding system of this category is defined.  */
6647                       detect_info.rejected |= (1 << category);
6648                     }
6649                   else if (category >= coding_category_raw_text)
6650                     continue;
6651                   else if (detect_info.checked & (1 << category))
6652                     {
6653                       if (detect_info.found & (1 << category))
6654                         break;
6655                     }
6656                   else if ((*(this->detector)) (coding, &detect_info)
6657                            && detect_info.found & (1 << category))
6658                     break;
6659                 }
6660             }
6661
6662           if (i < coding_category_raw_text)
6663             {
6664               if (category == coding_category_utf_8_auto)
6665                 {
6666                   Lisp_Object coding_systems;
6667
6668                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6669                                          coding_attr_utf_bom);
6670                   if (CONSP (coding_systems))
6671                     {
6672                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6673                         found = XCAR (coding_systems);
6674                       else
6675                         found = XCDR (coding_systems);
6676                     }
6677                   else
6678                     found = CODING_ID_NAME (this->id);
6679                 }
6680               else if (category == coding_category_utf_16_auto)
6681                 {
6682                   Lisp_Object coding_systems;
6683
6684                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6685                                          coding_attr_utf_bom);
6686                   if (CONSP (coding_systems))
6687                     {
6688                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6689                         found = XCAR (coding_systems);
6690                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6691                         found = XCDR (coding_systems);
6692                     }
6693                   else
6694                     found = CODING_ID_NAME (this->id);
6695                 }
6696               else
6697                 found = CODING_ID_NAME (this->id);
6698             }
6699           else if (null_byte_found)
6700             found = Qno_conversion;
6701           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6702                    == CATEGORY_MASK_ANY)
6703             found = Qraw_text;
6704           else if (detect_info.rejected)
6705             for (i = 0; i < coding_category_raw_text; i++)
6706               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6707                 {
6708                   this = coding_categories + coding_priorities[i];
6709                   found = CODING_ID_NAME (this->id);
6710                   break;
6711                 }
6712         }
6713     }
6714   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6715            == coding_category_utf_8_auto)
6716     {
6717       Lisp_Object coding_systems;
6718       struct coding_detection_info detect_info;
6719
6720       coding_systems
6721         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6722       detect_info.found = detect_info.rejected = 0;
6723       if (check_ascii (coding) == coding->src_bytes)
6724         {
6725           if (CONSP (coding_systems))
6726             found = XCDR (coding_systems);
6727         }
6728       else
6729         {
6730           if (CONSP (coding_systems)
6731               && detect_coding_utf_8 (coding, &detect_info))
6732             {
6733               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6734                 found = XCAR (coding_systems);
6735               else
6736                 found = XCDR (coding_systems);
6737             }
6738         }
6739     }
6740   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6741            == coding_category_utf_16_auto)
6742     {
6743       Lisp_Object coding_systems;
6744       struct coding_detection_info detect_info;
6745
6746       coding_systems
6747         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6748       detect_info.found = detect_info.rejected = 0;
6749       coding->head_ascii = 0;
6750       if (CONSP (coding_systems)
6751           && detect_coding_utf_16 (coding, &detect_info))
6752         {
6753           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6754             found = XCAR (coding_systems);
6755           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6756             found = XCDR (coding_systems);
6757         }
6758     }
6759
6760   if (! NILP (found))
6761     {
6762       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6763                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6764                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6765                            : EOL_SEEN_LF);
6766
6767       setup_coding_system (found, coding);
6768       if (specified_eol != EOL_SEEN_NONE)
6769         adjust_coding_eol_type (coding, specified_eol);
6770     }
6771
6772   coding->mode = saved_mode;
6773 }
6774
6775
6776 static void
6777 decode_eol (struct coding_system *coding)
6778 {
6779   Lisp_Object eol_type;
6780   unsigned char *p, *pbeg, *pend;
6781
6782   eol_type = CODING_ID_EOL_TYPE (coding->id);
6783   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6784     return;
6785
6786   if (NILP (coding->dst_object))
6787     pbeg = coding->destination;
6788   else
6789     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6790   pend = pbeg + coding->produced;
6791
6792   if (VECTORP (eol_type))
6793     {
6794       int eol_seen = EOL_SEEN_NONE;
6795
6796       for (p = pbeg; p < pend; p++)
6797         {
6798           if (*p == '\n')
6799             eol_seen |= EOL_SEEN_LF;
6800           else if (*p == '\r')
6801             {
6802               if (p + 1 < pend && *(p + 1) == '\n')
6803                 {
6804                   eol_seen |= EOL_SEEN_CRLF;
6805                   p++;
6806                 }
6807               else
6808                 eol_seen |= EOL_SEEN_CR;
6809             }
6810         }
6811       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6812       if ((eol_seen & EOL_SEEN_CRLF) != 0
6813           && (eol_seen & EOL_SEEN_CR) != 0
6814           && (eol_seen & EOL_SEEN_LF) == 0)
6815         eol_seen = EOL_SEEN_CRLF;
6816       else if (eol_seen != EOL_SEEN_NONE
6817           && eol_seen != EOL_SEEN_LF
6818           && eol_seen != EOL_SEEN_CRLF
6819           && eol_seen != EOL_SEEN_CR)
6820         eol_seen = EOL_SEEN_LF;
6821       if (eol_seen != EOL_SEEN_NONE)
6822         eol_type = adjust_coding_eol_type (coding, eol_seen);
6823     }
6824
6825   if (EQ (eol_type, Qmac))
6826     {
6827       for (p = pbeg; p < pend; p++)
6828         if (*p == '\r')
6829           *p = '\n';
6830     }
6831   else if (EQ (eol_type, Qdos))
6832     {
6833       ptrdiff_t n = 0;
6834       ptrdiff_t pos = coding->dst_pos;
6835       ptrdiff_t pos_byte = coding->dst_pos_byte;
6836       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6837
6838       /* This assertion is here instead of code, now deleted, that
6839          handled the NILP case, which no longer happens with the
6840          current codebase.  */
6841       eassert (!NILP (coding->dst_object));
6842
6843       while (pos_byte < pos_end)
6844         {
6845           int incr;
6846
6847           p = BYTE_POS_ADDR (pos_byte);
6848           if (coding->dst_multibyte)
6849             incr = BYTES_BY_CHAR_HEAD (*p);
6850           else
6851             incr = 1;
6852
6853           if (*p == '\r' && p[1] == '\n')
6854             {
6855               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6856               n++;
6857               pos_end--;
6858             }
6859           pos++;
6860           pos_byte += incr;
6861         }
6862       coding->produced -= n;
6863       coding->produced_char -= n;
6864     }
6865 }
6866
6867
6868 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6869    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6870    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6871 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6872
6873 /* Return a translation table (or list of them) from coding system
6874    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6875    not ENCODEP). */
6876
6877 static Lisp_Object
6878 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6879 {
6880   Lisp_Object standard, translation_table;
6881   Lisp_Object val;
6882
6883   if (NILP (Venable_character_translation))
6884     {
6885       if (max_lookup)
6886         *max_lookup = 0;
6887       return Qnil;
6888     }
6889   if (encodep)
6890     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6891       standard = Vstandard_translation_table_for_encode;
6892   else
6893     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6894       standard = Vstandard_translation_table_for_decode;
6895   if (NILP (translation_table))
6896     translation_table = standard;
6897   else
6898     {
6899       if (SYMBOLP (translation_table))
6900         translation_table = Fget (translation_table, Qtranslation_table);
6901       else if (CONSP (translation_table))
6902         {
6903           translation_table = Fcopy_sequence (translation_table);
6904           for (val = translation_table; CONSP (val); val = XCDR (val))
6905             if (SYMBOLP (XCAR (val)))
6906               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6907         }
6908       if (CHAR_TABLE_P (standard))
6909         {
6910           if (CONSP (translation_table))
6911             translation_table = nconc2 (translation_table, list1 (standard));
6912           else
6913             translation_table = list2 (translation_table, standard);
6914         }
6915     }
6916
6917   if (max_lookup)
6918     {
6919       *max_lookup = 1;
6920       if (CHAR_TABLE_P (translation_table)
6921           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6922         {
6923           val = XCHAR_TABLE (translation_table)->extras[1];
6924           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6925             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6926         }
6927       else if (CONSP (translation_table))
6928         {
6929           Lisp_Object tail;
6930
6931           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6932             if (CHAR_TABLE_P (XCAR (tail))
6933                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6934               {
6935                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6936                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6937                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6938               }
6939         }
6940     }
6941   return translation_table;
6942 }
6943
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables (anything else yields no translation).

   C and TRANS must be lvalues.  Whenever a table maps C to a
   character, C is replaced by that character and, for a list, the
   lookup continues in the remaining tables with the new C.  TRANS is
   set to Qnil in that case and when no entry is found; otherwise it
   is set to the first non-nil, non-character entry found, and the
   lookup stops there.

   TABLE is evaluated exactly once.  The internal names end in an
   underscore so that they cannot capture identifiers used in the
   arguments.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_table_ = (table);                                \
                                                                        \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (lookup_table_))                                   \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (lookup_table_, (c));                  \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (lookup_table_))                                     \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);        \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6968
6969
6970 /* Return a translation of character(s) at BUF according to TRANS.
6971    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
6972    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
6973    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
6974    found, or Qt if BUF is too short to lookup characters in FROM.  As
6975    a side effect, if a translation is found, *NCHARS is set to the
6976    number of characters being translated.  */
6977
6978 static Lisp_Object
6979 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6980 {
6981   if (INTEGERP (trans) || VECTORP (trans))
6982     {
6983       *nchars = 1;
6984       return trans;
6985     }
6986   for (; CONSP (trans); trans = XCDR (trans))
6987     {
6988       Lisp_Object val = XCAR (trans);
6989       Lisp_Object from = XCAR (val);
6990       ptrdiff_t len = ASIZE (from);
6991       ptrdiff_t i;
6992
6993       for (i = 0; i < len; i++)
6994         {
6995           if (buf + i == buf_end)
6996             return Qt;
6997           if (XINT (AREF (from, i)) != buf[i])
6998             break;
6999         }
7000       if (i == len)
7001         {
7002           *nchars = len;
7003           return XCDR (val);
7004         }
7005     }
7006   return Qnil;
7007 }
7008
7009
7010 static int
7011 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7012                bool last_block)
7013 {
7014   unsigned char *dst = coding->destination + coding->produced;
7015   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7016   ptrdiff_t produced;
7017   ptrdiff_t produced_chars = 0;
7018   int carryover = 0;
7019
7020   if (! coding->chars_at_source)
7021     {
7022       /* Source characters are in coding->charbuf.  */
7023       int *buf = coding->charbuf;
7024       int *buf_end = buf + coding->charbuf_used;
7025
7026       if (EQ (coding->src_object, coding->dst_object)
7027           && ! NILP (coding->dst_object))
7028         {
7029           eassert (growable_destination (coding));
7030           coding_set_source (coding);
7031           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7032         }
7033
7034       while (buf < buf_end)
7035         {
7036           int c = *buf;
7037           ptrdiff_t i;
7038
7039           if (c >= 0)
7040             {
7041               ptrdiff_t from_nchars = 1, to_nchars = 1;
7042               Lisp_Object trans = Qnil;
7043
7044               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7045               if (! NILP (trans))
7046                 {
7047                   trans = get_translation (trans, buf, buf_end, &from_nchars);
7048                   if (INTEGERP (trans))
7049                     c = XINT (trans);
7050                   else if (VECTORP (trans))
7051                     {
7052                       to_nchars = ASIZE (trans);
7053                       c = XINT (AREF (trans, 0));
7054                     }
7055                   else if (EQ (trans, Qt) && ! last_block)
7056                     break;
7057                 }
7058
7059               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7060                 {
7061                   eassert (growable_destination (coding));
7062                   ptrdiff_t dst_size;
7063                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7064                                           &dst_size)
7065                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7066                     memory_full (SIZE_MAX);
7067                   dst = alloc_destination (coding, dst_size, dst);
7068                   if (EQ (coding->src_object, coding->dst_object))
7069                     {
7070                       coding_set_source (coding);
7071                       dst_end = (((unsigned char *) coding->source)
7072                                  + coding->consumed);
7073                     }
7074                   else
7075                     dst_end = coding->destination + coding->dst_bytes;
7076                 }
7077
7078               for (i = 0; i < to_nchars; i++)
7079                 {
7080                   if (i > 0)
7081                     c = XINT (AREF (trans, i));
7082                   if (coding->dst_multibyte
7083                       || ! CHAR_BYTE8_P (c))
7084                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7085                   else
7086                     *dst++ = CHAR_TO_BYTE8 (c);
7087                 }
7088               produced_chars += to_nchars;
7089               buf += from_nchars;
7090             }
7091           else
7092             /* This is an annotation datum.  (-C) is the length.  */
7093             buf += -c;
7094         }
7095       carryover = buf_end - buf;
7096     }
7097   else
7098     {
7099       /* Source characters are at coding->source.  */
7100       const unsigned char *src = coding->source;
7101       const unsigned char *src_end = src + coding->consumed;
7102
7103       if (EQ (coding->dst_object, coding->src_object))
7104         {
7105           eassert (growable_destination (coding));
7106           dst_end = (unsigned char *) src;
7107         }
7108       if (coding->src_multibyte != coding->dst_multibyte)
7109         {
7110           if (coding->src_multibyte)
7111             {
7112               bool multibytep = 1;
7113               ptrdiff_t consumed_chars = 0;
7114
7115               while (1)
7116                 {
7117                   const unsigned char *src_base = src;
7118                   int c;
7119
7120                   ONE_MORE_BYTE (c);
7121                   if (dst == dst_end)
7122                     {
7123                       eassert (growable_destination (coding));
7124                       if (EQ (coding->src_object, coding->dst_object))
7125                         dst_end = (unsigned char *) src;
7126                       if (dst == dst_end)
7127                         {
7128                           ptrdiff_t offset = src - coding->source;
7129
7130                           dst = alloc_destination (coding, src_end - src + 1,
7131                                                    dst);
7132                           dst_end = coding->destination + coding->dst_bytes;
7133                           coding_set_source (coding);
7134                           src = coding->source + offset;
7135                           src_end = coding->source + coding->consumed;
7136                           if (EQ (coding->src_object, coding->dst_object))
7137                             dst_end = (unsigned char *) src;
7138                         }
7139                     }
7140                   *dst++ = c;
7141                   produced_chars++;
7142                 }
7143             no_more_source:
7144               ;
7145             }
7146           else
7147             while (src < src_end)
7148               {
7149                 bool multibytep = 1;
7150                 int c = *src++;
7151
7152                 if (dst >= dst_end - 1)
7153                   {
7154                     eassert (growable_destination (coding));
7155                     if (EQ (coding->src_object, coding->dst_object))
7156                       dst_end = (unsigned char *) src;
7157                     if (dst >= dst_end - 1)
7158                       {
7159                         ptrdiff_t offset = src - coding->source;
7160                         ptrdiff_t more_bytes;
7161
7162                         if (EQ (coding->src_object, coding->dst_object))
7163                           more_bytes = ((src_end - src) / 2) + 2;
7164                         else
7165                           more_bytes = src_end - src + 2;
7166                         dst = alloc_destination (coding, more_bytes, dst);
7167                         dst_end = coding->destination + coding->dst_bytes;
7168                         coding_set_source (coding);
7169                         src = coding->source + offset;
7170                         src_end = coding->source + coding->consumed;
7171                         if (EQ (coding->src_object, coding->dst_object))
7172                           dst_end = (unsigned char *) src;
7173                       }
7174                   }
7175                 EMIT_ONE_BYTE (c);
7176               }
7177         }
7178       else
7179         {
7180           if (!EQ (coding->src_object, coding->dst_object))
7181             {
7182               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7183
7184               if (require > 0)
7185                 {
7186                   ptrdiff_t offset = src - coding->source;
7187
7188                   dst = alloc_destination (coding, require, dst);
7189                   coding_set_source (coding);
7190                   src = coding->source + offset;
7191                   src_end = coding->source + coding->consumed;
7192                 }
7193             }
7194           produced_chars = coding->consumed_char;
7195           while (src < src_end)
7196             *dst++ = *src++;
7197         }
7198     }
7199
7200   produced = dst - (coding->destination + coding->produced);
7201   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7202     insert_from_gap (produced_chars, produced, 0);
7203   coding->produced += produced;
7204   coding->produced_char += produced_chars;
7205   return carryover;
7206 }
7207
7208 /* Compose text in CODING->object according to the annotation data at
7209    CHARBUF.  CHARBUF is an array:
7210      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7211  */
7212
7213 static void
7214 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7215 {
7216   int len;
7217   ptrdiff_t to;
7218   enum composition_method method;
7219   Lisp_Object components;
7220
7221   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7222   to = pos + charbuf[2];
7223   method = (enum composition_method) (charbuf[4]);
7224
7225   if (method == COMPOSITION_RELATIVE)
7226     components = Qnil;
7227   else
7228     {
7229       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7230       int i, j;
7231
7232       if (method == COMPOSITION_WITH_RULE)
7233         len = charbuf[2] * 3 - 2;
7234       charbuf += MAX_ANNOTATION_LENGTH;
7235       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7236       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7237         {
7238           if (charbuf[i] >= 0)
7239             args[j] = make_number (charbuf[i]);
7240           else
7241             {
7242               i++;
7243               args[j] = make_number (charbuf[i] % 0x100);
7244             }
7245         }
7246       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7247     }
7248   compose_text (pos, to, components, Qnil, coding->dst_object);
7249 }
7250
7251
7252 /* Put `charset' property on text in CODING->object according to
7253    the annotation data at CHARBUF.  CHARBUF is an array:
7254      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7255  */
7256
7257 static void
7258 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7259 {
7260   ptrdiff_t from = pos - charbuf[2];
7261   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7262
7263   Fput_text_property (make_number (from), make_number (pos),
7264                       Qcharset, CHARSET_NAME (charset),
7265                       coding->dst_object);
7266 }
7267
/* Upper bound on the number of `int' units allocated for
   coding->charbuf by ALLOC_CONVERSION_WORK_AREA.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING (a pointer to struct
   coding_system): set CODING->charbuf to a fresh array of `int' and
   CODING->charbuf_size to its length in units.  The length is SIZE
   plus MAX_CHARBUF_EXTRA_SIZE, capped at MAX_CHARBUF_SIZE.

   The memory comes from SAFE_ALLOCA; the callers in this file declare
   USE_SAFE_ALLOCA beforehand and call SAFE_FREE when done.

   CODING is parenthesized in the expansion so that any pointer
   expression, not just a plain identifier, may be passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7281
7282 static void
7283 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7284 {
7285   int *charbuf = coding->charbuf;
7286   int *charbuf_end = charbuf + coding->charbuf_used;
7287
7288   if (NILP (coding->dst_object))
7289     return;
7290
7291   while (charbuf < charbuf_end)
7292     {
7293       if (*charbuf >= 0)
7294         pos++, charbuf++;
7295       else
7296         {
7297           int len = -*charbuf;
7298
7299           if (len > 2)
7300             switch (charbuf[1])
7301               {
7302               case CODING_ANNOTATE_COMPOSITION_MASK:
7303                 produce_composition (coding, charbuf, pos);
7304                 break;
7305               case CODING_ANNOTATE_CHARSET_MASK:
7306                 produce_charset (coding, charbuf, pos);
7307                 break;
7308               default:
7309                 break;
7310               }
7311           charbuf += len;
7312         }
7313     }
7314 }
7315
7316 /* Decode the data at CODING->src_object into CODING->dst_object.
7317    CODING->src_object is a buffer, a string, or nil.
7318    CODING->dst_object is a buffer.
7319
7320    If CODING->src_object is a buffer, it must be the current buffer.
7321    In this case, if CODING->src_pos is positive, it is a position of
7322    the source text in the buffer, otherwise, the source text is in the
7323    gap area of the buffer, and CODING->src_pos specifies the offset of
7324    the text from GPT (which must be the same as PT).  If this is the
7325    same buffer as CODING->dst_object, CODING->src_pos must be
7326    negative.
7327
7328    If CODING->src_object is a string, CODING->src_pos is an index to
7329    that string.
7330
7331    If CODING->src_object is nil, CODING->source must already point to
7332    the non-relocatable memory area.  In this case, CODING->src_pos is
7333    an offset from CODING->source.
7334
7335    The decoded data is inserted at the current point of the buffer
7336    CODING->dst_object.
7337 */
7338
7339 static void
7340 decode_coding (struct coding_system *coding)
7341 {
7342   Lisp_Object attrs;
7343   Lisp_Object undo_list;
7344   Lisp_Object translation_table;
7345   struct ccl_spec cclspec;
7346   int carryover;
7347   int i;
7348
7349   USE_SAFE_ALLOCA;
7350
7351   if (BUFFERP (coding->src_object)
7352       && coding->src_pos > 0
7353       && coding->src_pos < GPT
7354       && coding->src_pos + coding->src_chars > GPT)
7355     move_gap_both (coding->src_pos, coding->src_pos_byte);
7356
7357   undo_list = Qt;
7358   if (BUFFERP (coding->dst_object))
7359     {
7360       set_buffer_internal (XBUFFER (coding->dst_object));
7361       if (GPT != PT)
7362         move_gap_both (PT, PT_BYTE);
7363
7364       /* We must disable undo_list in order to record the whole insert
7365          transaction via record_insert at the end.  But doing so also
7366          disables the recording of the first change to the undo_list.
7367          Therefore we check for first change here and record it via
7368          record_first_change if needed.  */
7369       if (MODIFF <= SAVE_MODIFF)
7370         record_first_change ();
7371
7372       undo_list = BVAR (current_buffer, undo_list);
7373       bset_undo_list (current_buffer, Qt);
7374     }
7375
7376   coding->consumed = coding->consumed_char = 0;
7377   coding->produced = coding->produced_char = 0;
7378   coding->chars_at_source = 0;
7379   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7380
7381   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7382
7383   attrs = CODING_ID_ATTRS (coding->id);
7384   translation_table = get_translation_table (attrs, 0, NULL);
7385
7386   carryover = 0;
7387   if (coding->decoder == decode_coding_ccl)
7388     {
7389       coding->spec.ccl = &cclspec;
7390       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7391     }
7392   do
7393     {
7394       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7395
7396       coding_set_source (coding);
7397       coding->annotated = 0;
7398       coding->charbuf_used = carryover;
7399       (*(coding->decoder)) (coding);
7400       coding_set_destination (coding);
7401       carryover = produce_chars (coding, translation_table, 0);
7402       if (coding->annotated)
7403         produce_annotation (coding, pos);
7404       for (i = 0; i < carryover; i++)
7405         coding->charbuf[i]
7406           = coding->charbuf[coding->charbuf_used - carryover + i];
7407     }
7408   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7409          || (coding->consumed < coding->src_bytes
7410              && (coding->result == CODING_RESULT_SUCCESS
7411                  || coding->result == CODING_RESULT_INVALID_SRC)));
7412
7413   if (carryover > 0)
7414     {
7415       coding_set_destination (coding);
7416       coding->charbuf_used = carryover;
7417       produce_chars (coding, translation_table, 1);
7418     }
7419
7420   coding->carryover_bytes = 0;
7421   if (coding->consumed < coding->src_bytes)
7422     {
7423       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7424       const unsigned char *src;
7425
7426       coding_set_source (coding);
7427       coding_set_destination (coding);
7428       src = coding->source + coding->consumed;
7429
7430       if (coding->mode & CODING_MODE_LAST_BLOCK)
7431         {
7432           /* Flush out unprocessed data as binary chars.  We are sure
7433              that the number of data is less than the size of
7434              coding->charbuf.  */
7435           coding->charbuf_used = 0;
7436           coding->chars_at_source = 0;
7437
7438           while (nbytes-- > 0)
7439             {
7440               int c;
7441
7442               /* Copy raw bytes in their 2-byte forms from multibyte
7443                  text as single characters.  */
7444               if (coding->src_multibyte
7445                   && CHAR_BYTE8_HEAD_P (*src) && nbytes > 0)
7446                 {
7447                   c = STRING_CHAR_ADVANCE (src);
7448                   nbytes--;
7449                 }
7450               else
7451                 {
7452                   c = *src++;
7453
7454                   if (c & 0x80)
7455                     c = BYTE8_TO_CHAR (c);
7456                 }
7457               coding->charbuf[coding->charbuf_used++] = c;
7458             }
7459           produce_chars (coding, Qnil, 1);
7460         }
7461       else
7462         {
7463           /* Record unprocessed bytes in coding->carryover.  We are
7464              sure that the number of data is less than the size of
7465              coding->carryover.  */
7466           unsigned char *p = coding->carryover;
7467
7468           if (nbytes > sizeof coding->carryover)
7469             nbytes = sizeof coding->carryover;
7470           coding->carryover_bytes = nbytes;
7471           while (nbytes-- > 0)
7472             *p++ = *src++;
7473         }
7474       coding->consumed = coding->src_bytes;
7475     }
7476
7477   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7478       && !inhibit_eol_conversion)
7479     decode_eol (coding);
7480   if (BUFFERP (coding->dst_object))
7481     {
7482       bset_undo_list (current_buffer, undo_list);
7483       record_insert (coding->dst_pos, coding->produced_char);
7484     }
7485
7486   SAFE_FREE ();
7487 }
7488
7489
7490 /* Extract an annotation datum from a composition starting at POS and
7491    ending before LIMIT of CODING->src_object (buffer or string), store
7492    the data in BUF, set *STOP to a starting position of the next
7493    composition (if any) or to LIMIT, and return the address of the
7494    next element of BUF.
7495
7496    If such an annotation is not found, set *STOP to a starting
7497    position of a composition after POS (if any) or to LIMIT, and
7498    return BUF.  */
7499
7500 static int *
7501 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7502                                struct coding_system *coding, int *buf,
7503                                ptrdiff_t *stop)
7504 {
7505   ptrdiff_t start, end;
7506   Lisp_Object prop;
7507
7508   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7509       || end > limit)
7510     *stop = limit;
7511   else if (start > pos)
7512     *stop = start;
7513   else
7514     {
7515       if (start == pos)
7516         {
7517           /* We found a composition.  Store the corresponding
7518              annotation data in BUF.  */
7519           int *head = buf;
7520           enum composition_method method = composition_method (prop);
7521           int nchars = COMPOSITION_LENGTH (prop);
7522
7523           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7524           if (method != COMPOSITION_RELATIVE)
7525             {
7526               Lisp_Object components;
7527               ptrdiff_t i, len, i_byte;
7528
7529               components = COMPOSITION_COMPONENTS (prop);
7530               if (VECTORP (components))
7531                 {
7532                   len = ASIZE (components);
7533                   for (i = 0; i < len; i++)
7534                     *buf++ = XINT (AREF (components, i));
7535                 }
7536               else if (STRINGP (components))
7537                 {
7538                   len = SCHARS (components);
7539                   i = i_byte = 0;
7540                   while (i < len)
7541                     {
7542                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7543                       buf++;
7544                     }
7545                 }
7546               else if (INTEGERP (components))
7547                 {
7548                   len = 1;
7549                   *buf++ = XINT (components);
7550                 }
7551               else if (CONSP (components))
7552                 {
7553                   for (len = 0; CONSP (components);
7554                        len++, components = XCDR (components))
7555                     *buf++ = XINT (XCAR (components));
7556                 }
7557               else
7558                 emacs_abort ();
7559               *head -= len;
7560             }
7561         }
7562
7563       if (find_composition (end, limit, &start, &end, &prop,
7564                             coding->src_object)
7565           && end <= limit)
7566         *stop = start;
7567       else
7568         *stop = limit;
7569     }
7570   return buf;
7571 }
7572
7573
7574 /* Extract an annotation datum from a text property `charset' at POS of
7575    CODING->src_object (buffer of string), store the data in BUF, set
7576    *STOP to the position where the value of `charset' property changes
7577    (limiting by LIMIT), and return the address of the next element of
7578    BUF.
7579
7580    If the property value is nil, set *STOP to the position where the
7581    property value is non-nil (limiting by LIMIT), and return BUF.  */
7582
7583 static int *
7584 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7585                            struct coding_system *coding, int *buf,
7586                            ptrdiff_t *stop)
7587 {
7588   Lisp_Object val, next;
7589   int id;
7590
7591   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7592   if (! NILP (val) && CHARSETP (val))
7593     id = XINT (CHARSET_SYMBOL_ID (val));
7594   else
7595     id = -1;
7596   ADD_CHARSET_DATA (buf, 0, id);
7597   next = Fnext_single_property_change (make_number (pos), Qcharset,
7598                                        coding->src_object,
7599                                        make_number (limit));
7600   *stop = XINT (next);
7601   return buf;
7602 }
7603
7604
7605 static void
7606 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7607                int max_lookup)
7608 {
7609   int *buf = coding->charbuf;
7610   int *buf_end = coding->charbuf + coding->charbuf_size;
7611   const unsigned char *src = coding->source + coding->consumed;
7612   const unsigned char *src_end = coding->source + coding->src_bytes;
7613   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7614   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7615   bool multibytep = coding->src_multibyte;
7616   Lisp_Object eol_type;
7617   int c;
7618   ptrdiff_t stop, stop_composition, stop_charset;
7619   int *lookup_buf = NULL;
7620
7621   if (! NILP (translation_table))
7622     lookup_buf = alloca (sizeof (int) * max_lookup);
7623
7624   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7625   if (VECTORP (eol_type))
7626     eol_type = Qunix;
7627
7628   /* Note: composition handling is not yet implemented.  */
7629   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7630
7631   if (NILP (coding->src_object))
7632     stop = stop_composition = stop_charset = end_pos;
7633   else
7634     {
7635       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7636         stop = stop_composition = pos;
7637       else
7638         stop = stop_composition = end_pos;
7639       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7640         stop = stop_charset = pos;
7641       else
7642         stop_charset = end_pos;
7643     }
7644
7645   /* Compensate for CRLF and conversion.  */
7646   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7647   while (buf < buf_end)
7648     {
7649       Lisp_Object trans;
7650
7651       if (pos == stop)
7652         {
7653           if (pos == end_pos)
7654             break;
7655           if (pos == stop_composition)
7656             buf = handle_composition_annotation (pos, end_pos, coding,
7657                                                  buf, &stop_composition);
7658           if (pos == stop_charset)
7659             buf = handle_charset_annotation (pos, end_pos, coding,
7660                                              buf, &stop_charset);
7661           stop = (stop_composition < stop_charset
7662                   ? stop_composition : stop_charset);
7663         }
7664
7665       if (! multibytep)
7666         {
7667           int bytes;
7668
7669           if (coding->encoder == encode_coding_raw_text
7670               || coding->encoder == encode_coding_ccl)
7671             c = *src++, pos++;
7672           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7673             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7674           else
7675             c = BYTE8_TO_CHAR (*src), src++, pos++;
7676         }
7677       else
7678         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7679       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7680         c = '\n';
7681       if (! EQ (eol_type, Qunix))
7682         {
7683           if (c == '\n')
7684             {
7685               if (EQ (eol_type, Qdos))
7686                 *buf++ = '\r';
7687               else
7688                 c = '\r';
7689             }
7690         }
7691
7692       trans = Qnil;
7693       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7694       if (NILP (trans))
7695         *buf++ = c;
7696       else
7697         {
7698           ptrdiff_t from_nchars = 1, to_nchars = 1;
7699           int *lookup_buf_end;
7700           const unsigned char *p = src;
7701           int i;
7702
7703           lookup_buf[0] = c;
7704           for (i = 1; i < max_lookup && p < src_end; i++)
7705             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7706           lookup_buf_end = lookup_buf + i;
7707           trans = get_translation (trans, lookup_buf, lookup_buf_end,
7708                                    &from_nchars);
7709           if (INTEGERP (trans))
7710             c = XINT (trans);
7711           else if (VECTORP (trans))
7712             {
7713               to_nchars = ASIZE (trans);
7714               if (buf_end - buf < to_nchars)
7715                 break;
7716               c = XINT (AREF (trans, 0));
7717             }
7718           else
7719             break;
7720           *buf++ = c;
7721           for (i = 1; i < to_nchars; i++)
7722             *buf++ = XINT (AREF (trans, i));
7723           for (i = 1; i < from_nchars; i++, pos++)
7724             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7725         }
7726     }
7727
7728   coding->consumed = src - coding->source;
7729   coding->consumed_char = pos - coding->src_pos;
7730   coding->charbuf_used = buf - coding->charbuf;
7731   coding->chars_at_source = 0;
7732 }
7733
7734
7735 /* Encode the text at CODING->src_object into CODING->dst_object.
7736    CODING->src_object is a buffer or a string.
7737    CODING->dst_object is a buffer or nil.
7738
7739    If CODING->src_object is a buffer, it must be the current buffer.
7740    In this case, if CODING->src_pos is positive, it is a position of
7741    the source text in the buffer, otherwise. the source text is in the
7742    gap area of the buffer, and coding->src_pos specifies the offset of
7743    the text from GPT (which must be the same as PT).  If this is the
7744    same buffer as CODING->dst_object, CODING->src_pos must be
7745    negative and CODING should not have `pre-write-conversion'.
7746
7747    If CODING->src_object is a string, CODING should not have
7748    `pre-write-conversion'.
7749
7750    If CODING->dst_object is a buffer, the encoded data is inserted at
7751    the current point of that buffer.
7752
7753    If CODING->dst_object is nil, the encoded data is placed at the
7754    memory area specified by CODING->destination.  */
7755
7756 static void
7757 encode_coding (struct coding_system *coding)
7758 {
7759   Lisp_Object attrs;
7760   Lisp_Object translation_table;
7761   int max_lookup;
7762   struct ccl_spec cclspec;
7763
7764   USE_SAFE_ALLOCA;
7765
7766   attrs = CODING_ID_ATTRS (coding->id);
7767   if (coding->encoder == encode_coding_raw_text)
7768     translation_table = Qnil, max_lookup = 0;
7769   else
7770     translation_table = get_translation_table (attrs, 1, &max_lookup);
7771
7772   if (BUFFERP (coding->dst_object))
7773     {
7774       set_buffer_internal (XBUFFER (coding->dst_object));
7775       coding->dst_multibyte
7776         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7777     }
7778
7779   coding->consumed = coding->consumed_char = 0;
7780   coding->produced = coding->produced_char = 0;
7781   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7782
7783   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7784
7785   if (coding->encoder == encode_coding_ccl)
7786     {
7787       coding->spec.ccl = &cclspec;
7788       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7789     }
7790   do {
7791     coding_set_source (coding);
7792     consume_chars (coding, translation_table, max_lookup);
7793     coding_set_destination (coding);
7794     (*(coding->encoder)) (coding);
7795   } while (coding->consumed_char < coding->src_chars);
7796
7797   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7798     insert_from_gap (coding->produced_char, coding->produced, 0);
7799
7800   SAFE_FREE ();
7801 }
7802
7803
/* Name (or base name) of the work buffer used for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* The work buffer used by the top-level conversion.  Once created it
   is never destroyed on purpose (make_conversion_work_buffer creates
   it again if it is found not to be live); its name is
   Vcode_conversion_workbuf_name.  Any other work buffer gets a name
   generated from Vcode_conversion_workbuf_name and is killed once it
   is no longer needed.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   make_conversion_work_buffer, cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
7816
7817
7818 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7819    multibyteness of returning buffer.  */
7820
7821 static Lisp_Object
7822 make_conversion_work_buffer (bool multibyte)
7823 {
7824   Lisp_Object name, workbuf;
7825   struct buffer *current;
7826
7827   if (reused_workbuf_in_use)
7828     {
7829       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7830       workbuf = Fget_buffer_create (name);
7831     }
7832   else
7833     {
7834       reused_workbuf_in_use = 1;
7835       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7836         Vcode_conversion_reused_workbuf
7837           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7838       workbuf = Vcode_conversion_reused_workbuf;
7839     }
7840   current = current_buffer;
7841   set_buffer_internal (XBUFFER (workbuf));
7842   /* We can't allow modification hooks to run in the work buffer.  For
7843      instance, directory_files_internal assumes that file decoding
7844      doesn't compile new regexps.  */
7845   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7846   Ferase_buffer ();
7847   bset_undo_list (current_buffer, Qt);
7848   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7849   set_buffer_internal (current);
7850   return workbuf;
7851 }
7852
7853
7854 static void
7855 code_conversion_restore (Lisp_Object arg)
7856 {
7857   Lisp_Object current, workbuf;
7858
7859   current = XCAR (arg);
7860   workbuf = XCDR (arg);
7861   if (! NILP (workbuf))
7862     {
7863       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7864         reused_workbuf_in_use = 0;
7865       else
7866         Fkill_buffer (workbuf);
7867     }
7868   set_buffer_internal (XBUFFER (current));
7869 }
7870
7871 Lisp_Object
7872 code_conversion_save (bool with_work_buf, bool multibyte)
7873 {
7874   Lisp_Object workbuf = Qnil;
7875
7876   if (with_work_buf)
7877     workbuf = make_conversion_work_buffer (multibyte);
7878   record_unwind_protect (code_conversion_restore,
7879                          Fcons (Fcurrent_buffer (), workbuf));
7880   return workbuf;
7881 }
7882
7883 static void
7884 coding_restore_undo_list (Lisp_Object arg)
7885 {
7886   Lisp_Object undo_list = XCAR (arg);
7887   struct buffer *buf = XBUFFER (XCDR (arg));
7888
7889   bset_undo_list (buf, undo_list);
7890 }
7891
7892 void
7893 decode_coding_gap (struct coding_system *coding,
7894                    ptrdiff_t chars, ptrdiff_t bytes)
7895 {
7896   ptrdiff_t count = SPECPDL_INDEX ();
7897   Lisp_Object attrs;
7898
7899   coding->src_object = Fcurrent_buffer ();
7900   coding->src_chars = chars;
7901   coding->src_bytes = bytes;
7902   coding->src_pos = -chars;
7903   coding->src_pos_byte = -bytes;
7904   coding->src_multibyte = chars < bytes;
7905   coding->dst_object = coding->src_object;
7906   coding->dst_pos = PT;
7907   coding->dst_pos_byte = PT_BYTE;
7908   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7909
7910   coding->head_ascii = -1;
7911   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7912   coding->eol_seen = EOL_SEEN_NONE;
7913   if (CODING_REQUIRE_DETECTION (coding))
7914     detect_coding (coding);
7915   attrs = CODING_ID_ATTRS (coding->id);
7916   if (! disable_ascii_optimization
7917       && ! coding->src_multibyte
7918       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7919       && NILP (CODING_ATTR_POST_READ (attrs))
7920       && NILP (get_translation_table (attrs, 0, NULL)))
7921     {
7922       chars = coding->head_ascii;
7923       if (chars < 0)
7924         chars = check_ascii (coding);
7925       if (chars != bytes)
7926         {
7927           /* There exists a non-ASCII byte.  */
7928           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7929               && coding->detected_utf8_bytes == coding->src_bytes)
7930             {
7931               if (coding->detected_utf8_chars >= 0)
7932                 chars = coding->detected_utf8_chars;
7933               else
7934                 chars = check_utf_8 (coding);
7935               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7936                   && coding->head_ascii == 0
7937                   && coding->source[0] == UTF_8_BOM_1
7938                   && coding->source[1] == UTF_8_BOM_2
7939                   && coding->source[2] == UTF_8_BOM_3)
7940                 {
7941                   chars--;
7942                   bytes -= 3;
7943                   coding->src_bytes -= 3;
7944                 }
7945             }
7946           else
7947             chars = -1;
7948         }
7949       if (chars >= 0)
7950         {
7951           Lisp_Object eol_type;
7952
7953           eol_type = CODING_ID_EOL_TYPE (coding->id);
7954           if (VECTORP (eol_type))
7955             {
7956               if (coding->eol_seen != EOL_SEEN_NONE)
7957                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7958             }
7959           if (EQ (eol_type, Qmac))
7960             {
7961               unsigned char *src_end = GAP_END_ADDR;
7962               unsigned char *src = src_end - coding->src_bytes;
7963
7964               while (src < src_end)
7965                 {
7966                   if (*src++ == '\r')
7967                     src[-1] = '\n';
7968                 }
7969             }
7970           else if (EQ (eol_type, Qdos))
7971             {
7972               unsigned char *src = GAP_END_ADDR;
7973               unsigned char *src_beg = src - coding->src_bytes;
7974               unsigned char *dst = src;
7975               ptrdiff_t diff;
7976
7977               while (src_beg < src)
7978                 {
7979                   *--dst = *--src;
7980                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7981                     src--;
7982                 }
7983               diff = dst - src;
7984               bytes -= diff;
7985               chars -= diff;
7986             }
7987           coding->produced = bytes;
7988           coding->produced_char = chars;
7989           insert_from_gap (chars, bytes, 1);
7990           return;
7991         }
7992     }
7993   code_conversion_save (0, 0);
7994
7995   coding->mode |= CODING_MODE_LAST_BLOCK;
7996   current_buffer->text->inhibit_shrinking = 1;
7997   decode_coding (coding);
7998   current_buffer->text->inhibit_shrinking = 0;
7999
8000   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8001     {
8002       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8003       Lisp_Object val;
8004       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
8005       ptrdiff_t count1 = SPECPDL_INDEX ();
8006
8007       record_unwind_protect (coding_restore_undo_list,
8008                              Fcons (undo_list, Fcurrent_buffer ()));
8009       bset_undo_list (current_buffer, Qt);
8010       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8011       val = call1 (CODING_ATTR_POST_READ (attrs),
8012                    make_number (coding->produced_char));
8013       CHECK_NATNUM (val);
8014       coding->produced_char += Z - prev_Z;
8015       coding->produced += Z_BYTE - prev_Z_BYTE;
8016       unbind_to (count1, Qnil);
8017     }
8018
8019   unbind_to (count, Qnil);
8020 }
8021
8022
8023 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8024    SRC_OBJECT into DST_OBJECT by coding context CODING.
8025
8026    SRC_OBJECT is a buffer, a string, or Qnil.
8027
8028    If it is a buffer, the text is at point of the buffer.  FROM and TO
8029    are positions in the buffer.
8030
8031    If it is a string, the text is at the beginning of the string.
8032    FROM and TO are indices to the string.
8033
8034    If it is nil, the text is at coding->source.  FROM and TO are
8035    indices to coding->source.
8036
8037    DST_OBJECT is a buffer, Qt, or Qnil.
8038
8039    If it is a buffer, the decoded text is inserted at point of the
8040    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8041    is deleted.
8042
8043    If it is Qt, a string is made from the decoded text, and
8044    set in CODING->dst_object.
8045
8046    If it is Qnil, the decoded text is stored at CODING->destination.
8047    The caller must allocate CODING->dst_bytes bytes at
8048    CODING->destination by xmalloc.  If the decoded text is longer than
8049    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8050
        If the coding system has a post-read function, it is called with
        CODING->produced_char after decoding, and CODING->produced and
        CODING->produced_char are then adjusted by the change in buffer
        size that the function caused.

        When SRC_OBJECT and DST_OBJECT are the same buffer, point and the
        markers that sat on the edges of the region are repositioned
        around the decoded text before returning.  Vdeactivate_mark is
        restored to the value it had on entry.  */
8051
8052 void
8053 decode_coding_object (struct coding_system *coding,
8054                       Lisp_Object src_object,
8055                       ptrdiff_t from, ptrdiff_t from_byte,
8056                       ptrdiff_t to, ptrdiff_t to_byte,
8057                       Lisp_Object dst_object)
8058 {
8059   ptrdiff_t count = SPECPDL_INDEX ();
       /* destination and dst_bytes are only assigned, and only read,
          when DST_OBJECT is nil.  */
8060   unsigned char *destination UNINIT;
8061   ptrdiff_t dst_bytes UNINIT;
8062   ptrdiff_t chars = to - from;
8063   ptrdiff_t bytes = to_byte - from_byte;
8064   Lisp_Object attrs;
       /* saved_pt becomes >= 0 only when SRC_OBJECT and DST_OBJECT are the
          same buffer; the point/marker fix-up at the end keys off it.  */
8065   ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
8066   bool need_marker_adjustment = 0;
8067   Lisp_Object old_deactivate_mark;
8068
       /* Remembered here and written back just before returning.  */
8069   old_deactivate_mark = Vdeactivate_mark;
8070
8071   if (NILP (dst_object))
8072     {
8073       destination = coding->destination;
8074       dst_bytes = coding->dst_bytes;
8075     }
8076
8077   coding->src_object = src_object;
8078   coding->src_chars = chars;
8079   coding->src_bytes = bytes;
       /* More bytes than characters means the source is multibyte text.  */
8080   coding->src_multibyte = chars < bytes;
8081
8082   if (STRINGP (src_object))
8083     {
8084       coding->src_pos = from;
8085       coding->src_pos_byte = from_byte;
8086     }
8087   else if (BUFFERP (src_object))
8088     {
8089       set_buffer_internal (XBUFFER (src_object));
           /* Put the gap at the start of the source region.  */
8090       if (from != GPT)
8091         move_gap_both (from, from_byte);
8092       if (EQ (src_object, dst_object))
8093         {
8094           struct Lisp_Marker *tail;
8095
               /* Flag the markers sitting exactly on an edge of the region:
                  insertion-type markers at FROM, all others at TO.  They are
                  repositioned by hand after decoding.  */
8096           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8097             {
8098               tail->need_adjustment
8099                 = tail->charpos == (tail->insertion_type ? from : to);
8100               need_marker_adjustment |= tail->need_adjustment;
8101             }
8102           saved_pt = PT, saved_pt_byte = PT_BYTE;
8103           TEMP_SET_PT_BOTH (from, from_byte);
               /* inhibit_shrinking is cleared again in the point-recovery
                  block at the end of this function.  */
8104           current_buffer->text->inhibit_shrinking = 1;
8105           del_range_both (from, from_byte, to, to_byte, 1);
               /* The source text has just been deleted from the buffer, so the
                  source position is recorded as the negated lengths.
                  NOTE(review): presumably the decoder treats a negative
                  src_pos as "text left in the gap" -- confirm.  */
8106           coding->src_pos = -chars;
8107           coding->src_pos_byte = -bytes;
8108         }
8109       else
8110         {
8111           coding->src_pos = from;
8112           coding->src_pos_byte = from_byte;
8113         }
8114     }
8115
       /* attrs is fetched only after detection has had a chance to run.
          NOTE(review): presumably because detect_coding can change
          coding->id -- confirm.  */
8116   if (CODING_REQUIRE_DETECTION (coding))
8117     detect_coding (coding);
8118   attrs = CODING_ID_ATTRS (coding->id);
8119
       /* Decode into the object returned by code_conversion_save (1, ...)
          when a string result is wanted, or when a post-read function
          exists and the final destination is the raw CODING->destination.  */
8120   if (EQ (dst_object, Qt)
8121       || (! NILP (CODING_ATTR_POST_READ (attrs))
8122           && NILP (dst_object)))
8123     {
8124       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8125       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8126       coding->dst_pos = BEG;
8127       coding->dst_pos_byte = BEG_BYTE;
8128     }
8129   else if (BUFFERP (dst_object))
8130     {
8131       code_conversion_save (0, 0);
8132       coding->dst_object = dst_object;
           /* Output goes to point of the destination buffer; multibyteness
              follows that buffer's enable_multibyte_characters.  */
8133       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8134       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8135       coding->dst_multibyte
8136         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8137     }
8138   else
8139     {
8140       code_conversion_save (0, 0);
8141       coding->dst_object = Qnil;
8142       /* Most callers presume this will return a multibyte result, and they
8143          won't use `binary' or `raw-text' anyway, so let's not worry about
8144          CODING_FOR_UNIBYTE.  */
8145       coding->dst_multibyte = 1;
8146     }
8147
8148   decode_coding (coding);
8149
8150   if (BUFFERP (coding->dst_object))
8151     set_buffer_internal (XBUFFER (coding->dst_object));
8152
8153   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8154     {
8155       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8156       Lisp_Object val;
8157       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
8158       ptrdiff_t count1 = SPECPDL_INDEX ();
8159
           /* Run the post-read function with the undo list set to Qt;
              coding_restore_undo_list is registered with the previous list
              and the current buffer so the unwind puts it back.  */
8160       record_unwind_protect (coding_restore_undo_list,
8161                              Fcons (undo_list, Fcurrent_buffer ()));
8162       bset_undo_list (current_buffer, Qt);
8163       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8164       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8165                         make_number (coding->produced_char));
           /* val is only type-checked; the size change is measured from
              the movement of Z and Z_BYTE instead.  */
8166       CHECK_NATNUM (val);
8167       coding->produced_char += Z - prev_Z;
8168       coding->produced += Z_BYTE - prev_Z_BYTE;
8169       unbind_to (count1, Qnil);
8170     }
8171
8172   if (EQ (dst_object, Qt))
8173     {
8174       coding->dst_object = Fbuffer_string ();
8175     }
8176   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8177     {
           /* The text was decoded into a buffer because of the post-read
              function; move it to the raw destination.
              NOTE(review): the copy below happens only when the caller's
              allocation is too small (dst_bytes < produced); when it is
              large enough nothing is copied here -- confirm intended.  */
8178       set_buffer_internal (XBUFFER (coding->dst_object));
8179       if (dst_bytes < coding->produced)
8180         {
8181           eassert (coding->produced > 0);
8182           destination = xrealloc (destination, coding->produced);
               /* Move the gap out of the produced text so the memcpy below
                  reads one contiguous run starting at BEGV_ADDR.  */
8183           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8184             move_gap_both (BEGV, BEGV_BYTE);
8185           memcpy (destination, BEGV_ADDR, coding->produced);
8186           coding->destination = destination;
8187         }
8188     }
8189
8190   if (saved_pt >= 0)
8191     {
8192       /* This is the case of:
8193          (BUFFERP (src_object) && EQ (src_object, dst_object))
8194          As we have moved PT while replacing the original buffer
8195          contents, we must recover it now.  */
8196       set_buffer_internal (XBUFFER (src_object));
8197       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region is kept; point inside the region goes
              to FROM; point after it is shifted by the size change (counted
              in bytes for both positions when the buffer is unibyte).  */
8198       if (saved_pt < from)
8199         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8200       else if (saved_pt < from + chars)
8201         TEMP_SET_PT_BOTH (from, from_byte);
8202       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8203         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8204                           saved_pt_byte + (coding->produced - bytes));
8205       else
8206         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8207                           saved_pt_byte + (coding->produced - bytes));
8208
8209       if (need_marker_adjustment)
8210         {
8211           struct Lisp_Marker *tail;
8212
               /* Flagged insertion-type markers go back to the start of the
                  decoded text, the other flagged markers to its end.  */
8213           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8214             if (tail->need_adjustment)
8215               {
8216                 tail->need_adjustment = 0;
8217                 if (tail->insertion_type)
8218                   {
8219                     tail->bytepos = from_byte;
8220                     tail->charpos = from;
8221                   }
8222                 else
8223                   {
8224                     tail->bytepos = from_byte + coding->produced;
8225                     tail->charpos
8226                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8227                          ? tail->bytepos : from + coding->produced_char);
8228                   }
8229               }
8230         }
8231     }
8232
8233   Vdeactivate_mark = old_deactivate_mark;
8234   unbind_to (count, coding->dst_object);
8235 }
8236
8237
8238 void
8239 encode_coding_object (struct coding_system *coding,
8240                       Lisp_Object src_object,
8241                       ptrdiff_t from, ptrdiff_t from_byte,
8242                       ptrdiff_t to, ptrdiff_t to_byte,
8243                       Lisp_Object dst_object)
8244 {
8245   ptrdiff_t count = SPECPDL_INDEX ();
8246   ptrdiff_t chars = to - from;
8247   ptrdiff_t bytes = to_byte - from_byte;
8248   Lisp_Object attrs;
8249   ptrdiff_t saved_pt = -1, saved_pt_byte;
8250   bool need_marker_adjustment = 0;
8251   bool kill_src_buffer = 0;
8252   Lisp_Object old_deactivate_mark;
8253
8254   old_deactivate_mark = Vdeactivate_mark;
8255
8256   coding->src_object = src_object;
8257   coding->src_chars = chars;
8258   coding->src_bytes = bytes;
8259   coding->src_multibyte = chars < bytes;
8260
8261   attrs = CODING_ID_ATTRS (coding->id);
8262
8263   if (EQ (src_object, dst_object))
8264     {
8265       struct Lisp_Marker *tail;
8266
8267       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8268         {
8269           tail->need_adjustment
8270             = tail->charpos == (tail->insertion_type ? from : to);
8271           need_marker_adjustment |= tail->need_adjustment;
8272         }
8273     }
8274
8275   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8276     {
8277       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8278       set_buffer_internal (XBUFFER (coding->src_object));
8279       if (STRINGP (src_object))
8280         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8281       else if (BUFFERP (src_object))
8282         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8283       else
8284         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8285
8286       if (EQ (src_object, dst_object))
8287         {
8288           set_buffer_internal (XBUFFER (src_object));
8289           saved_pt = PT, saved_pt_byte = PT_BYTE;
8290           del_range_both (from, from_byte, to, to_byte, 1);
8291           set_buffer_internal (XBUFFER (coding->src_object));
8292         }
8293
8294       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8295                   make_number (BEG), make_number (Z));
8296       if (XBUFFER (coding->src_object) != current_buffer)
8297         kill_src_buffer = 1;
8298       coding->src_object = Fcurrent_buffer ();
8299       if (BEG != GPT)
8300         move_gap_both (BEG, BEG_BYTE);
8301       coding->src_chars = Z - BEG;
8302       coding->src_bytes = Z_BYTE - BEG_BYTE;
8303       coding->src_pos = BEG;
8304       coding->src_pos_byte = BEG_BYTE;
8305       coding->src_multibyte = Z < Z_BYTE;
8306     }
8307   else if (STRINGP (src_object))
8308     {
8309       code_conversion_save (0, 0);
8310       coding->src_pos = from;
8311       coding->src_pos_byte = from_byte;
8312     }
8313   else if (BUFFERP (src_object))
8314     {
8315       code_conversion_save (0, 0);
8316       set_buffer_internal (XBUFFER (src_object));
8317       if (EQ (src_object, dst_object))
8318         {
8319           saved_pt = PT, saved_pt_byte = PT_BYTE;
8320           coding->src_object = del_range_1 (from, to, 1, 1);
8321           coding->src_pos = 0;
8322           coding->src_pos_byte = 0;
8323         }
8324       else
8325         {
8326           if (from < GPT && to >= GPT)
8327             move_gap_both (from, from_byte);
8328           coding->src_pos = from;
8329           coding->src_pos_byte = from_byte;
8330         }
8331     }
8332   else
8333     {
8334       code_conversion_save (0, 0);
8335       coding->src_pos = from;
8336       coding->src_pos_byte = from_byte;
8337     }
8338
8339   if (BUFFERP (dst_object))
8340     {
8341       coding->dst_object = dst_object;
8342       if (EQ (src_object, dst_object))
8343         {
8344           coding->dst_pos = from;
8345           coding->dst_pos_byte = from_byte;
8346         }
8347       else
8348         {
8349           struct buffer *current = current_buffer;
8350
8351           set_buffer_temp (XBUFFER (dst_object));
8352           coding->dst_pos = PT;
8353           coding->dst_pos_byte = PT_BYTE;
8354           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8355           set_buffer_temp (current);
8356         }
8357       coding->dst_multibyte
8358         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8359     }
8360   else if (EQ (dst_object, Qt))
8361     {
8362       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8363       coding->dst_object = Qnil;
8364       coding->destination = xmalloc (dst_bytes);
8365       coding->dst_bytes = dst_bytes;
8366       coding->dst_multibyte = 0;
8367     }
8368   else
8369     {
8370       coding->dst_object = Qnil;
8371       coding->dst_multibyte = 0;
8372     }
8373
8374   encode_coding (coding);
8375
8376   if (EQ (dst_object, Qt))
8377     {
8378       if (BUFFERP (coding->dst_object))
8379         coding->dst_object = Fbuffer_string ();
8380       else if (coding->raw_destination)
8381         /* This is used to avoid creating huge Lisp string.
8382            NOTE: caller who sets `raw_destination' is also
8383            responsible for freeing `destination' buffer.  */
8384         coding->dst_object = Qnil;
8385       else
8386         {
8387           coding->dst_object
8388             = make_unibyte_string ((char *) coding->destination,
8389                                    coding->produced);
8390           xfree (coding->destination);
8391         }
8392     }
8393
8394   if (saved_pt >= 0)
8395     {
8396       /* This is the case of:
8397          (BUFFERP (src_object) && EQ (src_object, dst_object))
8398          As we have moved PT while replacing the original buffer
8399          contents, we must recover it now.  */
8400       set_buffer_internal (XBUFFER (src_object));
8401       if (saved_pt < from)
8402         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8403       else if (saved_pt < from + chars)
8404         TEMP_SET_PT_BOTH (from, from_byte);
8405       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8406         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8407                           saved_pt_byte + (coding->produced - bytes));
8408       else
8409         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8410                           saved_pt_byte + (coding->produced - bytes));
8411
8412       if (need_marker_adjustment)
8413         {
8414           struct Lisp_Marker *tail;
8415
8416           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8417             if (tail->need_adjustment)
8418               {
8419                 tail->need_adjustment = 0;
8420                 if (tail->insertion_type)
8421                   {
8422                     tail->bytepos = from_byte;
8423                     tail->charpos = from;
8424                   }
8425                 else
8426                   {
8427                     tail->bytepos = from_byte + coding->produced;
8428                     tail->charpos
8429                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8430                          ? tail->bytepos : from + coding->produced_char);
8431                   }
8432               }
8433         }
8434     }
8435
8436   if (kill_src_buffer)
8437     Fkill_buffer (coding->src_object);
8438
8439   Vdeactivate_mark = old_deactivate_mark;
8440   unbind_to (count, Qnil);
8441 }
8442
8443
8444 Lisp_Object
8445 preferred_coding_system (void)
8446 {
8447   int id = coding_categories[coding_priorities[0]].id;
8448
8449   return CODING_ID_NAME (id);
8450 }
8451
8452 #if defined (WINDOWSNT) || defined (CYGWIN)
8453
8454 Lisp_Object
8455 from_unicode (Lisp_Object str)
8456 {
8457   CHECK_STRING (str);
8458   if (!STRING_MULTIBYTE (str) &&
8459       SBYTES (str) & 1)
8460     {
8461       str = Fsubstring (str, make_number (0), make_number (-1));
8462     }
8463
8464   return code_convert_string_norecord (str, Qutf_16le, 0);
8465 }
8466
8467 Lisp_Object
8468 from_unicode_buffer (const wchar_t *wstr)
8469 {
8470   /* We get one of the two final null bytes for free.  */
8471   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
8472   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
8473   return from_unicode (str);
8474 }
8475
8476 wchar_t *
8477 to_unicode (Lisp_Object str, Lisp_Object *buf)
8478 {
8479   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8480   /* We need to make another copy (in addition to the one made by
8481      code_convert_string_norecord) to ensure that the final string is
8482      _doubly_ zero terminated --- that is, that the string is
8483      terminated by two zero bytes and one utf-16le null character.
8484      Because strings are already terminated with a single zero byte,
8485      we just add one additional zero. */
8486   str = make_uninit_string (SBYTES (*buf) + 1);
8487   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8488   SDATA (str) [SBYTES (*buf)] = '\0';
8489   *buf = str;
8490   return WCSDATA (*buf);
8491 }
8492
8493 #endif /* WINDOWSNT || CYGWIN */
8494
8495 \f
8496 #ifdef emacs
8497 /*** 11. Emacs Lisp library functions ***/
8498
8499 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8500        doc: /* Return t if OBJECT is nil or a coding-system.
8501 See the documentation of `define-coding-system' for information
8502 about coding-system objects.  */)
8503   (Lisp_Object object)
8504 {
8505   if (NILP (object)
8506       || CODING_SYSTEM_ID (object) >= 0)
8507     return Qt;
8508   if (! SYMBOLP (object)
8509       || NILP (Fget (object, Qcoding_system_define_form)))
8510     return Qnil;
8511   return Qt;
8512 }
8513
8514 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8515        Sread_non_nil_coding_system, 1, 1, 0,
8516        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8517   (Lisp_Object prompt)
8518 {
8519   Lisp_Object val;
8520   do
8521     {
8522       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8523                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8524     }
8525   while (SCHARS (val) == 0);
8526   return (Fintern (val, Qnil));
8527 }
8528
8529 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8530        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8531 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8532 Ignores case when completing coding systems (all Emacs coding systems
8533 are lower-case).  */)
8534   (Lisp_Object prompt, Lisp_Object default_coding_system)
8535 {
8536   Lisp_Object val;
8537   ptrdiff_t count = SPECPDL_INDEX ();
8538
8539   if (SYMBOLP (default_coding_system))
8540     default_coding_system = SYMBOL_NAME (default_coding_system);
8541   specbind (Qcompletion_ignore_case, Qt);
8542   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8543                           Qt, Qnil, Qcoding_system_history,
8544                           default_coding_system, Qnil);
8545   unbind_to (count, Qnil);
8546   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8547 }
8548
8549 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8550        1, 1, 0,
8551        doc: /* Check validity of CODING-SYSTEM.
8552 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8553 It is valid if it is nil or a symbol defined as a coding system by the
8554 function `define-coding-system'.  */)
8555   (Lisp_Object coding_system)
8556 {
8557   Lisp_Object define_form;
8558
8559   define_form = Fget (coding_system, Qcoding_system_define_form);
8560   if (! NILP (define_form))
8561     {
8562       Fput (coding_system, Qcoding_system_define_form, Qnil);
8563       safe_eval (define_form);
8564     }
8565   if (!NILP (Fcoding_system_p (coding_system)))
8566     return coding_system;
8567   xsignal1 (Qcoding_system_error, coding_system);
8568 }
8569
8570 \f
8571 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8572    HIGHEST, return the coding system of the highest
8573    priority among the detected coding systems.  Otherwise return a
8574    list of detected coding systems sorted by their priorities.  If
8575    MULTIBYTEP, it is assumed that the bytes are in correct
8576    multibyte form but contain only ASCII and eight-bit chars.
8577    Otherwise, the bytes are raw bytes.
8578
8579    CODING-SYSTEM controls the detection as below:
8580
8581    If it is nil, detect both text-format and eol-format.  If the
8582    text-format part of CODING-SYSTEM is already specified
8583    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8584    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8585    detect only text-format.  */
8586
8587 Lisp_Object
8588 detect_coding_system (const unsigned char *src,
8589                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8590                       bool highest, bool multibytep,
8591                       Lisp_Object coding_system)
8592 {
8593   const unsigned char *src_end = src + src_bytes;
8594   Lisp_Object attrs, eol_type;
8595   Lisp_Object val = Qnil;
8596   struct coding_system coding;
8597   ptrdiff_t id;
8598   struct coding_detection_info detect_info;
8599   enum coding_category base_category;
8600   bool null_byte_found = 0, eight_bit_found = 0;
8601
8602   if (NILP (coding_system))
8603     coding_system = Qundecided;
8604   setup_coding_system (coding_system, &coding);
8605   attrs = CODING_ID_ATTRS (coding.id);
8606   eol_type = CODING_ID_EOL_TYPE (coding.id);
8607   coding_system = CODING_ATTR_BASE_NAME (attrs);
8608
8609   coding.source = src;
8610   coding.src_chars = src_chars;
8611   coding.src_bytes = src_bytes;
8612   coding.src_multibyte = multibytep;
8613   coding.consumed = 0;
8614   coding.mode |= CODING_MODE_LAST_BLOCK;
8615   coding.head_ascii = 0;
8616
8617   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8618
8619   /* At first, detect text-format if necessary.  */
8620   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8621   if (base_category == coding_category_undecided)
8622     {
8623       enum coding_category category UNINIT;
8624       struct coding_system *this UNINIT;
8625       int c, i;
8626       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8627                                        inhibit_null_byte_detection);
8628       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8629                                        inhibit_iso_escape_detection);
8630       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8631
8632       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8633       for (; src < src_end; src++)
8634         {
8635           c = *src;
8636           if (c & 0x80)
8637             {
8638               eight_bit_found = 1;
8639               if (null_byte_found)
8640                 break;
8641             }
8642           else if (c < 0x20)
8643             {
8644               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8645                   && ! inhibit_ied
8646                   && ! detect_info.checked)
8647                 {
8648                   if (detect_coding_iso_2022 (&coding, &detect_info))
8649                     {
8650                       /* We have scanned the whole data.  */
8651                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8652                         {
8653                           /* We didn't find an 8-bit code.  We may
8654                              have found a null-byte, but it's very
8655                              rare that a binary file conforms to
8656                              ISO-2022.  */
8657                           src = src_end;
8658                           coding.head_ascii = src - coding.source;
8659                         }
8660                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8661                       break;
8662                     }
8663                 }
8664               else if (! c && !inhibit_nbd)
8665                 {
8666                   null_byte_found = 1;
8667                   if (eight_bit_found)
8668                     break;
8669                 }
8670               if (! eight_bit_found)
8671                 coding.head_ascii++;
8672             }
8673           else if (! eight_bit_found)
8674             coding.head_ascii++;
8675         }
8676
8677       if (null_byte_found || eight_bit_found
8678           || coding.head_ascii < coding.src_bytes
8679           || detect_info.found)
8680         {
8681           if (coding.head_ascii == coding.src_bytes)
8682             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8683             for (i = 0; i < coding_category_raw_text; i++)
8684               {
8685                 category = coding_priorities[i];
8686                 this = coding_categories + category;
8687                 if (detect_info.found & (1 << category))
8688                   break;
8689               }
8690           else
8691             {
8692               if (null_byte_found)
8693                 {
8694                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8695                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8696                 }
8697               else if (prefer_utf_8
8698                        && detect_coding_utf_8 (&coding, &detect_info))
8699                 {
8700                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8701                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8702                 }
8703               for (i = 0; i < coding_category_raw_text; i++)
8704                 {
8705                   category = coding_priorities[i];
8706                   this = coding_categories + category;
8707
8708                   if (this->id < 0)
8709                     {
8710                       /* No coding system of this category is defined.  */
8711                       detect_info.rejected |= (1 << category);
8712                     }
8713                   else if (category >= coding_category_raw_text)
8714                     continue;
8715                   else if (detect_info.checked & (1 << category))
8716                     {
8717                       if (highest
8718                           && (detect_info.found & (1 << category)))
8719                         break;
8720                     }
8721                   else if ((*(this->detector)) (&coding, &detect_info)
8722                            && highest
8723                            && (detect_info.found & (1 << category)))
8724                     {
8725                       if (category == coding_category_utf_16_auto)
8726                         {
8727                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8728                             category = coding_category_utf_16_le;
8729                           else
8730                             category = coding_category_utf_16_be;
8731                         }
8732                       break;
8733                     }
8734                 }
8735             }
8736         }
8737
8738       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8739           || null_byte_found)
8740         {
8741           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8742           id = CODING_SYSTEM_ID (Qno_conversion);
8743           val = list1 (make_number (id));
8744         }
8745       else if (! detect_info.rejected && ! detect_info.found)
8746         {
8747           detect_info.found = CATEGORY_MASK_ANY;
8748           id = coding_categories[coding_category_undecided].id;
8749           val = list1 (make_number (id));
8750         }
8751       else if (highest)
8752         {
8753           if (detect_info.found)
8754             {
8755               detect_info.found = 1 << category;
8756               val = list1 (make_number (this->id));
8757             }
8758           else
8759             for (i = 0; i < coding_category_raw_text; i++)
8760               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8761                 {
8762                   detect_info.found = 1 << coding_priorities[i];
8763                   id = coding_categories[coding_priorities[i]].id;
8764                   val = list1 (make_number (id));
8765                   break;
8766                 }
8767         }
8768       else
8769         {
8770           int mask = detect_info.rejected | detect_info.found;
8771           int found = 0;
8772
8773           for (i = coding_category_raw_text - 1; i >= 0; i--)
8774             {
8775               category = coding_priorities[i];
8776               if (! (mask & (1 << category)))
8777                 {
8778                   found |= 1 << category;
8779                   id = coding_categories[category].id;
8780                   if (id >= 0)
8781                     val = list1 (make_number (id));
8782                 }
8783             }
8784           for (i = coding_category_raw_text - 1; i >= 0; i--)
8785             {
8786               category = coding_priorities[i];
8787               if (detect_info.found & (1 << category))
8788                 {
8789                   id = coding_categories[category].id;
8790                   val = Fcons (make_number (id), val);
8791                 }
8792             }
8793           detect_info.found |= found;
8794         }
8795     }
8796   else if (base_category == coding_category_utf_8_auto)
8797     {
8798       if (detect_coding_utf_8 (&coding, &detect_info))
8799         {
8800           struct coding_system *this;
8801
8802           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8803             this = coding_categories + coding_category_utf_8_sig;
8804           else
8805             this = coding_categories + coding_category_utf_8_nosig;
8806           val = list1 (make_number (this->id));
8807         }
8808     }
8809   else if (base_category == coding_category_utf_16_auto)
8810     {
8811       if (detect_coding_utf_16 (&coding, &detect_info))
8812         {
8813           struct coding_system *this;
8814
8815           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8816             this = coding_categories + coding_category_utf_16_le;
8817           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8818             this = coding_categories + coding_category_utf_16_be;
8819           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8820             this = coding_categories + coding_category_utf_16_be_nosig;
8821           else
8822             this = coding_categories + coding_category_utf_16_le_nosig;
8823           val = list1 (make_number (this->id));
8824         }
8825     }
8826   else
8827     {
8828       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8829       val = list1 (make_number (coding.id));
8830     }
8831
8832   /* Then, detect eol-format if necessary.  */
8833   {
8834     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8835     Lisp_Object tail;
8836
8837     if (VECTORP (eol_type))
8838       {
8839         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8840           {
8841             if (null_byte_found)
8842               normal_eol = EOL_SEEN_LF;
8843             else
8844               normal_eol = detect_eol (coding.source, src_bytes,
8845                                        coding_category_raw_text);
8846           }
8847         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8848                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8849           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8850                                       coding_category_utf_16_be);
8851         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8852                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8853           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8854                                       coding_category_utf_16_le);
8855       }
8856     else
8857       {
8858         if (EQ (eol_type, Qunix))
8859           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8860         else if (EQ (eol_type, Qdos))
8861           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8862         else
8863           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8864       }
8865
8866     for (tail = val; CONSP (tail); tail = XCDR (tail))
8867       {
8868         enum coding_category category;
8869         int this_eol;
8870
8871         id = XINT (XCAR (tail));
8872         attrs = CODING_ID_ATTRS (id);
8873         category = XINT (CODING_ATTR_CATEGORY (attrs));
8874         eol_type = CODING_ID_EOL_TYPE (id);
8875         if (VECTORP (eol_type))
8876           {
8877             if (category == coding_category_utf_16_be
8878                 || category == coding_category_utf_16_be_nosig)
8879               this_eol = utf_16_be_eol;
8880             else if (category == coding_category_utf_16_le
8881                      || category == coding_category_utf_16_le_nosig)
8882               this_eol = utf_16_le_eol;
8883             else
8884               this_eol = normal_eol;
8885
8886             if (this_eol == EOL_SEEN_LF)
8887               XSETCAR (tail, AREF (eol_type, 0));
8888             else if (this_eol == EOL_SEEN_CRLF)
8889               XSETCAR (tail, AREF (eol_type, 1));
8890             else if (this_eol == EOL_SEEN_CR)
8891               XSETCAR (tail, AREF (eol_type, 2));
8892             else
8893               XSETCAR (tail, CODING_ID_NAME (id));
8894           }
8895         else
8896           XSETCAR (tail, CODING_ID_NAME (id));
8897       }
8898   }
8899
8900   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8901 }
8902
8903
8904 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8905        2, 3, 0,
8906        doc: /* Detect coding system of the text in the region between START and END.
8907 Return a list of possible coding systems ordered by priority.
8908 The coding systems to try and their priorities follows what
8909 the function `coding-system-priority-list' (which see) returns.
8910
8911 If only ASCII characters are found (except for such ISO-2022 control
8912 characters as ESC), it returns a list of single element `undecided'
8913 or its subsidiary coding system according to a detected end-of-line
8914 format.
8915
8916 If optional argument HIGHEST is non-nil, return the coding system of
8917 highest priority.  */)
8918   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8919 {
8920   ptrdiff_t from, to;
8921   ptrdiff_t from_byte, to_byte;
8922
8923   validate_region (&start, &end);
8924   from = XINT (start), to = XINT (end);
8925   from_byte = CHAR_TO_BYTE (from);
8926   to_byte = CHAR_TO_BYTE (to);
8927
8928   if (from < GPT && to >= GPT)
8929     move_gap_both (to, to_byte);
8930
8931   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8932                                to - from, to_byte - from_byte,
8933                                !NILP (highest),
8934                                !NILP (BVAR (current_buffer
8935                                       , enable_multibyte_characters)),
8936                                Qnil);
8937 }
8938
8939 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8940        1, 2, 0,
8941        doc: /* Detect coding system of the text in STRING.
8942 Return a list of possible coding systems ordered by priority.
8943 The coding systems to try and their priorities follows what
8944 the function `coding-system-priority-list' (which see) returns.
8945
8946 If only ASCII characters are found (except for such ISO-2022 control
8947 characters as ESC), it returns a list of single element `undecided'
8948 or its subsidiary coding system according to a detected end-of-line
8949 format.
8950
8951 If optional argument HIGHEST is non-nil, return the coding system of
8952 highest priority.  */)
8953   (Lisp_Object string, Lisp_Object highest)
8954 {
8955   CHECK_STRING (string);
8956
8957   return detect_coding_system (SDATA (string),
8958                                SCHARS (string), SBYTES (string),
8959                                !NILP (highest), STRING_MULTIBYTE (string),
8960                                Qnil);
8961 }
8962
8963
8964 static bool
8965 char_encodable_p (int c, Lisp_Object attrs)
8966 {
8967   Lisp_Object tail;
8968   struct charset *charset;
8969   Lisp_Object translation_table;
8970
8971   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8972   if (! NILP (translation_table))
8973     c = translate_char (translation_table, c);
8974   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8975        CONSP (tail); tail = XCDR (tail))
8976     {
8977       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8978       if (CHAR_CHARSET_P (c, charset))
8979         break;
8980     }
8981   return (! NILP (tail));
8982 }
8983
8984
8985 /* Return a list of coding systems that safely encode the text between
8986    START and END.  If EXCLUDE is non-nil, it is a list of coding
8987    systems not to check.  The returned list doesn't contain any such
8988    coding systems.  In any case, if the text contains only ASCII or is
8989    unibyte, return t.  */
8990
8991 DEFUN ("find-coding-systems-region-internal",
8992        Ffind_coding_systems_region_internal,
8993        Sfind_coding_systems_region_internal, 2, 3, 0,
8994        doc: /* Internal use only.  */)
8995   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8996 {
8997   Lisp_Object coding_attrs_list, safe_codings;
8998   ptrdiff_t start_byte, end_byte;
8999   const unsigned char *p, *pbeg, *pend;
9000   int c;
9001   Lisp_Object tail, elt, work_table;
9002
9003   if (STRINGP (start))
9004     {
9005       if (!STRING_MULTIBYTE (start)
9006           || SCHARS (start) == SBYTES (start))
9007         return Qt;
9008       start_byte = 0;
9009       end_byte = SBYTES (start);
9010     }
9011   else
9012     {
9013       CHECK_NUMBER_COERCE_MARKER (start);
9014       CHECK_NUMBER_COERCE_MARKER (end);
9015       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9016         args_out_of_range (start, end);
9017       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9018         return Qt;
9019       start_byte = CHAR_TO_BYTE (XINT (start));
9020       end_byte = CHAR_TO_BYTE (XINT (end));
9021       if (XINT (end) - XINT (start) == end_byte - start_byte)
9022         return Qt;
9023
9024       if (XINT (start) < GPT && XINT (end) > GPT)
9025         {
9026           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9027             move_gap_both (XINT (start), start_byte);
9028           else
9029             move_gap_both (XINT (end), end_byte);
9030         }
9031     }
9032
9033   coding_attrs_list = Qnil;
9034   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9035     if (NILP (exclude)
9036         || NILP (Fmemq (XCAR (tail), exclude)))
9037       {
9038         Lisp_Object attrs;
9039
9040         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9041         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9042           {
9043             ASET (attrs, coding_attr_trans_tbl,
9044                   get_translation_table (attrs, 1, NULL));
9045             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9046           }
9047       }
9048
9049   if (STRINGP (start))
9050     p = pbeg = SDATA (start);
9051   else
9052     p = pbeg = BYTE_POS_ADDR (start_byte);
9053   pend = p + (end_byte - start_byte);
9054
9055   while (p < pend && ASCII_CHAR_P (*p)) p++;
9056   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9057
9058   work_table = Fmake_char_table (Qnil, Qnil);
9059   while (p < pend)
9060     {
9061       if (ASCII_CHAR_P (*p))
9062         p++;
9063       else
9064         {
9065           c = STRING_CHAR_ADVANCE (p);
9066           if (!NILP (char_table_ref (work_table, c)))
9067             /* This character was already checked.  Ignore it.  */
9068             continue;
9069
9070           charset_map_loaded = 0;
9071           for (tail = coding_attrs_list; CONSP (tail);)
9072             {
9073               elt = XCAR (tail);
9074               if (NILP (elt))
9075                 tail = XCDR (tail);
9076               else if (char_encodable_p (c, elt))
9077                 tail = XCDR (tail);
9078               else if (CONSP (XCDR (tail)))
9079                 {
9080                   XSETCAR (tail, XCAR (XCDR (tail)));
9081                   XSETCDR (tail, XCDR (XCDR (tail)));
9082                 }
9083               else
9084                 {
9085                   XSETCAR (tail, Qnil);
9086                   tail = XCDR (tail);
9087                 }
9088             }
9089           if (charset_map_loaded)
9090             {
9091               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9092
9093               if (STRINGP (start))
9094                 pbeg = SDATA (start);
9095               else
9096                 pbeg = BYTE_POS_ADDR (start_byte);
9097               p = pbeg + p_offset;
9098               pend = pbeg + pend_offset;
9099             }
9100           char_table_set (work_table, c, Qt);
9101         }
9102     }
9103
9104   safe_codings = list2 (Qraw_text, Qno_conversion);
9105   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9106     if (! NILP (XCAR (tail)))
9107       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9108
9109   return safe_codings;
9110 }
9111
9112
9113 DEFUN ("unencodable-char-position", Funencodable_char_position,
9114        Sunencodable_char_position, 3, 5, 0,
9115        doc: /* Return position of first un-encodable character in a region.
9116 START and END specify the region and CODING-SYSTEM specifies the
9117 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9118
9119 If optional 4th argument COUNT is non-nil, it specifies at most how
9120 many un-encodable characters to search.  In this case, the value is a
9121 list of positions.
9122
9123 If optional 5th argument STRING is non-nil, it is a string to search
9124 for un-encodable characters.  In that case, START and END are indexes
9125 to the string and treated as in `substring'.  */)
9126   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9127    Lisp_Object count, Lisp_Object string)
9128 {
9129   EMACS_INT n;
9130   struct coding_system coding;
9131   Lisp_Object attrs, charset_list, translation_table;
9132   Lisp_Object positions;
9133   ptrdiff_t from, to;
9134   const unsigned char *p, *stop, *pend;
9135   bool ascii_compatible;
9136
9137   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9138   attrs = CODING_ID_ATTRS (coding.id);
9139   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9140     return Qnil;
9141   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9142   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9143   translation_table = get_translation_table (attrs, 1, NULL);
9144
9145   if (NILP (string))
9146     {
9147       validate_region (&start, &end);
9148       from = XINT (start);
9149       to = XINT (end);
9150       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9151           || (ascii_compatible
9152               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9153         return Qnil;
9154       p = CHAR_POS_ADDR (from);
9155       pend = CHAR_POS_ADDR (to);
9156       if (from < GPT && to >= GPT)
9157         stop = GPT_ADDR;
9158       else
9159         stop = pend;
9160     }
9161   else
9162     {
9163       CHECK_STRING (string);
9164       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9165       if (! STRING_MULTIBYTE (string))
9166         return Qnil;
9167       p = SDATA (string) + string_char_to_byte (string, from);
9168       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9169       if (ascii_compatible && (to - from) == (pend - p))
9170         return Qnil;
9171     }
9172
9173   if (NILP (count))
9174     n = 1;
9175   else
9176     {
9177       CHECK_NATNUM (count);
9178       n = XINT (count);
9179     }
9180
9181   positions = Qnil;
9182   charset_map_loaded = 0;
9183   while (1)
9184     {
9185       int c;
9186
9187       if (ascii_compatible)
9188         while (p < stop && ASCII_CHAR_P (*p))
9189           p++, from++;
9190       if (p >= stop)
9191         {
9192           if (p >= pend)
9193             break;
9194           stop = pend;
9195           p = GAP_END_ADDR;
9196         }
9197
9198       c = STRING_CHAR_ADVANCE (p);
9199       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9200           && ! char_charset (translate_char (translation_table, c),
9201                              charset_list, NULL))
9202         {
9203           positions = Fcons (make_number (from), positions);
9204           n--;
9205           if (n == 0)
9206             break;
9207         }
9208
9209       from++;
9210       if (charset_map_loaded && NILP (string))
9211         {
9212           p = CHAR_POS_ADDR (from);
9213           pend = CHAR_POS_ADDR (to);
9214           if (from < GPT && to >= GPT)
9215             stop = GPT_ADDR;
9216           else
9217             stop = pend;
9218           charset_map_loaded = 0;
9219         }
9220     }
9221
9222   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9223 }
9224
9225
9226 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9227        Scheck_coding_systems_region, 3, 3, 0,
9228        doc: /* Check if the region is encodable by coding systems.
9229
9230 START and END are buffer positions specifying the region.
9231 CODING-SYSTEM-LIST is a list of coding systems to check.
9232
9233 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9234 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9235 whole region, POS0, POS1, ... are buffer positions where non-encodable
9236 characters are found.
9237
9238 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9239 value is nil.
9240
9241 START may be a string.  In that case, check if the string is
9242 encodable, and the value contains indices to the string instead of
9243 buffer positions.  END is ignored.
9244
9245 If the current buffer (or START if it is a string) is unibyte, the value
9246 is nil.  */)
9247   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9248 {
9249   Lisp_Object list;
9250   ptrdiff_t start_byte, end_byte;
9251   ptrdiff_t pos;
9252   const unsigned char *p, *pbeg, *pend;
9253   int c;
9254   Lisp_Object tail, elt, attrs;
9255
9256   if (STRINGP (start))
9257     {
9258       if (!STRING_MULTIBYTE (start)
9259           || SCHARS (start) == SBYTES (start))
9260         return Qnil;
9261       start_byte = 0;
9262       end_byte = SBYTES (start);
9263       pos = 0;
9264     }
9265   else
9266     {
9267       CHECK_NUMBER_COERCE_MARKER (start);
9268       CHECK_NUMBER_COERCE_MARKER (end);
9269       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9270         args_out_of_range (start, end);
9271       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9272         return Qnil;
9273       start_byte = CHAR_TO_BYTE (XINT (start));
9274       end_byte = CHAR_TO_BYTE (XINT (end));
9275       if (XINT (end) - XINT (start) == end_byte - start_byte)
9276         return Qnil;
9277
9278       if (XINT (start) < GPT && XINT (end) > GPT)
9279         {
9280           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9281             move_gap_both (XINT (start), start_byte);
9282           else
9283             move_gap_both (XINT (end), end_byte);
9284         }
9285       pos = XINT (start);
9286     }
9287
9288   list = Qnil;
9289   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9290     {
9291       elt = XCAR (tail);
9292       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9293       ASET (attrs, coding_attr_trans_tbl,
9294             get_translation_table (attrs, 1, NULL));
9295       list = Fcons (list2 (elt, attrs), list);
9296     }
9297
9298   if (STRINGP (start))
9299     p = pbeg = SDATA (start);
9300   else
9301     p = pbeg = BYTE_POS_ADDR (start_byte);
9302   pend = p + (end_byte - start_byte);
9303
9304   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9305   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9306
9307   while (p < pend)
9308     {
9309       if (ASCII_CHAR_P (*p))
9310         p++;
9311       else
9312         {
9313           c = STRING_CHAR_ADVANCE (p);
9314
9315           charset_map_loaded = 0;
9316           for (tail = list; CONSP (tail); tail = XCDR (tail))
9317             {
9318               elt = XCDR (XCAR (tail));
9319               if (! char_encodable_p (c, XCAR (elt)))
9320                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9321             }
9322           if (charset_map_loaded)
9323             {
9324               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9325
9326               if (STRINGP (start))
9327                 pbeg = SDATA (start);
9328               else
9329                 pbeg = BYTE_POS_ADDR (start_byte);
9330               p = pbeg + p_offset;
9331               pend = pbeg + pend_offset;
9332             }
9333         }
9334       pos++;
9335     }
9336
9337   tail = list;
9338   list = Qnil;
9339   for (; CONSP (tail); tail = XCDR (tail))
9340     {
9341       elt = XCAR (tail);
9342       if (CONSP (XCDR (XCDR (elt))))
9343         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9344                       list);
9345     }
9346
9347   return list;
9348 }
9349
9350
9351 static Lisp_Object
9352 code_convert_region (Lisp_Object start, Lisp_Object end,
9353                      Lisp_Object coding_system, Lisp_Object dst_object,
9354                      bool encodep, bool norecord)
9355 {
9356   struct coding_system coding;
9357   ptrdiff_t from, from_byte, to, to_byte;
9358   Lisp_Object src_object;
9359
9360   if (NILP (coding_system))
9361     coding_system = Qno_conversion;
9362   else
9363     CHECK_CODING_SYSTEM (coding_system);
9364   src_object = Fcurrent_buffer ();
9365   if (NILP (dst_object))
9366     dst_object = src_object;
9367   else if (! EQ (dst_object, Qt))
9368     CHECK_BUFFER (dst_object);
9369
9370   validate_region (&start, &end);
9371   from = XFASTINT (start);
9372   from_byte = CHAR_TO_BYTE (from);
9373   to = XFASTINT (end);
9374   to_byte = CHAR_TO_BYTE (to);
9375
9376   setup_coding_system (coding_system, &coding);
9377   coding.mode |= CODING_MODE_LAST_BLOCK;
9378
9379   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9380     {
9381       struct buffer *buf = XBUFFER (dst_object);
9382       ptrdiff_t buf_pt = BUF_PT (buf);
9383
9384       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9385     }
9386
9387   if (encodep)
9388     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9389                           dst_object);
9390   else
9391     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9392                           dst_object);
9393   if (! norecord)
9394     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9395
9396   return (BUFFERP (dst_object)
9397           ? make_number (coding.produced_char)
9398           : coding.dst_object);
9399 }
9400
9401
9402 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9403        3, 4, "r\nzCoding system: ",
9404        doc: /* Decode the current region from the specified coding system.
9405 When called from a program, takes four arguments:
9406         START, END, CODING-SYSTEM, and DESTINATION.
9407 START and END are buffer positions.
9408
9409 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9410 If nil, the region between START and END is replaced by the decoded text.
9411 If buffer, the decoded text is inserted in that buffer after point (point
9412 does not move).
9413 In those cases, the length of the decoded text is returned.
9414 If DESTINATION is t, the decoded text is returned.
9415
9416 This function sets `last-coding-system-used' to the precise coding system
9417 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9418 not fully specified.)  */)
9419   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9420 {
9421   return code_convert_region (start, end, coding_system, destination, 0, 0);
9422 }
9423
9424 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9425        3, 4, "r\nzCoding system: ",
9426        doc: /* Encode the current region by specified coding system.
9427 When called from a program, takes four arguments:
9428         START, END, CODING-SYSTEM and DESTINATION.
9429 START and END are buffer positions.
9430
9431 Optional 4th argument DESTINATION specifies where the encoded text goes.
9432 If nil, the region between START and END is replaced by the encoded text.
9433 If buffer, the encoded text is inserted in that buffer after point (point
9434 does not move).
9435 In those cases, the length of the encoded text is returned.
9436 If DESTINATION is t, the encoded text is returned.
9437
9438 This function sets `last-coding-system-used' to the precise coding system
9439 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9440 not fully specified.)  */)
9441   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9442 {
9443   return code_convert_region (start, end, coding_system, destination, 1, 0);
9444 }
9445
9446 Lisp_Object
9447 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9448                      Lisp_Object dst_object, bool encodep, bool nocopy,
9449                      bool norecord)
9450 {
9451   struct coding_system coding;
9452   ptrdiff_t chars, bytes;
9453
9454   CHECK_STRING (string);
9455   if (NILP (coding_system))
9456     {
9457       if (! norecord)
9458         Vlast_coding_system_used = Qno_conversion;
9459       if (NILP (dst_object))
9460         return (nocopy ? Fcopy_sequence (string) : string);
9461     }
9462
9463   if (NILP (coding_system))
9464     coding_system = Qno_conversion;
9465   else
9466     CHECK_CODING_SYSTEM (coding_system);
9467   if (NILP (dst_object))
9468     dst_object = Qt;
9469   else if (! EQ (dst_object, Qt))
9470     CHECK_BUFFER (dst_object);
9471
9472   setup_coding_system (coding_system, &coding);
9473   coding.mode |= CODING_MODE_LAST_BLOCK;
9474   chars = SCHARS (string);
9475   bytes = SBYTES (string);
9476
9477   if (BUFFERP (dst_object))
9478     {
9479       struct buffer *buf = XBUFFER (dst_object);
9480       ptrdiff_t buf_pt = BUF_PT (buf);
9481
9482       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9483     }
9484
9485   if (encodep)
9486     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9487   else
9488     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9489   if (! norecord)
9490     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9491
9492   return (BUFFERP (dst_object)
9493           ? make_number (coding.produced_char)
9494           : coding.dst_object);
9495 }
9496
9497
9498 /* Encode or decode STRING according to CODING_SYSTEM.
9499    Do not set Vlast_coding_system_used.
9500
9501    This function is called only from macros DECODE_FILE and
9502    ENCODE_FILE, thus we ignore character composition.  */
9503
9504 Lisp_Object
9505 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9506                               bool encodep)
9507 {
9508   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9509 }
9510
9511 /* Encode or decode a file name, to or from a unibyte string suitable
9512    for passing to C library functions.  */
9513 Lisp_Object
9514 decode_file_name (Lisp_Object fname)
9515 {
9516 #ifdef WINDOWSNT
9517   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9518      converts the file names either to UTF-16LE or to the system ANSI
9519      codepage internally, depending on the underlying OS; see w32.c.  */
9520   if (! NILP (Fcoding_system_p (Qutf_8)))
9521     return code_convert_string_norecord (fname, Qutf_8, 0);
9522   return fname;
9523 #else  /* !WINDOWSNT */
9524   if (! NILP (Vfile_name_coding_system))
9525     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9526   else if (! NILP (Vdefault_file_name_coding_system))
9527     return code_convert_string_norecord (fname,
9528                                          Vdefault_file_name_coding_system, 0);
9529   else
9530     return fname;
9531 #endif
9532 }
9533
9534 Lisp_Object
9535 encode_file_name (Lisp_Object fname)
9536 {
9537   /* This is especially important during bootstrap and dumping, when
9538      file-name encoding is not yet known, and therefore any non-ASCII
9539      file names are unibyte strings, and could only be thrashed if we
9540      try to encode them.  */
9541   if (!STRING_MULTIBYTE (fname))
9542     return fname;
9543 #ifdef WINDOWSNT
9544   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9545      converts the file names either to UTF-16LE or to the system ANSI
9546      codepage internally, depending on the underlying OS; see w32.c.  */
9547   if (! NILP (Fcoding_system_p (Qutf_8)))
9548     return code_convert_string_norecord (fname, Qutf_8, 1);
9549   return fname;
9550 #else  /* !WINDOWSNT */
9551   if (! NILP (Vfile_name_coding_system))
9552     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9553   else if (! NILP (Vdefault_file_name_coding_system))
9554     return code_convert_string_norecord (fname,
9555                                          Vdefault_file_name_coding_system, 1);
9556   else
9557     return fname;
9558 #endif
9559 }
9560
9561 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9562        2, 4, 0,
9563        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9564
9565 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9566 if the decoding operation is trivial.
9567
9568 Optional fourth arg BUFFER non-nil means that the decoded text is
9569 inserted in that buffer after point (point does not move).  In this
9570 case, the return value is the length of the decoded text.
9571
9572 This function sets `last-coding-system-used' to the precise coding system
9573 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9574 not fully specified.)  */)
9575   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9576 {
9577   return code_convert_string (string, coding_system, buffer,
9578                               0, ! NILP (nocopy), 0);
9579 }
9580
9581 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9582        2, 4, 0,
9583        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9584
9585 Optional third arg NOCOPY non-nil means it is OK to return STRING
9586 itself if the encoding operation is trivial.
9587
9588 Optional fourth arg BUFFER non-nil means that the encoded text is
9589 inserted in that buffer after point (point does not move).  In this
9590 case, the return value is the length of the encoded text.
9591
9592 This function sets `last-coding-system-used' to the precise coding system
9593 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9594 not fully specified.)  */)
9595   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9596 {
9597   return code_convert_string (string, coding_system, buffer,
9598                               1, ! NILP (nocopy), 0);
9599 }
9600
9601 \f
9602 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9603        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9604 Return the corresponding character.  */)
9605   (Lisp_Object code)
9606 {
9607   Lisp_Object spec, attrs, val;
9608   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9609   EMACS_INT ch;
9610   int c;
9611
9612   CHECK_NATNUM (code);
9613   ch = XFASTINT (code);
9614   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9615   attrs = AREF (spec, 0);
9616
9617   if (ASCII_CHAR_P (ch)
9618       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9619     return code;
9620
9621   val = CODING_ATTR_CHARSET_LIST (attrs);
9622   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9623   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9624   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9625
9626   if (ch <= 0x7F)
9627     {
9628       c = ch;
9629       charset = charset_roman;
9630     }
9631   else if (ch >= 0xA0 && ch < 0xDF)
9632     {
9633       c = ch - 0x80;
9634       charset = charset_kana;
9635     }
9636   else
9637     {
9638       EMACS_INT c1 = ch >> 8;
9639       int c2 = ch & 0xFF;
9640
9641       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9642           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9643         error ("Invalid code: %"pI"d", ch);
9644       c = ch;
9645       SJIS_TO_JIS (c);
9646       charset = charset_kanji;
9647     }
9648   c = DECODE_CHAR (charset, c);
9649   if (c < 0)
9650     error ("Invalid code: %"pI"d", ch);
9651   return make_number (c);
9652 }
9653
9654
9655 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9656        doc: /* Encode a Japanese character CH to shift_jis encoding.
9657 Return the corresponding code in SJIS.  */)
9658   (Lisp_Object ch)
9659 {
9660   Lisp_Object spec, attrs, charset_list;
9661   int c;
9662   struct charset *charset;
9663   unsigned code;
9664
9665   CHECK_CHARACTER (ch);
9666   c = XFASTINT (ch);
9667   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9668   attrs = AREF (spec, 0);
9669
9670   if (ASCII_CHAR_P (c)
9671       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9672     return ch;
9673
9674   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9675   charset = char_charset (c, charset_list, &code);
9676   if (code == CHARSET_INVALID_CODE (charset))
9677     error ("Can't encode by shift_jis encoding: %c", c);
9678   JIS_TO_SJIS (code);
9679
9680   return make_number (code);
9681 }
9682
9683 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9684        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9685 Return the corresponding character.  */)
9686   (Lisp_Object code)
9687 {
9688   Lisp_Object spec, attrs, val;
9689   struct charset *charset_roman, *charset_big5, *charset;
9690   EMACS_INT ch;
9691   int c;
9692
9693   CHECK_NATNUM (code);
9694   ch = XFASTINT (code);
9695   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9696   attrs = AREF (spec, 0);
9697
9698   if (ASCII_CHAR_P (ch)
9699       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9700     return code;
9701
9702   val = CODING_ATTR_CHARSET_LIST (attrs);
9703   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9704   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9705
9706   if (ch <= 0x7F)
9707     {
9708       c = ch;
9709       charset = charset_roman;
9710     }
9711   else
9712     {
9713       EMACS_INT b1 = ch >> 8;
9714       int b2 = ch & 0x7F;
9715       if (b1 < 0xA1 || b1 > 0xFE
9716           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9717         error ("Invalid code: %"pI"d", ch);
9718       c = ch;
9719       charset = charset_big5;
9720     }
9721   c = DECODE_CHAR (charset, c);
9722   if (c < 0)
9723     error ("Invalid code: %"pI"d", ch);
9724   return make_number (c);
9725 }
9726
9727 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9728        doc: /* Encode the Big5 character CH to BIG5 coding system.
9729 Return the corresponding character code in Big5.  */)
9730   (Lisp_Object ch)
9731 {
9732   Lisp_Object spec, attrs, charset_list;
9733   struct charset *charset;
9734   int c;
9735   unsigned code;
9736
9737   CHECK_CHARACTER (ch);
9738   c = XFASTINT (ch);
9739   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9740   attrs = AREF (spec, 0);
9741   if (ASCII_CHAR_P (c)
9742       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9743     return ch;
9744
9745   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9746   charset = char_charset (c, charset_list, &code);
9747   if (code == CHARSET_INVALID_CODE (charset))
9748     error ("Can't encode by Big5 encoding: %c", c);
9749
9750   return make_number (code);
9751 }
9752
9753 \f
9754 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9755        Sset_terminal_coding_system_internal, 1, 2, 0,
9756        doc: /* Internal use only.  */)
9757   (Lisp_Object coding_system, Lisp_Object terminal)
9758 {
9759   struct terminal *term = decode_live_terminal (terminal);
9760   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9761   CHECK_SYMBOL (coding_system);
9762   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9763   /* We had better not send unsafe characters to terminal.  */
9764   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9765   /* Character composition should be disabled.  */
9766   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9767   terminal_coding->src_multibyte = 1;
9768   terminal_coding->dst_multibyte = 0;
9769   tset_charset_list
9770     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9771             ? coding_charset_list (terminal_coding)
9772             : list1 (make_number (charset_ascii))));
9773   return Qnil;
9774 }
9775
9776 DEFUN ("set-safe-terminal-coding-system-internal",
9777        Fset_safe_terminal_coding_system_internal,
9778        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9779        doc: /* Internal use only.  */)
9780   (Lisp_Object coding_system)
9781 {
9782   CHECK_SYMBOL (coding_system);
9783   setup_coding_system (Fcheck_coding_system (coding_system),
9784                        &safe_terminal_coding);
9785   /* Character composition should be disabled.  */
9786   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9787   safe_terminal_coding.src_multibyte = 1;
9788   safe_terminal_coding.dst_multibyte = 0;
9789   return Qnil;
9790 }
9791
9792 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9793        Sterminal_coding_system, 0, 1, 0,
9794        doc: /* Return coding system specified for terminal output on the given terminal.
9795 TERMINAL may be a terminal object, a frame, or nil for the selected
9796 frame's terminal device.  */)
9797   (Lisp_Object terminal)
9798 {
9799   struct coding_system *terminal_coding
9800     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9801   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9802
9803   /* For backward compatibility, return nil if it is `undecided'.  */
9804   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9805 }
9806
9807 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9808        Sset_keyboard_coding_system_internal, 1, 2, 0,
9809        doc: /* Internal use only.  */)
9810   (Lisp_Object coding_system, Lisp_Object terminal)
9811 {
9812   struct terminal *t = decode_live_terminal (terminal);
9813   CHECK_SYMBOL (coding_system);
9814   if (NILP (coding_system))
9815     coding_system = Qno_conversion;
9816   else
9817     Fcheck_coding_system (coding_system);
9818   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9819   /* Character composition should be disabled.  */
9820   TERMINAL_KEYBOARD_CODING (t)->common_flags
9821     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9822   return Qnil;
9823 }
9824
9825 DEFUN ("keyboard-coding-system",
9826        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9827        doc: /* Return coding system specified for decoding keyboard input.  */)
9828   (Lisp_Object terminal)
9829 {
9830   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9831                          (decode_live_terminal (terminal))->id);
9832 }
9833
9834 \f
9835 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9836        Sfind_operation_coding_system,  1, MANY, 0,
9837        doc: /* Choose a coding system for an operation based on the target name.
9838 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9839 DECODING-SYSTEM is the coding system to use for decoding
9840 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9841 for encoding (in case OPERATION does encoding).
9842
9843 The first argument OPERATION specifies an I/O primitive:
9844   For file I/O, `insert-file-contents' or `write-region'.
9845   For process I/O, `call-process', `call-process-region', or `start-process'.
9846   For network I/O, `open-network-stream'.
9847
9848 The remaining arguments should be the same arguments that were passed
9849 to the primitive.  Depending on which primitive, one of those arguments
9850 is selected as the TARGET.  For example, if OPERATION does file I/O,
9851 whichever argument specifies the file name is TARGET.
9852
9853 TARGET has a meaning which depends on OPERATION:
9854   For file I/O, TARGET is a file name (except for the special case below).
9855   For process I/O, TARGET is a process name.
9856   For network I/O, TARGET is a service name or a port number.
9857
9858 This function looks up what is specified for TARGET in
9859 `file-coding-system-alist', `process-coding-system-alist',
9860 or `network-coding-system-alist' depending on OPERATION.
9861 They may specify a coding system, a cons of coding systems,
9862 or a function symbol to call.
9863 In the last case, we call the function with one argument,
9864 which is a list of all the arguments given to this function.
9865 If the function can't decide a coding system, it can return
9866 `undecided' so that the normal code-detection is performed.
9867
9868 If OPERATION is `insert-file-contents', the argument corresponding to
9869 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9870 file name to look up, and BUFFER is a buffer that contains the file's
9871 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9872 function to call for FILENAME, that function should examine the
9873 contents of BUFFER instead of reading the file.
9874
9875 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9876   (ptrdiff_t nargs, Lisp_Object *args)
9877 {
9878   Lisp_Object operation, target_idx, target, val;
9879   register Lisp_Object chain;
9880
9881   if (nargs < 2)
9882     error ("Too few arguments");
9883   operation = args[0];
9884   if (!SYMBOLP (operation)
9885       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9886     error ("Invalid first argument");
9887   if (nargs <= 1 + XFASTINT (target_idx))
9888     error ("Too few arguments for operation `%s'",
9889            SDATA (SYMBOL_NAME (operation)));
9890   target = args[XFASTINT (target_idx) + 1];
9891   if (!(STRINGP (target)
9892         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9893             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9894         || (EQ (operation, Qopen_network_stream)
9895             && (INTEGERP (target) || EQ (target, Qt)))))
9896     error ("Invalid argument %"pI"d of operation `%s'",
9897            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9898   if (CONSP (target))
9899     target = XCAR (target);
9900
9901   chain = ((EQ (operation, Qinsert_file_contents)
9902             || EQ (operation, Qwrite_region))
9903            ? Vfile_coding_system_alist
9904            : (EQ (operation, Qopen_network_stream)
9905               ? Vnetwork_coding_system_alist
9906               : Vprocess_coding_system_alist));
9907   if (NILP (chain))
9908     return Qnil;
9909
9910   for (; CONSP (chain); chain = XCDR (chain))
9911     {
9912       Lisp_Object elt;
9913
9914       elt = XCAR (chain);
9915       if (CONSP (elt)
9916           && ((STRINGP (target)
9917                && STRINGP (XCAR (elt))
9918                && fast_string_match (XCAR (elt), target) >= 0)
9919               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9920         {
9921           val = XCDR (elt);
9922           /* Here, if VAL is both a valid coding system and a valid
9923              function symbol, we return VAL as a coding system.  */
9924           if (CONSP (val))
9925             return val;
9926           if (! SYMBOLP (val))
9927             return Qnil;
9928           if (! NILP (Fcoding_system_p (val)))
9929             return Fcons (val, val);
9930           if (! NILP (Ffboundp (val)))
9931             {
9932               /* We use call1 rather than safe_call1
9933                  so as to get bug reports about functions called here
9934                  which don't handle the current interface.  */
9935               val = call1 (val, Flist (nargs, args));
9936               if (CONSP (val))
9937                 return val;
9938               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9939                 return Fcons (val, val);
9940             }
9941           return Qnil;
9942         }
9943     }
9944   return Qnil;
9945 }
9946
9947 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9948        Sset_coding_system_priority, 0, MANY, 0,
9949        doc: /* Assign higher priority to the coding systems given as arguments.
9950 If multiple coding systems belong to the same category,
9951 all but the first one are ignored.
9952
9953 usage: (set-coding-system-priority &rest coding-systems)  */)
9954   (ptrdiff_t nargs, Lisp_Object *args)
9955 {
9956   ptrdiff_t i, j;
9957   bool changed[coding_category_max];
9958   enum coding_category priorities[coding_category_max];
9959
9960   memset (changed, 0, sizeof changed);
9961
9962   for (i = j = 0; i < nargs; i++)
9963     {
9964       enum coding_category category;
9965       Lisp_Object spec, attrs;
9966
9967       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9968       attrs = AREF (spec, 0);
9969       category = XINT (CODING_ATTR_CATEGORY (attrs));
9970       if (changed[category])
9971         /* Ignore this coding system because a coding system of the
9972            same category already had a higher priority.  */
9973         continue;
9974       changed[category] = 1;
9975       priorities[j++] = category;
9976       if (coding_categories[category].id >= 0
9977           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9978         setup_coding_system (args[i], &coding_categories[category]);
9979       Fset (AREF (Vcoding_category_table, category), args[i]);
9980     }
9981
9982   /* Now we have decided top J priorities.  Reflect the order of the
9983      original priorities to the remaining priorities.  */
9984
9985   for (i = j, j = 0; i < coding_category_max; i++, j++)
9986     {
9987       while (j < coding_category_max
9988              && changed[coding_priorities[j]])
9989         j++;
9990       if (j == coding_category_max)
9991         emacs_abort ();
9992       priorities[i] = coding_priorities[j];
9993     }
9994
9995   memcpy (coding_priorities, priorities, sizeof priorities);
9996
9997   /* Update `coding-category-list'.  */
9998   Vcoding_category_list = Qnil;
9999   for (i = coding_category_max; i-- > 0; )
10000     Vcoding_category_list
10001       = Fcons (AREF (Vcoding_category_table, priorities[i]),
10002                Vcoding_category_list);
10003
10004   return Qnil;
10005 }
10006
10007 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10008        Scoding_system_priority_list, 0, 1, 0,
10009        doc: /* Return a list of coding systems ordered by their priorities.
10010 The list contains a subset of coding systems; i.e. coding systems
10011 assigned to each coding category (see `coding-category-list').
10012
10013 HIGHESTP non-nil means just return the highest priority one.  */)
10014   (Lisp_Object highestp)
10015 {
10016   int i;
10017   Lisp_Object val;
10018
10019   for (i = 0, val = Qnil; i < coding_category_max; i++)
10020     {
10021       enum coding_category category = coding_priorities[i];
10022       int id = coding_categories[category].id;
10023       Lisp_Object attrs;
10024
10025       if (id < 0)
10026         continue;
10027       attrs = CODING_ID_ATTRS (id);
10028       if (! NILP (highestp))
10029         return CODING_ATTR_BASE_NAME (attrs);
10030       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10031     }
10032   return Fnreverse (val);
10033 }
10034
10035 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10036
10037 static Lisp_Object
10038 make_subsidiaries (Lisp_Object base)
10039 {
10040   Lisp_Object subsidiaries;
10041   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10042   USE_SAFE_ALLOCA;
10043   char *buf = SAFE_ALLOCA (base_name_len + 6);
10044   int i;
10045
10046   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10047   subsidiaries = make_uninit_vector (3);
10048   for (i = 0; i < 3; i++)
10049     {
10050       strcpy (buf + base_name_len, suffixes[i]);
10051       ASET (subsidiaries, i, intern (buf));
10052     }
10053   SAFE_FREE ();
10054   return subsidiaries;
10055 }
10056
10057
10058 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10059        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10060        doc: /* For internal use only.
10061 usage: (define-coding-system-internal ...)  */)
10062   (ptrdiff_t nargs, Lisp_Object *args)
10063 {
10064   Lisp_Object name;
10065   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10066   Lisp_Object attrs;            /* Vector of attributes.  */
10067   Lisp_Object eol_type;
10068   Lisp_Object aliases;
10069   Lisp_Object coding_type, charset_list, safe_charsets;
10070   enum coding_category category;
10071   Lisp_Object tail, val;
10072   int max_charset_id = 0;
10073   int i;
10074
10075   if (nargs < coding_arg_max)
10076     goto short_args;
10077
10078   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10079
10080   name = args[coding_arg_name];
10081   CHECK_SYMBOL (name);
10082   ASET (attrs, coding_attr_base_name, name);
10083
10084   val = args[coding_arg_mnemonic];
10085   if (! STRINGP (val))
10086     CHECK_CHARACTER (val);
10087   ASET (attrs, coding_attr_mnemonic, val);
10088
10089   coding_type = args[coding_arg_coding_type];
10090   CHECK_SYMBOL (coding_type);
10091   ASET (attrs, coding_attr_type, coding_type);
10092
10093   charset_list = args[coding_arg_charset_list];
10094   if (SYMBOLP (charset_list))
10095     {
10096       if (EQ (charset_list, Qiso_2022))
10097         {
10098           if (! EQ (coding_type, Qiso_2022))
10099             error ("Invalid charset-list");
10100           charset_list = Viso_2022_charset_list;
10101         }
10102       else if (EQ (charset_list, Qemacs_mule))
10103         {
10104           if (! EQ (coding_type, Qemacs_mule))
10105             error ("Invalid charset-list");
10106           charset_list = Vemacs_mule_charset_list;
10107         }
10108       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10109         {
10110           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10111             error ("Invalid charset-list");
10112           if (max_charset_id < XFASTINT (XCAR (tail)))
10113             max_charset_id = XFASTINT (XCAR (tail));
10114         }
10115     }
10116   else
10117     {
10118       charset_list = Fcopy_sequence (charset_list);
10119       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10120         {
10121           struct charset *charset;
10122
10123           val = XCAR (tail);
10124           CHECK_CHARSET_GET_CHARSET (val, charset);
10125           if (EQ (coding_type, Qiso_2022)
10126               ? CHARSET_ISO_FINAL (charset) < 0
10127               : EQ (coding_type, Qemacs_mule)
10128               ? CHARSET_EMACS_MULE_ID (charset) < 0
10129               : 0)
10130             error ("Can't handle charset `%s'",
10131                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10132
10133           XSETCAR (tail, make_number (charset->id));
10134           if (max_charset_id < charset->id)
10135             max_charset_id = charset->id;
10136         }
10137     }
10138   ASET (attrs, coding_attr_charset_list, charset_list);
10139
10140   safe_charsets = make_uninit_string (max_charset_id + 1);
10141   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10142   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10143     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10144   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10145
10146   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10147
10148   val = args[coding_arg_decode_translation_table];
10149   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10150     CHECK_SYMBOL (val);
10151   ASET (attrs, coding_attr_decode_tbl, val);
10152
10153   val = args[coding_arg_encode_translation_table];
10154   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10155     CHECK_SYMBOL (val);
10156   ASET (attrs, coding_attr_encode_tbl, val);
10157
10158   val = args[coding_arg_post_read_conversion];
10159   CHECK_SYMBOL (val);
10160   ASET (attrs, coding_attr_post_read, val);
10161
10162   val = args[coding_arg_pre_write_conversion];
10163   CHECK_SYMBOL (val);
10164   ASET (attrs, coding_attr_pre_write, val);
10165
10166   val = args[coding_arg_default_char];
10167   if (NILP (val))
10168     ASET (attrs, coding_attr_default_char, make_number (' '));
10169   else
10170     {
10171       CHECK_CHARACTER (val);
10172       ASET (attrs, coding_attr_default_char, val);
10173     }
10174
10175   val = args[coding_arg_for_unibyte];
10176   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10177
10178   val = args[coding_arg_plist];
10179   CHECK_LIST (val);
10180   ASET (attrs, coding_attr_plist, val);
10181
10182   if (EQ (coding_type, Qcharset))
10183     {
10184       /* Generate a lisp vector of 256 elements.  Each element is nil,
10185          integer, or a list of charset IDs.
10186
10187          If Nth element is nil, the byte code N is invalid in this
10188          coding system.
10189
10190          If Nth element is a number NUM, N is the first byte of a
10191          charset whose ID is NUM.
10192
10193          If Nth element is a list of charset IDs, N is the first byte
10194          of one of them.  The list is sorted by dimensions of the
10195          charsets.  A charset of smaller dimension comes first. */
10196       val = Fmake_vector (make_number (256), Qnil);
10197
10198       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10199         {
10200           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10201           int dim = CHARSET_DIMENSION (charset);
10202           int idx = (dim - 1) * 4;
10203
10204           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10205             ASET (attrs, coding_attr_ascii_compat, Qt);
10206
10207           for (i = charset->code_space[idx];
10208                i <= charset->code_space[idx + 1]; i++)
10209             {
10210               Lisp_Object tmp, tmp2;
10211               int dim2;
10212
10213               tmp = AREF (val, i);
10214               if (NILP (tmp))
10215                 tmp = XCAR (tail);
10216               else if (NUMBERP (tmp))
10217                 {
10218                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10219                   if (dim < dim2)
10220                     tmp = list2 (XCAR (tail), tmp);
10221                   else
10222                     tmp = list2 (tmp, XCAR (tail));
10223                 }
10224               else
10225                 {
10226                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10227                     {
10228                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10229                       if (dim < dim2)
10230                         break;
10231                     }
10232                   if (NILP (tmp2))
10233                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10234                   else
10235                     {
10236                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10237                       XSETCAR (tmp2, XCAR (tail));
10238                     }
10239                 }
10240               ASET (val, i, tmp);
10241             }
10242         }
10243       ASET (attrs, coding_attr_charset_valids, val);
10244       category = coding_category_charset;
10245     }
10246   else if (EQ (coding_type, Qccl))
10247     {
10248       Lisp_Object valids;
10249
10250       if (nargs < coding_arg_ccl_max)
10251         goto short_args;
10252
10253       val = args[coding_arg_ccl_decoder];
10254       CHECK_CCL_PROGRAM (val);
10255       if (VECTORP (val))
10256         val = Fcopy_sequence (val);
10257       ASET (attrs, coding_attr_ccl_decoder, val);
10258
10259       val = args[coding_arg_ccl_encoder];
10260       CHECK_CCL_PROGRAM (val);
10261       if (VECTORP (val))
10262         val = Fcopy_sequence (val);
10263       ASET (attrs, coding_attr_ccl_encoder, val);
10264
10265       val = args[coding_arg_ccl_valids];
10266       valids = Fmake_string (make_number (256), make_number (0), Qnil);
10267       for (tail = val; CONSP (tail); tail = XCDR (tail))
10268         {
10269           int from, to;
10270
10271           val = XCAR (tail);
10272           if (INTEGERP (val))
10273             {
10274               if (! (0 <= XINT (val) && XINT (val) <= 255))
10275                 args_out_of_range_3 (val, make_number (0), make_number (255));
10276               from = to = XINT (val);
10277             }
10278           else
10279             {
10280               CHECK_CONS (val);
10281               CHECK_NATNUM_CAR (val);
10282               CHECK_NUMBER_CDR (val);
10283               if (XINT (XCAR (val)) > 255)
10284                 args_out_of_range_3 (XCAR (val),
10285                                      make_number (0), make_number (255));
10286               from = XINT (XCAR (val));
10287               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10288                 args_out_of_range_3 (XCDR (val),
10289                                      XCAR (val), make_number (255));
10290               to = XINT (XCDR (val));
10291             }
10292           for (i = from; i <= to; i++)
10293             SSET (valids, i, 1);
10294         }
10295       ASET (attrs, coding_attr_ccl_valids, valids);
10296
10297       category = coding_category_ccl;
10298     }
10299   else if (EQ (coding_type, Qutf_16))
10300     {
10301       Lisp_Object bom, endian;
10302
10303       ASET (attrs, coding_attr_ascii_compat, Qnil);
10304
10305       if (nargs < coding_arg_utf16_max)
10306         goto short_args;
10307
10308       bom = args[coding_arg_utf16_bom];
10309       if (! NILP (bom) && ! EQ (bom, Qt))
10310         {
10311           CHECK_CONS (bom);
10312           val = XCAR (bom);
10313           CHECK_CODING_SYSTEM (val);
10314           val = XCDR (bom);
10315           CHECK_CODING_SYSTEM (val);
10316         }
10317       ASET (attrs, coding_attr_utf_bom, bom);
10318
10319       endian = args[coding_arg_utf16_endian];
10320       CHECK_SYMBOL (endian);
10321       if (NILP (endian))
10322         endian = Qbig;
10323       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10324         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10325       ASET (attrs, coding_attr_utf_16_endian, endian);
10326
10327       category = (CONSP (bom)
10328                   ? coding_category_utf_16_auto
10329                   : NILP (bom)
10330                   ? (EQ (endian, Qbig)
10331                      ? coding_category_utf_16_be_nosig
10332                      : coding_category_utf_16_le_nosig)
10333                   : (EQ (endian, Qbig)
10334                      ? coding_category_utf_16_be
10335                      : coding_category_utf_16_le));
10336     }
10337   else if (EQ (coding_type, Qiso_2022))
10338     {
10339       Lisp_Object initial, reg_usage, request, flags;
10340
10341       if (nargs < coding_arg_iso2022_max)
10342         goto short_args;
10343
10344       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10345       CHECK_VECTOR (initial);
10346       for (i = 0; i < 4; i++)
10347         {
10348           val = AREF (initial, i);
10349           if (! NILP (val))
10350             {
10351               struct charset *charset;
10352
10353               CHECK_CHARSET_GET_CHARSET (val, charset);
10354               ASET (initial, i, make_number (CHARSET_ID (charset)));
10355               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10356                 ASET (attrs, coding_attr_ascii_compat, Qt);
10357             }
10358           else
10359             ASET (initial, i, make_number (-1));
10360         }
10361
10362       reg_usage = args[coding_arg_iso2022_reg_usage];
10363       CHECK_CONS (reg_usage);
10364       CHECK_NUMBER_CAR (reg_usage);
10365       CHECK_NUMBER_CDR (reg_usage);
10366
10367       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10368       for (tail = request; CONSP (tail); tail = XCDR (tail))
10369         {
10370           int id;
10371           Lisp_Object tmp1;
10372
10373           val = XCAR (tail);
10374           CHECK_CONS (val);
10375           tmp1 = XCAR (val);
10376           CHECK_CHARSET_GET_ID (tmp1, id);
10377           CHECK_NATNUM_CDR (val);
10378           if (XINT (XCDR (val)) >= 4)
10379             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10380           XSETCAR (val, make_number (id));
10381         }
10382
10383       flags = args[coding_arg_iso2022_flags];
10384       CHECK_NATNUM (flags);
10385       i = XINT (flags) & INT_MAX;
10386       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10387         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10388       flags = make_number (i);
10389
10390       ASET (attrs, coding_attr_iso_initial, initial);
10391       ASET (attrs, coding_attr_iso_usage, reg_usage);
10392       ASET (attrs, coding_attr_iso_request, request);
10393       ASET (attrs, coding_attr_iso_flags, flags);
10394       setup_iso_safe_charsets (attrs);
10395
10396       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10397         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10398                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10399                     ? coding_category_iso_7_else
10400                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10401                     ? coding_category_iso_7
10402                     : coding_category_iso_7_tight);
10403       else
10404         {
10405           int id = XINT (AREF (initial, 1));
10406
10407           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10408                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10409                        || id < 0)
10410                       ? coding_category_iso_8_else
10411                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10412                       ? coding_category_iso_8_1
10413                       : coding_category_iso_8_2);
10414         }
10415       if (category != coding_category_iso_8_1
10416           && category != coding_category_iso_8_2)
10417         ASET (attrs, coding_attr_ascii_compat, Qnil);
10418     }
10419   else if (EQ (coding_type, Qemacs_mule))
10420     {
10421       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10422         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10423       ASET (attrs, coding_attr_ascii_compat, Qt);
10424       category = coding_category_emacs_mule;
10425     }
10426   else if (EQ (coding_type, Qshift_jis))
10427     {
10428
10429       struct charset *charset;
10430
10431       if (XINT (Flength (charset_list)) != 3
10432           && XINT (Flength (charset_list)) != 4)
10433         error ("There should be three or four charsets");
10434
10435       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10436       if (CHARSET_DIMENSION (charset) != 1)
10437         error ("Dimension of charset %s is not one",
10438                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10439       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10440         ASET (attrs, coding_attr_ascii_compat, Qt);
10441
10442       charset_list = XCDR (charset_list);
10443       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10444       if (CHARSET_DIMENSION (charset) != 1)
10445         error ("Dimension of charset %s is not one",
10446                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10447
10448       charset_list = XCDR (charset_list);
10449       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10450       if (CHARSET_DIMENSION (charset) != 2)
10451         error ("Dimension of charset %s is not two",
10452                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10453
10454       charset_list = XCDR (charset_list);
10455       if (! NILP (charset_list))
10456         {
10457           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10458           if (CHARSET_DIMENSION (charset) != 2)
10459             error ("Dimension of charset %s is not two",
10460                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10461         }
10462
10463       category = coding_category_sjis;
10464       Vsjis_coding_system = name;
10465     }
10466   else if (EQ (coding_type, Qbig5))
10467     {
10468       struct charset *charset;
10469
10470       if (XINT (Flength (charset_list)) != 2)
10471         error ("There should be just two charsets");
10472
10473       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10474       if (CHARSET_DIMENSION (charset) != 1)
10475         error ("Dimension of charset %s is not one",
10476                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10477       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10478         ASET (attrs, coding_attr_ascii_compat, Qt);
10479
10480       charset_list = XCDR (charset_list);
10481       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10482       if (CHARSET_DIMENSION (charset) != 2)
10483         error ("Dimension of charset %s is not two",
10484                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10485
10486       category = coding_category_big5;
10487       Vbig5_coding_system = name;
10488     }
10489   else if (EQ (coding_type, Qraw_text))
10490     {
10491       category = coding_category_raw_text;
10492       ASET (attrs, coding_attr_ascii_compat, Qt);
10493     }
10494   else if (EQ (coding_type, Qutf_8))
10495     {
10496       Lisp_Object bom;
10497
10498       if (nargs < coding_arg_utf8_max)
10499         goto short_args;
10500
10501       bom = args[coding_arg_utf8_bom];
10502       if (! NILP (bom) && ! EQ (bom, Qt))
10503         {
10504           CHECK_CONS (bom);
10505           val = XCAR (bom);
10506           CHECK_CODING_SYSTEM (val);
10507           val = XCDR (bom);
10508           CHECK_CODING_SYSTEM (val);
10509         }
10510       ASET (attrs, coding_attr_utf_bom, bom);
10511       if (NILP (bom))
10512         ASET (attrs, coding_attr_ascii_compat, Qt);
10513
10514       category = (CONSP (bom) ? coding_category_utf_8_auto
10515                   : NILP (bom) ? coding_category_utf_8_nosig
10516                   : coding_category_utf_8_sig);
10517     }
10518   else if (EQ (coding_type, Qundecided))
10519     {
10520       if (nargs < coding_arg_undecided_max)
10521         goto short_args;
10522       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10523             args[coding_arg_undecided_inhibit_null_byte_detection]);
10524       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10525             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10526       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10527             args[coding_arg_undecided_prefer_utf_8]);
10528       category = coding_category_undecided;
10529     }
10530   else
10531     error ("Invalid coding system type: %s",
10532            SDATA (SYMBOL_NAME (coding_type)));
10533
10534   ASET (attrs, coding_attr_category, make_number (category));
10535   ASET (attrs, coding_attr_plist,
10536         Fcons (QCcategory,
10537                Fcons (AREF (Vcoding_category_table, category),
10538                       CODING_ATTR_PLIST (attrs))));
10539   ASET (attrs, coding_attr_plist,
10540         Fcons (QCascii_compatible_p,
10541                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10542                       CODING_ATTR_PLIST (attrs))));
10543
10544   eol_type = args[coding_arg_eol_type];
10545   if (! NILP (eol_type)
10546       && ! EQ (eol_type, Qunix)
10547       && ! EQ (eol_type, Qdos)
10548       && ! EQ (eol_type, Qmac))
10549     error ("Invalid eol-type");
10550
10551   aliases = list1 (name);
10552
10553   if (NILP (eol_type))
10554     {
10555       eol_type = make_subsidiaries (name);
10556       for (i = 0; i < 3; i++)
10557         {
10558           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10559
10560           this_name = AREF (eol_type, i);
10561           this_aliases = list1 (this_name);
10562           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10563           this_spec = make_uninit_vector (3);
10564           ASET (this_spec, 0, attrs);
10565           ASET (this_spec, 1, this_aliases);
10566           ASET (this_spec, 2, this_eol_type);
10567           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10568           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10569           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist, Qnil);
10570           if (NILP (val))
10571             Vcoding_system_alist
10572               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10573                        Vcoding_system_alist);
10574         }
10575     }
10576
10577   spec_vec = make_uninit_vector (3);
10578   ASET (spec_vec, 0, attrs);
10579   ASET (spec_vec, 1, aliases);
10580   ASET (spec_vec, 2, eol_type);
10581
10582   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10583   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10584   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist, Qnil);
10585   if (NILP (val))
10586     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10587                                   Vcoding_system_alist);
10588
10589   {
10590     int id = coding_categories[category].id;
10591
10592     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10593       setup_coding_system (name, &coding_categories[category]);
10594   }
10595
10596   return Qnil;
10597
10598  short_args:
10599   Fsignal (Qwrong_number_of_arguments,
10600            Fcons (intern ("define-coding-system-internal"),
10601                   make_number (nargs)));
10602 }
10603
10604
10605 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10606        3, 3, 0,
10607        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10608   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10609 {
10610   Lisp_Object spec, attrs;
10611
10612   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10613   attrs = AREF (spec, 0);
10614   if (EQ (prop, QCmnemonic))
10615     {
10616       if (! STRINGP (val))
10617         CHECK_CHARACTER (val);
10618       ASET (attrs, coding_attr_mnemonic, val);
10619     }
10620   else if (EQ (prop, QCdefault_char))
10621     {
10622       if (NILP (val))
10623         val = make_number (' ');
10624       else
10625         CHECK_CHARACTER (val);
10626       ASET (attrs, coding_attr_default_char, val);
10627     }
10628   else if (EQ (prop, QCdecode_translation_table))
10629     {
10630       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10631         CHECK_SYMBOL (val);
10632       ASET (attrs, coding_attr_decode_tbl, val);
10633     }
10634   else if (EQ (prop, QCencode_translation_table))
10635     {
10636       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10637         CHECK_SYMBOL (val);
10638       ASET (attrs, coding_attr_encode_tbl, val);
10639     }
10640   else if (EQ (prop, QCpost_read_conversion))
10641     {
10642       CHECK_SYMBOL (val);
10643       ASET (attrs, coding_attr_post_read, val);
10644     }
10645   else if (EQ (prop, QCpre_write_conversion))
10646     {
10647       CHECK_SYMBOL (val);
10648       ASET (attrs, coding_attr_pre_write, val);
10649     }
10650   else if (EQ (prop, QCascii_compatible_p))
10651     {
10652       ASET (attrs, coding_attr_ascii_compat, val);
10653     }
10654
10655   ASET (attrs, coding_attr_plist,
10656         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10657   return val;
10658 }
10659
10660
10661 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10662        Sdefine_coding_system_alias, 2, 2, 0,
10663        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10664   (Lisp_Object alias, Lisp_Object coding_system)
10665 {
10666   Lisp_Object spec, aliases, eol_type, val;
10667
10668   CHECK_SYMBOL (alias);
10669   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10670   aliases = AREF (spec, 1);
10671   /* ALIASES should be a list of length more than zero, and the first
10672      element is a base coding system.  Append ALIAS at the tail of the
10673      list.  */
10674   while (!NILP (XCDR (aliases)))
10675     aliases = XCDR (aliases);
10676   XSETCDR (aliases, list1 (alias));
10677
10678   eol_type = AREF (spec, 2);
10679   if (VECTORP (eol_type))
10680     {
10681       Lisp_Object subsidiaries;
10682       int i;
10683
10684       subsidiaries = make_subsidiaries (alias);
10685       for (i = 0; i < 3; i++)
10686         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10687                                      AREF (eol_type, i));
10688     }
10689
10690   Fputhash (alias, spec, Vcoding_system_hash_table);
10691   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10692   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist, Qnil);
10693   if (NILP (val))
10694     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10695                                   Vcoding_system_alist);
10696
10697   return Qnil;
10698 }
10699
10700 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10701        1, 1, 0,
10702        doc: /* Return the base of CODING-SYSTEM.
10703 Any alias or subsidiary coding system is not a base coding system.  */)
10704   (Lisp_Object coding_system)
10705 {
10706   Lisp_Object spec, attrs;
10707
10708   if (NILP (coding_system))
10709     return (Qno_conversion);
10710   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10711   attrs = AREF (spec, 0);
10712   return CODING_ATTR_BASE_NAME (attrs);
10713 }
10714
10715 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10716        1, 1, 0,
10717        doc: /* Return the property list of CODING-SYSTEM.  */)
10718   (Lisp_Object coding_system)
10719 {
10720   Lisp_Object spec, attrs;
10721
10722   if (NILP (coding_system))
10723     coding_system = Qno_conversion;
10724   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10725   attrs = AREF (spec, 0);
10726   return CODING_ATTR_PLIST (attrs);
10727 }
10728
10729
10730 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10731        1, 1, 0,
10732        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10733   (Lisp_Object coding_system)
10734 {
10735   Lisp_Object spec;
10736
10737   if (NILP (coding_system))
10738     coding_system = Qno_conversion;
10739   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10740   return AREF (spec, 1);
10741 }
10742
10743 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10744        Scoding_system_eol_type, 1, 1, 0,
10745        doc: /* Return eol-type of CODING-SYSTEM.
10746 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10747
10748 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10749 and CR respectively.
10750
10751 A vector value indicates that a format of end-of-line should be
10752 detected automatically.  Nth element of the vector is the subsidiary
10753 coding system whose eol-type is N.  */)
10754   (Lisp_Object coding_system)
10755 {
10756   Lisp_Object spec, eol_type;
10757   int n;
10758
10759   if (NILP (coding_system))
10760     coding_system = Qno_conversion;
10761   if (! CODING_SYSTEM_P (coding_system))
10762     return Qnil;
10763   spec = CODING_SYSTEM_SPEC (coding_system);
10764   eol_type = AREF (spec, 2);
10765   if (VECTORP (eol_type))
10766     return Fcopy_sequence (eol_type);
10767   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10768   return make_number (n);
10769 }
10770
10771 #endif /* emacs */
10772
10773 \f
/*** 12. Postamble ***/
10775
10776 void
10777 init_coding_once (void)
10778 {
10779   int i;
10780
10781   for (i = 0; i < coding_category_max; i++)
10782     {
10783       coding_categories[i].id = -1;
10784       coding_priorities[i] = i;
10785     }
10786
10787   /* ISO2022 specific initialize routine.  */
10788   for (i = 0; i < 0x20; i++)
10789     iso_code_class[i] = ISO_control_0;
10790   for (i = 0x21; i < 0x7F; i++)
10791     iso_code_class[i] = ISO_graphic_plane_0;
10792   for (i = 0x80; i < 0xA0; i++)
10793     iso_code_class[i] = ISO_control_1;
10794   for (i = 0xA1; i < 0xFF; i++)
10795     iso_code_class[i] = ISO_graphic_plane_1;
10796   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10797   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10798   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10799   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10800   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10801   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10802   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10803   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10804   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10805
10806   for (i = 0; i < 256; i++)
10807     {
10808       emacs_mule_bytes[i] = 1;
10809     }
10810   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10811   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10812   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10813   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10814 }
10815
10816 #ifdef emacs
10817
10818 void
10819 syms_of_coding (void)
10820 {
10821   staticpro (&Vcoding_system_hash_table);
10822   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10823
10824   staticpro (&Vsjis_coding_system);
10825   Vsjis_coding_system = Qnil;
10826
10827   staticpro (&Vbig5_coding_system);
10828   Vbig5_coding_system = Qnil;
10829
10830   staticpro (&Vcode_conversion_reused_workbuf);
10831   Vcode_conversion_reused_workbuf = Qnil;
10832
10833   staticpro (&Vcode_conversion_workbuf_name);
10834   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10835
10836   reused_workbuf_in_use = 0;
10837
10838   DEFSYM (Qcharset, "charset");
10839   DEFSYM (Qtarget_idx, "target-idx");
10840   DEFSYM (Qcoding_system_history, "coding-system-history");
10841   Fset (Qcoding_system_history, Qnil);
10842
10843   /* Target FILENAME is the first argument.  */
10844   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10845   /* Target FILENAME is the third argument.  */
10846   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10847
10848   DEFSYM (Qcall_process, "call-process");
10849   /* Target PROGRAM is the first argument.  */
10850   Fput (Qcall_process, Qtarget_idx, make_number (0));
10851
10852   DEFSYM (Qcall_process_region, "call-process-region");
10853   /* Target PROGRAM is the third argument.  */
10854   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10855
10856   DEFSYM (Qstart_process, "start-process");
10857   /* Target PROGRAM is the third argument.  */
10858   Fput (Qstart_process, Qtarget_idx, make_number (2));
10859
10860   DEFSYM (Qopen_network_stream, "open-network-stream");
10861   /* Target SERVICE is the fourth argument.  */
10862   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10863
10864   DEFSYM (Qunix, "unix");
10865   DEFSYM (Qdos, "dos");
10866   DEFSYM (Qmac, "mac");
10867
10868   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10869   DEFSYM (Qundecided, "undecided");
10870   DEFSYM (Qno_conversion, "no-conversion");
10871   DEFSYM (Qraw_text, "raw-text");
10872
10873   DEFSYM (Qiso_2022, "iso-2022");
10874
10875   DEFSYM (Qutf_8, "utf-8");
10876   DEFSYM (Qutf_8_unix, "utf-8-unix");
10877   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10878
10879 #if defined (WINDOWSNT) || defined (CYGWIN)
10880   /* No, not utf-16-le: that one has a BOM.  */
10881   DEFSYM (Qutf_16le, "utf-16le");
10882 #endif
10883
10884   DEFSYM (Qutf_16, "utf-16");
10885   DEFSYM (Qbig, "big");
10886   DEFSYM (Qlittle, "little");
10887
10888   DEFSYM (Qshift_jis, "shift-jis");
10889   DEFSYM (Qbig5, "big5");
10890
10891   DEFSYM (Qcoding_system_p, "coding-system-p");
10892
10893   /* Error signaled when there's a problem with detecting a coding system.  */
10894   DEFSYM (Qcoding_system_error, "coding-system-error");
10895   Fput (Qcoding_system_error, Qerror_conditions,
10896         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10897   Fput (Qcoding_system_error, Qerror_message,
10898         build_pure_c_string ("Invalid coding system"));
10899
10900   DEFSYM (Qtranslation_table, "translation-table");
10901   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10902   DEFSYM (Qtranslation_table_id, "translation-table-id");
10903
10904   /* Coding system emacs-mule and raw-text are for converting only
10905      end-of-line format.  */
10906   DEFSYM (Qemacs_mule, "emacs-mule");
10907
10908   DEFSYM (QCcategory, ":category");
10909   DEFSYM (QCmnemonic, ":mnemonic");
10910   DEFSYM (QCdefault_char, ":default-char");
10911   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10912   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10913   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10914   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10915   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10916
10917   Vcoding_category_table
10918     = Fmake_vector (make_number (coding_category_max), Qnil);
10919   staticpro (&Vcoding_category_table);
10920   /* Followings are target of code detection.  */
10921   ASET (Vcoding_category_table, coding_category_iso_7,
10922         intern_c_string ("coding-category-iso-7"));
10923   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10924         intern_c_string ("coding-category-iso-7-tight"));
10925   ASET (Vcoding_category_table, coding_category_iso_8_1,
10926         intern_c_string ("coding-category-iso-8-1"));
10927   ASET (Vcoding_category_table, coding_category_iso_8_2,
10928         intern_c_string ("coding-category-iso-8-2"));
10929   ASET (Vcoding_category_table, coding_category_iso_7_else,
10930         intern_c_string ("coding-category-iso-7-else"));
10931   ASET (Vcoding_category_table, coding_category_iso_8_else,
10932         intern_c_string ("coding-category-iso-8-else"));
10933   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10934         intern_c_string ("coding-category-utf-8-auto"));
10935   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10936         intern_c_string ("coding-category-utf-8"));
10937   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10938         intern_c_string ("coding-category-utf-8-sig"));
10939   ASET (Vcoding_category_table, coding_category_utf_16_be,
10940         intern_c_string ("coding-category-utf-16-be"));
10941   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10942         intern_c_string ("coding-category-utf-16-auto"));
10943   ASET (Vcoding_category_table, coding_category_utf_16_le,
10944         intern_c_string ("coding-category-utf-16-le"));
10945   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10946         intern_c_string ("coding-category-utf-16-be-nosig"));
10947   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10948         intern_c_string ("coding-category-utf-16-le-nosig"));
10949   ASET (Vcoding_category_table, coding_category_charset,
10950         intern_c_string ("coding-category-charset"));
10951   ASET (Vcoding_category_table, coding_category_sjis,
10952         intern_c_string ("coding-category-sjis"));
10953   ASET (Vcoding_category_table, coding_category_big5,
10954         intern_c_string ("coding-category-big5"));
10955   ASET (Vcoding_category_table, coding_category_ccl,
10956         intern_c_string ("coding-category-ccl"));
10957   ASET (Vcoding_category_table, coding_category_emacs_mule,
10958         intern_c_string ("coding-category-emacs-mule"));
10959   /* Followings are NOT target of code detection.  */
10960   ASET (Vcoding_category_table, coding_category_raw_text,
10961         intern_c_string ("coding-category-raw-text"));
10962   ASET (Vcoding_category_table, coding_category_undecided,
10963         intern_c_string ("coding-category-undecided"));
10964
10965   DEFSYM (Qinsufficient_source, "insufficient-source");
10966   DEFSYM (Qinvalid_source, "invalid-source");
10967   DEFSYM (Qinterrupted, "interrupted");
10968
10969   /* If a symbol has this property, evaluate the value to define the
10970      symbol as a coding system.  */
10971   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10972
10973   defsubr (&Scoding_system_p);
10974   defsubr (&Sread_coding_system);
10975   defsubr (&Sread_non_nil_coding_system);
10976   defsubr (&Scheck_coding_system);
10977   defsubr (&Sdetect_coding_region);
10978   defsubr (&Sdetect_coding_string);
10979   defsubr (&Sfind_coding_systems_region_internal);
10980   defsubr (&Sunencodable_char_position);
10981   defsubr (&Scheck_coding_systems_region);
10982   defsubr (&Sdecode_coding_region);
10983   defsubr (&Sencode_coding_region);
10984   defsubr (&Sdecode_coding_string);
10985   defsubr (&Sencode_coding_string);
10986   defsubr (&Sdecode_sjis_char);
10987   defsubr (&Sencode_sjis_char);
10988   defsubr (&Sdecode_big5_char);
10989   defsubr (&Sencode_big5_char);
10990   defsubr (&Sset_terminal_coding_system_internal);
10991   defsubr (&Sset_safe_terminal_coding_system_internal);
10992   defsubr (&Sterminal_coding_system);
10993   defsubr (&Sset_keyboard_coding_system_internal);
10994   defsubr (&Skeyboard_coding_system);
10995   defsubr (&Sfind_operation_coding_system);
10996   defsubr (&Sset_coding_system_priority);
10997   defsubr (&Sdefine_coding_system_internal);
10998   defsubr (&Sdefine_coding_system_alias);
10999   defsubr (&Scoding_system_put);
11000   defsubr (&Scoding_system_base);
11001   defsubr (&Scoding_system_plist);
11002   defsubr (&Scoding_system_aliases);
11003   defsubr (&Scoding_system_eol_type);
11004   defsubr (&Scoding_system_priority_list);
11005
11006   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11007                doc: /* List of coding systems.
11008
11009 Do not alter the value of this variable manually.  This variable should be
11010 updated by the functions `define-coding-system' and
11011 `define-coding-system-alias'.  */);
11012   Vcoding_system_list = Qnil;
11013
11014   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11015                doc: /* Alist of coding system names.
11016 Each element is one element list of coding system name.
11017 This variable is given to `completing-read' as COLLECTION argument.
11018
11019 Do not alter the value of this variable manually.  This variable should be
11020 updated by the functions `make-coding-system' and
11021 `define-coding-system-alias'.  */);
11022   Vcoding_system_alist = Qnil;
11023
11024   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11025                doc: /* List of coding-categories (symbols) ordered by priority.
11026
11027 On detecting a coding system, Emacs tries code detection algorithms
11028 associated with each coding-category one by one in this order.  When
11029 one algorithm agrees with a byte sequence of source text, the coding
11030 system bound to the corresponding coding-category is selected.
11031
11032 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11033   {
11034     int i;
11035
11036     Vcoding_category_list = Qnil;
11037     for (i = coding_category_max - 1; i >= 0; i--)
11038       Vcoding_category_list
11039         = Fcons (AREF (Vcoding_category_table, i),
11040                  Vcoding_category_list);
11041   }
11042
11043   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11044                doc: /* Specify the coding system for read operations.
11045 It is useful to bind this variable with `let', but do not set it globally.
11046 If the value is a coding system, it is used for decoding on read operation.
11047 If not, an appropriate element is used from one of the coding system alists.
11048 There are three such tables: `file-coding-system-alist',
11049 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11050   Vcoding_system_for_read = Qnil;
11051
11052   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11053                doc: /* Specify the coding system for write operations.
11054 Programs bind this variable with `let', but you should not set it globally.
11055 If the value is a coding system, it is used for encoding of output,
11056 when writing it to a file and when sending it to a file or subprocess.
11057
11058 If this does not specify a coding system, an appropriate element
11059 is used from one of the coding system alists.
11060 There are three such tables: `file-coding-system-alist',
11061 `process-coding-system-alist', and `network-coding-system-alist'.
11062 For output to files, if the above procedure does not specify a coding system,
11063 the value of `buffer-file-coding-system' is used.  */);
11064   Vcoding_system_for_write = Qnil;
11065
11066   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11067                doc: /*
11068 Coding system used in the latest file or process I/O.  */);
11069   Vlast_coding_system_used = Qnil;
11070
11071   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11072                doc: /*
11073 Error status of the last code conversion.
11074
11075 When an error was detected in the last code conversion, this variable
11076 is set to one of the following symbols.
11077   `insufficient-source'
11078   `inconsistent-eol'
11079   `invalid-source'
11080   `interrupted'
11081   `insufficient-memory'
11082 When no error was detected, the value doesn't change.  So, to check
11083 the error status of a code conversion by this variable, you must
11084 explicitly set this variable to nil before performing code
11085 conversion.  */);
11086   Vlast_code_conversion_error = Qnil;
11087
11088   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11089                doc: /*
11090 Non-nil means always inhibit code conversion of end-of-line format.
11091 See info node `Coding Systems' and info node `Text and Binary' concerning
11092 such conversion.  */);
11093   inhibit_eol_conversion = 0;
11094
11095   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11096                doc: /*
11097 Non-nil means process buffer inherits coding system of process output.
11098 Bind it to t if the process output is to be treated as if it were a file
11099 read from some filesystem.  */);
11100   inherit_process_coding_system = 0;
11101
11102   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11103                doc: /*
11104 Alist to decide a coding system to use for a file I/O operation.
11105 The format is ((PATTERN . VAL) ...),
11106 where PATTERN is a regular expression matching a file name,
11107 VAL is a coding system, a cons of coding systems, or a function symbol.
11108 If VAL is a coding system, it is used for both decoding and encoding
11109 the file contents.
11110 If VAL is a cons of coding systems, the car part is used for decoding,
11111 and the cdr part is used for encoding.
11112 If VAL is a function symbol, the function must return a coding system
11113 or a cons of coding systems which are used as above.  The function is
11114 called with an argument that is a list of the arguments with which
11115 `find-operation-coding-system' was called.  If the function can't decide
11116 a coding system, it can return `undecided' so that the normal
11117 code-detection is performed.
11118
11119 See also the function `find-operation-coding-system'
11120 and the variable `auto-coding-alist'.  */);
11121   Vfile_coding_system_alist = Qnil;
11122
11123   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11124                doc: /*
11125 Alist to decide a coding system to use for a process I/O operation.
11126 The format is ((PATTERN . VAL) ...),
11127 where PATTERN is a regular expression matching a program name,
11128 VAL is a coding system, a cons of coding systems, or a function symbol.
11129 If VAL is a coding system, it is used for both decoding what received
11130 from the program and encoding what sent to the program.
11131 If VAL is a cons of coding systems, the car part is used for decoding,
11132 and the cdr part is used for encoding.
11133 If VAL is a function symbol, the function must return a coding system
11134 or a cons of coding systems which are used as above.
11135
11136 See also the function `find-operation-coding-system'.  */);
11137   Vprocess_coding_system_alist = Qnil;
11138
11139   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11140                doc: /*
11141 Alist to decide a coding system to use for a network I/O operation.
11142 The format is ((PATTERN . VAL) ...),
11143 where PATTERN is a regular expression matching a network service name
11144 or is a port number to connect to,
11145 VAL is a coding system, a cons of coding systems, or a function symbol.
11146 If VAL is a coding system, it is used for both decoding what received
11147 from the network stream and encoding what sent to the network stream.
11148 If VAL is a cons of coding systems, the car part is used for decoding,
11149 and the cdr part is used for encoding.
11150 If VAL is a function symbol, the function must return a coding system
11151 or a cons of coding systems which are used as above.
11152
11153 See also the function `find-operation-coding-system'.  */);
11154   Vnetwork_coding_system_alist = Qnil;
11155
11156   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11157                doc: /* Coding system to use with system messages.
11158 Also used for decoding keyboard input on X Window system, and for
11159 encoding standard output and error streams.  */);
11160   Vlocale_coding_system = Qnil;
11161
11162   /* The eol mnemonics are reset in startup.el system-dependently.  */
11163   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11164                doc: /*
11165 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11166   eol_mnemonic_unix = build_pure_c_string (":");
11167
11168   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11169                doc: /*
11170 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11171   eol_mnemonic_dos = build_pure_c_string ("\\");
11172
11173   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11174                doc: /*
11175 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11176   eol_mnemonic_mac = build_pure_c_string ("/");
11177
11178   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11179                doc: /*
11180 String displayed in mode line when end-of-line format is not yet determined.  */);
11181   eol_mnemonic_undecided = build_pure_c_string (":");
11182
11183   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11184                doc: /*
11185 Non-nil enables character translation while encoding and decoding.  */);
11186   Venable_character_translation = Qt;
11187
11188   DEFVAR_LISP ("standard-translation-table-for-decode",
11189                Vstandard_translation_table_for_decode,
11190                doc: /* Table for translating characters while decoding.  */);
11191   Vstandard_translation_table_for_decode = Qnil;
11192
11193   DEFVAR_LISP ("standard-translation-table-for-encode",
11194                Vstandard_translation_table_for_encode,
11195                doc: /* Table for translating characters while encoding.  */);
11196   Vstandard_translation_table_for_encode = Qnil;
11197
11198   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11199                doc: /* Alist of charsets vs revision numbers.
11200 While encoding, if a charset (car part of an element) is found,
11201 designate it with the escape sequence identifying revision (cdr part
11202 of the element).  */);
11203   Vcharset_revision_table = Qnil;
11204
11205   DEFVAR_LISP ("default-process-coding-system",
11206                Vdefault_process_coding_system,
11207                doc: /* Cons of coding systems used for process I/O by default.
11208 The car part is used for decoding a process output,
11209 the cdr part is used for encoding a text to be sent to a process.  */);
11210   Vdefault_process_coding_system = Qnil;
11211
11212   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11213                doc: /*
11214 Table of extra Latin codes in the range 128..159 (inclusive).
11215 This is a vector of length 256.
11216 If the Nth element is non-nil, the existence of code N in a file
11217 \(or output of a subprocess) doesn't prevent it from being detected as
11218 a coding system of an ISO 2022 variant which has the flag
11219 `accept-latin-extra-code' set to t (e.g. iso-latin-1) on reading a file
11220 or reading output of a subprocess.
11221 Only 128th through 159th elements have a meaning.  */);
11222   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11223
11224   DEFVAR_LISP ("select-safe-coding-system-function",
11225                Vselect_safe_coding_system_function,
11226                doc: /*
11227 Function to call to select safe coding system for encoding a text.
11228
11229 If set, this function is called to force a user to select a proper
11230 coding system which can encode the text in the case that a default
11231 coding system used in each operation can't encode the text.  The
11232 function should take care that the buffer is not modified while
11233 the coding system is being selected.
11234
11235 The default value is `select-safe-coding-system' (which see).  */);
11236   Vselect_safe_coding_system_function = Qnil;
11237
11238   DEFVAR_BOOL ("coding-system-require-warning",
11239                coding_system_require_warning,
11240                doc: /* Internal use only.
11241 If non-nil, on writing a file, `select-safe-coding-system-function' is
11242 called even if `coding-system-for-write' is non-nil.  The command
11243 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11244   coding_system_require_warning = 0;
11245
11246
11247   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11248                inhibit_iso_escape_detection,
11249                doc: /*
11250 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11251
11252 When Emacs reads text, it tries to detect how the text is encoded.
11253 This code detection is sensitive to escape sequences.  If Emacs sees
11254 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11255 of the ISO2022 encodings, and decodes text by the corresponding coding
11256 system (e.g. `iso-2022-7bit').
11257
11258 However, there may be cases where you want to read escape sequences in
11259 a file as is.  In such a case, you can set this variable to non-nil.
11260 Then the code detection will ignore any escape sequences, and no text is
11261 detected as encoded in some ISO-2022 encoding.  The result is that all
11262 escape sequences become visible in a buffer.
11263
11264 The default value is nil, and it is strongly recommended not to change
11265 it.  That is because many Emacs Lisp source files that contain
11266 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11267 in Emacs's distribution, and they won't be decoded correctly on
11268 reading if you suppress escape sequence detection.
11269
11270 The other way to read escape sequences in a file without decoding is
11271 to explicitly specify some coding system that doesn't use ISO-2022
11272 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11273   inhibit_iso_escape_detection = 0;
11274
11275   DEFVAR_BOOL ("inhibit-null-byte-detection",
11276                inhibit_null_byte_detection,
11277                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11278 By default, Emacs treats text containing null bytes as binary data, and
11279 does not attempt to decode it.  The effect is as if you specified
11280 `no-conversion' for reading that text.
11281
11282 Set this to non-nil when a regular text happens to include null bytes.
11283 Examples are Index nodes of Info files and null-byte delimited output
11284 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11285 decode text as usual.  */);
11286   inhibit_null_byte_detection = 0;
11287
11288   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11289                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11290 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11291   disable_ascii_optimization = 0;
11292
11293   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11294                doc: /* Char table for translating self-inserting characters.
11295 This is applied to the result of input methods, not their input.
11296 See also `keyboard-translate-table'.
11297
11298 Use of this variable for character code unification was rendered
11299 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11300 internal character representation.  */);
11301   Vtranslation_table_for_input = Qnil;
11302
11303   Lisp_Object args[coding_arg_undecided_max];
11304   memclear (args, sizeof args);
11305
11306   Lisp_Object plist[] =
11307     {
11308       QCname,
11309       args[coding_arg_name] = Qno_conversion,
11310       QCmnemonic,
11311       args[coding_arg_mnemonic] = make_number ('='),
11312       intern_c_string (":coding-type"),
11313       args[coding_arg_coding_type] = Qraw_text,
11314       QCascii_compatible_p,
11315       args[coding_arg_ascii_compatible_p] = Qt,
11316       QCdefault_char,
11317       args[coding_arg_default_char] = make_number (0),
11318       intern_c_string (":for-unibyte"),
11319       args[coding_arg_for_unibyte] = Qt,
11320       intern_c_string (":docstring"),
11321       (build_pure_c_string
11322        ("Do no conversion.\n"
11323         "\n"
11324         "When you visit a file with this coding, the file is read into a\n"
11325         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11326         "character.")),
11327       intern_c_string (":eol-type"),
11328       args[coding_arg_eol_type] = Qunix,
11329     };
11330   args[coding_arg_plist] = CALLMANY (Flist, plist);
11331   Fdefine_coding_system_internal (coding_arg_max, args);
11332
11333   plist[1] = args[coding_arg_name] = Qundecided;
11334   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11335   plist[5] = args[coding_arg_coding_type] = Qundecided;
11336   /* This is already set.
11337      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11338   plist[8] = intern_c_string (":charset-list");
11339   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11340   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11341   plist[13] = build_pure_c_string ("No conversion on encoding, "
11342                                    "automatic conversion on decoding.");
11343   plist[15] = args[coding_arg_eol_type] = Qnil;
11344   args[coding_arg_plist] = CALLMANY (Flist, plist);
11345   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11346   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11347   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11348
11349   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11350
11351   for (int i = 0; i < coding_category_max; i++)
11352     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11353
11354 #if defined (DOS_NT)
11355   system_eol_type = Qdos;
11356 #else
11357   system_eol_type = Qunix;
11358 #endif
11359   staticpro (&system_eol_type);
11360 }
11361 #endif /* emacs */