src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2016 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  stored in the hash table Vcoding_system_hash_table.  The conversion
  from coding system symbol to attributes vector is done by looking up
  Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  DST_BYTES zero means that source area and destination area are
  overlapped, which means that we can produce an encoded text until it
  reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
/* Hash table of coding systems.
   NOTE(review): presumably this is the table that maps a coding
   system symbol to its vector of attributes, as outlined in the
   general comments of this file -- confirm against its users.  */
Lisp_Object Vcoding_system_hash_table;

/* Format of end-of-line decided by system.  This is Qunix on
   Unix and Mac, Qdos on DOS/Windows.
   This has an effect only for external encoding (i.e. for output to
   file and process), not for in-buffer or Lisp string encoding.  */
static Lisp_Object system_eol_type;

#ifdef emacs

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.
   NOTE(review): only one variable is defined in this conditional
   section; the count above looks stale.  */
/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

#endif /* emacs */

/* Two special coding systems.  */
static Lisp_Object Vsjis_coding_system;
static Lisp_Object Vbig5_coding_system;
 323
/* ISO2022 section */

/* The integer stored at index REG of the `coding_attr_iso_initial'
   element of the attribute vector of CODING.  */
#define CODING_ISO_INITIAL(coding, reg)                 \
  (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
                     coding_attr_iso_initial),          \
               reg)))


/* The value of CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID
   is greater than CODING->max_charset_id or the stored value is 255.
   CHARSET_ID is evaluated more than once.  */
#define CODING_ISO_REQUEST(coding, charset_id)          \
  (((charset_id) <= (coding)->max_charset_id            \
    ? ((coding)->safe_charsets[charset_id] != 255       \
       ? (coding)->safe_charsets[charset_id]            \
       : -1)                                            \
    : -1))


/* Accessors for the members of CODING->spec.iso_2022.  All of them
   except CODING_ISO_INVOKED_CHARSET and CODING_ISO_CMP_STATUS expand
   to the member itself, so they can also be assigned to.  */
#define CODING_ISO_FLAGS(coding)        \
  ((coding)->spec.iso_2022.flags)
#define CODING_ISO_DESIGNATION(coding, reg)     \
  ((coding)->spec.iso_2022.current_designation[reg])
#define CODING_ISO_INVOCATION(coding, plane)    \
  ((coding)->spec.iso_2022.current_invocation[plane])
#define CODING_ISO_SINGLE_SHIFTING(coding)      \
  ((coding)->spec.iso_2022.single_shifting)
#define CODING_ISO_BOL(coding)  \
  ((coding)->spec.iso_2022.bol)
/* -1 if the invocation recorded for PLANE is negative; otherwise the
   designation of the register currently invoked to PLANE.  */
#define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
   : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
/* Address of the cmp_status member (a pointer, not the member).  */
#define CODING_ISO_CMP_STATUS(coding)   \
  (&(coding)->spec.iso_2022.cmp_status)
#define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  ((coding)->spec.iso_2022.ctext_extended_segment_len)
#define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  ((coding)->spec.iso_2022.embedded_utf_8)
 359
/* Control characters of ISO2022.  */
                        /* code */      /* function */
#define ISO_CODE_SO     0x0E            /* shift-out */
#define ISO_CODE_SI     0x0F            /* shift-in */
#define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
#define ISO_CODE_ESC    0x1B            /* escape */
#define ISO_CODE_SS2    0x8E            /* single-shift-2 */
#define ISO_CODE_SS3    0x8F            /* single-shift-3 */
#define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */

/* Every 1-byte code of ISO2022 is classified into one of the
   following classes.  */
enum iso_code_class_type
  {
    ISO_control_0,              /* Control codes in the range
                                   0x00..0x1F and 0x7F, except for the
                                   following 4 codes.
                                   NOTE(review): 0x7F is also named by
                                   ISO_0x20_or_0x7F below; confirm
                                   which class it really gets.  */
    ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
    ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
    ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
    ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
    ISO_control_1,              /* Control codes in the range
                                   0x80..0x9F, except for the
                                   following 3 codes.  */
    ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
    ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
    ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
    ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
    ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
    ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
    ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  };
 392
/** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
    `iso-flags' attribute of an iso2022 coding system.  The values are
    distinct single bits, so they can be combined with `|' and tested
    with `&'.  */

/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
   instead of the correct short-form sequence (e.g. ESC $ A).  */
#define CODING_ISO_FLAG_LONG_FORM       0x0001

/* If set, reset graphic planes and registers at end-of-line to the
   initial state.  */
#define CODING_ISO_FLAG_RESET_AT_EOL    0x0002

/* If set, reset graphic planes and registers before any control
   characters to the initial state.  */
#define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004

/* If set, encode by 7-bit environment.  */
#define CODING_ISO_FLAG_SEVEN_BITS      0x0008

/* If set, use locking-shift function.  */
#define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010

/* If set, use single-shift function.  Overwrite
   CODING_ISO_FLAG_LOCKING_SHIFT.  */
#define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020

/* If set, use designation escape sequence.  */
#define CODING_ISO_FLAG_DESIGNATION     0x0040

/* If set, produce revision number sequence.  */
#define CODING_ISO_FLAG_REVISION        0x0080

/* If set, produce ISO6429's direction specifying sequence.  */
#define CODING_ISO_FLAG_DIRECTION       0x0100

/* If set, assume designation states are reset at beginning of line on
   output.  */
#define CODING_ISO_FLAG_INIT_AT_BOL     0x0200

/* If set, designation sequence should be placed at beginning of line
   on output.  */
#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400

/* If set, do not encode unsafe characters on output.  */
#define CODING_ISO_FLAG_SAFE            0x0800

/* If set, extra latin codes (128..159) are accepted as a valid code
   on input.  */
#define CODING_ISO_FLAG_LATIN_EXTRA     0x1000

/* NOTE(review): the flags from here on carry no description at their
   definition; take their meaning from the code that tests them.  */
#define CODING_ISO_FLAG_COMPOSITION     0x2000

/* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */

#define CODING_ISO_FLAG_USE_ROMAN       0x8000

#define CODING_ISO_FLAG_USE_OLDJIS      0x10000

#define CODING_ISO_FLAG_LEVEL_4         0x20000

#define CODING_ISO_FLAG_FULL_SUPPORT    0x100000

/* A character to be produced on output if encoding of the original
   character is prohibited by CODING_ISO_FLAG_SAFE.  */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
/* UTF-8 section */
/* The utf_8_bom member of CODING->spec.  */
#define CODING_UTF_8_BOM(coding)        \
  ((coding)->spec.utf_8_bom)

/* UTF-16 section */
/* Accessors for the bom, endian and surrogate members of
   CODING->spec.utf_16.  */
#define CODING_UTF_16_BOM(coding)       \
  ((coding)->spec.utf_16.bom)

#define CODING_UTF_16_ENDIAN(coding)    \
  ((coding)->spec.utf_16.endian)

#define CODING_UTF_16_SURROGATE(coding) \
  ((coding)->spec.utf_16.surrogate)


/* CCL section */
/* The `coding_attr_ccl_decoder', `coding_attr_ccl_encoder' and
   `coding_attr_ccl_valids' elements of the attribute vector of
   CODING.  CODING_CCL_VALIDS yields the SDATA of its element rather
   than the element itself.  Note that the first two expansions are
   not wrapped in parentheses.  */
#define CODING_CCL_DECODER(coding)      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
#define CODING_CCL_ENCODER(coding)      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
#define CODING_CCL_VALIDS(coding)                                          \
  (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
/* Index for each coding category in `coding_categories'.  The same
   values are used as bit positions by the CATEGORY_MASK_XXX macros,
   so the order of the enumerators matters.  */

enum coding_category
  {
    coding_category_iso_7,
    coding_category_iso_7_tight,
    coding_category_iso_8_1,
    coding_category_iso_8_2,
    coding_category_iso_7_else,
    coding_category_iso_8_else,
    coding_category_utf_8_auto,
    coding_category_utf_8_nosig,
    coding_category_utf_8_sig,
    coding_category_utf_16_auto,
    coding_category_utf_16_be,
    coding_category_utf_16_le,
    coding_category_utf_16_be_nosig,
    coding_category_utf_16_le_nosig,
    coding_category_charset,
    coding_category_sjis,
    coding_category_big5,
    coding_category_ccl,
    coding_category_emacs_mule,
    /* All above are targets of code detection.  */
    coding_category_raw_text,
    coding_category_undecided,
    /* Number of categories; used as the size of the category tables.  */
    coding_category_max
  };
 509
/* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
   the single bit whose position is the corresponding enumerator of
   `enum coding_category'.  */
#define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
#define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
#define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
#define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
#define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
#define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
#define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
#define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
#define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
#define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
#define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
#define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
#define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
#define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
#define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
#define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)

/* This value is returned if detect_coding_mask () finds nothing other
   than ASCII characters.  It is the union of the masks of all
   categories that are targets of code detection, i.e. every mask
   above except CATEGORY_MASK_RAW_TEXT.  */
#define CATEGORY_MASK_ANY               \
  (CATEGORY_MASK_ISO_7                  \
   | CATEGORY_MASK_ISO_7_TIGHT          \
   | CATEGORY_MASK_ISO_8_1              \
   | CATEGORY_MASK_ISO_8_2              \
   | CATEGORY_MASK_ISO_7_ELSE           \
   | CATEGORY_MASK_ISO_8_ELSE           \
   | CATEGORY_MASK_UTF_8_AUTO           \
   | CATEGORY_MASK_UTF_8_NOSIG          \
   | CATEGORY_MASK_UTF_8_SIG            \
   | CATEGORY_MASK_UTF_16_AUTO          \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG      \
   | CATEGORY_MASK_CHARSET              \
   | CATEGORY_MASK_SJIS                 \
   | CATEGORY_MASK_BIG5                 \
   | CATEGORY_MASK_CCL                  \
   | CATEGORY_MASK_EMACS_MULE)


/* The two `iso_7' categories.  */
#define CATEGORY_MASK_ISO_7BIT \
  (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)

/* The two `iso_8' categories.  */
#define CATEGORY_MASK_ISO_8BIT \
  (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)

/* The two `iso_*_else' categories.  */
#define CATEGORY_MASK_ISO_ELSE \
  (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)

/* All ISO categories except iso_8_1 and iso_8_2.
   NOTE(review): presumably these are the categories that rely on
   escape sequences -- confirm against the detector.  */
#define CATEGORY_MASK_ISO_ESCAPE        \
  (CATEGORY_MASK_ISO_7                  \
   | CATEGORY_MASK_ISO_7_TIGHT          \
   | CATEGORY_MASK_ISO_7_ELSE           \
   | CATEGORY_MASK_ISO_8_ELSE)

/* All six ISO categories.  */
#define CATEGORY_MASK_ISO       \
  (  CATEGORY_MASK_ISO_7BIT     \
     | CATEGORY_MASK_ISO_8BIT   \
     | CATEGORY_MASK_ISO_ELSE)

/* All five UTF-16 categories.  */
#define CATEGORY_MASK_UTF_16            \
  (CATEGORY_MASK_UTF_16_AUTO            \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG)

/* All three UTF-8 categories.  */
#define CATEGORY_MASK_UTF_8     \
  (CATEGORY_MASK_UTF_8_AUTO     \
   | CATEGORY_MASK_UTF_8_NOSIG  \
   | CATEGORY_MASK_UTF_8_SIG)
 587
/* Table of coding categories (Lisp symbols).  This variable is for
   internal use only.  */
static Lisp_Object Vcoding_category_table;

/* Table of coding-categories ordered by priority.  It has one slot
   per coding category (coding_category_max in total).  */
static enum coding_category coding_priorities[coding_category_max];

/* Nth element is a coding context for the coding system bound to the
   Nth coding category, i.e. the array is indexed by `enum
   coding_category'.  */
static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return 0 < encoded_flag + var;
 614 }
 615
/* Store the attribute vector of CODING (looked up through its id) in
   ATTRS, then store the charset list taken from those attributes in
   CHARSET_LIST.  Both ATTRS and CHARSET_LIST must be assignable;
   ATTRS is evaluated twice.  */
#define CODING_GET_INFO(coding, attrs, charset_list)    \
  do {                                                  \
    (attrs) = CODING_ID_ATTRS ((coding)->id);           \
    (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  } while (0)
 621
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
 645
 646
/* Safely get one byte from the source text pointed by SRC which ends
   at SRC_END, and set C to that byte.  If there are not enough bytes
   in the source, it jumps to 'no_more_source'; before jumping,
   CODING_RESULT_INSUFFICIENT_SRC is recorded in CODING if SRC_BASE is
   before SRC.

   If MULTIBYTEP and the fetched byte has its high bit set, there are
   two cases.  When the byte is 0xC0 or 0xC1, it is combined with the
   next source byte into one value (the next byte is read without
   comparing SRC against SRC_END).  Otherwise a multibyte character is
   read at SRC with string_char, C is set to the negative value of the
   character code, and CODING_RESULT_INVALID_SRC is recorded in CODING.

   In every case that does not jump, CONSUMED_CHARS is incremented.

   The caller should declare and set these variables appropriately in
   advance:
        src, src_end, src_base, multibytep, consumed_chars, coding
   and must provide the label 'no_more_source'.  */

#define ONE_MORE_BYTE(c)                                \
  do {                                                  \
    if (src == src_end)                                 \
      {                                                 \
        if (src_base < src)                             \
          record_conversion_result                      \
            (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
        goto no_more_source;                            \
      }                                                 \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
 679
/* Safely get two bytes from the source text pointed by SRC which ends
   at SRC_END, and set C1 and C2 to those bytes while skipping the
   heading multibyte characters.  If there are not enough bytes in the
   source, it jumps to 'no_more_source'.

   For C1, when MULTIBYTEP and a multibyte character other than a
   0xC0/0xC1 two-byte sequence is found, SRC is advanced over that
   character (by BYTES_BY_CHAR_HEAD) and the fetch is retried.  For
   C2, such a character is not skipped: C2 is simply set to -1 and SRC
   is left just after its first byte.
   NOTE(review): an earlier version of this comment said C2 receives
   the negative value of the character code; the code stores -1.

   Unlike ONE_MORE_BYTE, this macro neither counts consumed characters
   nor records a conversion result.  The caller should declare and set
   these variables appropriately in advance:
        src, src_end, multibytep
   and must provide the label 'no_more_source'.
   It is intended that this macro is used in detect_coding_utf_16.  */

#define TWO_MORE_BYTES(c1, c2)                          \
  do {                                                  \
    do {                                                \
      if (src == src_end)                               \
        goto no_more_source;                            \
      c1 = *src++;                                      \
      if (multibytep && (c1 & 0x80))                    \
        {                                               \
          if ((c1 & 0xFE) == 0xC0)                      \
            c1 = ((c1 & 1) << 6) | *src++;              \
          else                                          \
            {                                           \
              src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
              c1 = -1;                                  \
            }                                           \
        }                                               \
    } while (c1 < 0);                                   \
    if (src == src_end)                                 \
      goto no_more_source;                              \
    c2 = *src++;                                        \
    if (multibytep && (c2 & 0x80))                      \
      {                                                 \
        if ((c2 & 0xFE) == 0xC0)                        \
          c2 = ((c2 & 1) << 6) | *src++;                \
        else                                            \
          c2 = -1;                                      \
      }                                                 \
  } while (0)
 718
 719
 720 /* Store a byte C in the place pointed by DST and increment DST to the
 721    next free point, and increment PRODUCED_CHARS.  The caller should
 722    assure that C is 0..127, and declare and set the variable `dst'
 723    appropriately in advance.
 724 */
 725
 726
 727 #define EMIT_ONE_ASCII_BYTE(c)  \
 728   do {                          \
 729     produced_chars++;           \
 730     *dst++ = (c);               \
 731   } while (0)
 732
 733
 734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 735
 736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 737   do {                                  \
 738     produced_chars += 2;                \
 739     *dst++ = (c1), *dst++ = (c2);       \
 740   } while (0)
 741
 742
 743 /* Store a byte C in the place pointed by DST and increment DST to the
 744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 745    store in an appropriate multibyte form.  The caller should
 746    declare and set the variables `dst' and `multibytep' appropriately
 747    in advance.  */
 748
 749 #define EMIT_ONE_BYTE(c)                \
 750   do {                                  \
 751     produced_chars++;                   \
 752     if (multibytep)                     \
 753       {                                 \
 754         unsigned ch = (c);              \
 755         if (ch >= 0x80)                 \
 756           ch = BYTE8_TO_CHAR (ch);      \
 757         CHAR_STRING_ADVANCE (ch, dst);  \
 758       }                                 \
 759     else                                \
 760       *dst++ = (c);                     \
 761   } while (0)
 762
 763
 764 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 765
 766 #define EMIT_TWO_BYTES(c1, c2)          \
 767   do {                                  \
 768     produced_chars += 2;                \
 769     if (multibytep)                     \
 770       {                                 \
 771         unsigned ch;                    \
 772                                         \
 773         ch = (c1);                      \
 774         if (ch >= 0x80)                 \
 775           ch = BYTE8_TO_CHAR (ch);      \
 776         CHAR_STRING_ADVANCE (ch, dst);  \
 777         ch = (c2);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781       }                                 \
 782     else                                \
 783       {                                 \
 784         *dst++ = (c1);                  \
 785         *dst++ = (c2);                  \
 786       }                                 \
 787   } while (0)
 788
 789
 790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 791   do {                                  \
 792     EMIT_ONE_BYTE (c1);                 \
 793     EMIT_TWO_BYTES (c2, c3);            \
 794   } while (0)
 795
 796
 797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 798   do {                                          \
 799     EMIT_TWO_BYTES (c1, c2);                    \
 800     EMIT_TWO_BYTES (c3, c4);                    \
 801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* If there are at least BYTES length of room at dst, allocate memory
 896    for coding->destination and update dst and dst_end.  We don't have
 897    to take care of coding->source which will be relocated.  It is
 898    handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* An annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  The longest
   header is the one written by ADD_COMPOSITION_DATA: the three
   elements stored by ADD_ANNOTATION_DATA plus NBYTES and METHOD.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the common annotation header [ -LEN MASK NCHARS ] at BUF,
   advance BUF past those three elements, and set coding->annotated.
   The caller must have a variable `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that the
   macro can be used as a single statement, e.g. as the body of an
   `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1102
/* Append to BUF the header of a composition annotation covering
   NCHARS characters: the three common elements written by
   ADD_ANNOTATION_DATA (with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK) followed by NBYTES and METHOD.
   BUF is advanced past all five elements.  The arguments are
   parenthesized so that expressions such as `*pp' work for BUF.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1109
1110
/* Append to BUF a charset annotation covering NCHARS characters: the
   three common elements written by ADD_ANNOTATION_DATA (with length 4
   and CODING_ANNOTATE_CHARSET_MASK) followed by the charset ID.  BUF
   is advanced past all four elements.  The arguments are
   parenthesized so that expressions such as `*pp' work for BUF.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Bitmasks for coding->eol_seen.  Each bit records one end-of-line
   form: a lone LF, a lone CR, or a CR immediately followed by LF.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in UTF-8.  */

/* Classification of a single octet C by its high bits:
     0xxxxxxx  one-octet (ASCII) character
     10xxxxxx  extra (trailing) octet
     110xxxxx  leading octet of a 2-octet sequence
     1110xxxx  leading octet of a 3-octet sequence
     11110xxx  leading octet of a 4-octet sequence
     111110xx  leading octet of a 5-octet sequence  */

#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == ((c) & 0xC0))
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))

/* The three octets of the UTF-8 byte order mark, in order.  */

#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
1147 /* Unlike the other detect_coding_XXX, this function counts the number
1148    of characters and checks the EOL format.  */
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   ptrdiff_t nchars = coding->head_ascii;
1160   int eol_seen = coding->eol_seen;
1161
1162   detect_info->checked |= CATEGORY_MASK_UTF_8;
1163   /* A coding system of this category is always ASCII compatible.  */
1164   src += nchars;
1165
1166   if (src == coding->source     /* BOM should be at the head.  */
1167       && src + 3 < src_end      /* BOM is 3-byte long.  */
1168       && src[0] == UTF_8_BOM_1
1169       && src[1] == UTF_8_BOM_2
1170       && src[2] == UTF_8_BOM_3)
1171     {
1172       bom_found = 1;
1173       src += 3;
1174       nchars++;
1175     }
1176
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4;
1180
1181       src_base = src;
1182       ONE_MORE_BYTE (c);
1183       if (c < 0 || UTF_8_1_OCTET_P (c))
1184         {
1185           nchars++;
1186           if (c == '\r')
1187             {
1188               if (src < src_end && *src == '\n')
1189                 {
1190                   eol_seen |= EOL_SEEN_CRLF;
1191                   src++;
1192                   nchars++;
1193                 }
1194               else
1195                 eol_seen |= EOL_SEEN_CR;
1196             }
1197           else if (c == '\n')
1198             eol_seen |= EOL_SEEN_LF;
1199           continue;
1200         }
1201       ONE_MORE_BYTE (c1);
1202       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1203         break;
1204       if (UTF_8_2_OCTET_LEADING_P (c))
1205         {
1206           nchars++;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c2);
1210       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1211         break;
1212       if (UTF_8_3_OCTET_LEADING_P (c))
1213         {
1214           nchars++;
1215           continue;
1216         }
1217       ONE_MORE_BYTE (c3);
1218       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1219         break;
1220       if (UTF_8_4_OCTET_LEADING_P (c))
1221         {
1222           nchars++;
1223           continue;
1224         }
1225       ONE_MORE_BYTE (c4);
1226       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227         break;
1228       if (UTF_8_5_OCTET_LEADING_P (c))
1229         {
1230           nchars++;
1231           continue;
1232         }
1233       break;
1234     }
1235   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1236   return 0;
1237
1238  no_more_source:
1239   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1240     {
1241       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1242       return 0;
1243     }
1244   if (bom_found)
1245     {
1246       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1247       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1248     }
1249   else
1250     {
1251       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1252       if (nchars < src_end - coding->source)
1253         /* The found characters are less than source bytes, which
1254            means that we found a valid non-ASCII characters.  */
1255         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1256     }
1257   coding->detected_utf8_bytes = src_base - coding->source;
1258   coding->detected_utf8_chars = nchars;
1259   return 1;
1260 }
1261
1262
1263 static void
1264 decode_coding_utf_8 (struct coding_system *coding)
1265 {
1266   const unsigned char *src = coding->source + coding->consumed;
1267   const unsigned char *src_end = coding->source + coding->src_bytes;
1268   const unsigned char *src_base;
1269   int *charbuf = coding->charbuf + coding->charbuf_used;
1270   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1271   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1272   bool multibytep = coding->src_multibyte;
1273   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1274   bool eol_dos
1275     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1276   int byte_after_cr = -1;
1277
1278   if (bom != utf_without_bom)
1279     {
1280       int c1, c2, c3;
1281
1282       src_base = src;
1283       ONE_MORE_BYTE (c1);
1284       if (! UTF_8_3_OCTET_LEADING_P (c1))
1285         src = src_base;
1286       else
1287         {
1288           ONE_MORE_BYTE (c2);
1289           if (! UTF_8_EXTRA_OCTET_P (c2))
1290             src = src_base;
1291           else
1292             {
1293               ONE_MORE_BYTE (c3);
1294               if (! UTF_8_EXTRA_OCTET_P (c3))
1295                 src = src_base;
1296               else
1297                 {
1298                   if ((c1 != UTF_8_BOM_1)
1299                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1300                     src = src_base;
1301                   else
1302                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1303                 }
1304             }
1305         }
1306     }
1307   CODING_UTF_8_BOM (coding) = utf_without_bom;
1308
1309   while (1)
1310     {
1311       int c, c1, c2, c3, c4, c5;
1312
1313       src_base = src;
1314       consumed_chars_base = consumed_chars;
1315
1316       if (charbuf >= charbuf_end)
1317         {
1318           if (byte_after_cr >= 0)
1319             src_base--;
1320           break;
1321         }
1322
1323       /* In the simple case, rapidly handle ordinary characters */
1324       if (multibytep && ! eol_dos
1325           && charbuf < charbuf_end - 6 && src < src_end - 6)
1326         {
1327           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1328             {
1329               c1 = *src;
1330               if (c1 & 0x80)
1331                 break;
1332               src++;
1333               consumed_chars++;
1334               *charbuf++ = c1;
1335
1336               c1 = *src;
1337               if (c1 & 0x80)
1338                 break;
1339               src++;
1340               consumed_chars++;
1341               *charbuf++ = c1;
1342
1343               c1 = *src;
1344               if (c1 & 0x80)
1345                 break;
1346               src++;
1347               consumed_chars++;
1348               *charbuf++ = c1;
1349
1350               c1 = *src;
1351               if (c1 & 0x80)
1352                 break;
1353               src++;
1354               consumed_chars++;
1355               *charbuf++ = c1;
1356             }
1357           /* If we handled at least one character, restart the main loop.  */
1358           if (src != src_base)
1359             continue;
1360         }
1361
1362       if (byte_after_cr >= 0)
1363         c1 = byte_after_cr, byte_after_cr = -1;
1364       else
1365         ONE_MORE_BYTE (c1);
1366       if (c1 < 0)
1367         {
1368           c = - c1;
1369         }
1370       else if (UTF_8_1_OCTET_P (c1))
1371         {
1372           if (eol_dos && c1 == '\r')
1373             ONE_MORE_BYTE (byte_after_cr);
1374           c = c1;
1375         }
1376       else
1377         {
1378           ONE_MORE_BYTE (c2);
1379           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1380             goto invalid_code;
1381           if (UTF_8_2_OCTET_LEADING_P (c1))
1382             {
1383               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1384               /* Reject overlong sequences here and below.  Encoders
1385                  producing them are incorrect, they can be misleading,
1386                  and they mess up read/write invariance.  */
1387               if (c < 128)
1388                 goto invalid_code;
1389             }
1390           else
1391             {
1392               ONE_MORE_BYTE (c3);
1393               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1394                 goto invalid_code;
1395               if (UTF_8_3_OCTET_LEADING_P (c1))
1396                 {
1397                   c = (((c1 & 0xF) << 12)
1398                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1399                   if (c < 0x800
1400                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1401                     goto invalid_code;
1402                 }
1403               else
1404                 {
1405                   ONE_MORE_BYTE (c4);
1406                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1407                     goto invalid_code;
1408                   if (UTF_8_4_OCTET_LEADING_P (c1))
1409                     {
1410                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1411                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1412                     if (c < 0x10000)
1413                       goto invalid_code;
1414                     }
1415                   else
1416                     {
1417                       ONE_MORE_BYTE (c5);
1418                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1419                         goto invalid_code;
1420                       if (UTF_8_5_OCTET_LEADING_P (c1))
1421                         {
1422                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1423                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1424                                | (c5 & 0x3F));
1425                           if ((c > MAX_CHAR) || (c < 0x200000))
1426                             goto invalid_code;
1427                         }
1428                       else
1429                         goto invalid_code;
1430                     }
1431                 }
1432             }
1433         }
1434
1435       *charbuf++ = c;
1436       continue;
1437
1438     invalid_code:
1439       src = src_base;
1440       consumed_chars = consumed_chars_base;
1441       ONE_MORE_BYTE (c);
1442       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1443     }
1444
1445  no_more_source:
1446   coding->consumed_char += consumed_chars_base;
1447   coding->consumed = src_base - coding->source;
1448   coding->charbuf_used = charbuf - coding->charbuf;
1449 }
1450
1451
1452 static bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in one of UTF-16 based coding systems.  */

/* True if the 16-bit code unit VAL lies in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (0xD800 == ((val) & 0xFC00))

/* True if the 16-bit code unit VAL lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (0xDC00 == ((val) & 0xFC00))
1523
1524
1525 static bool
1526 detect_coding_utf_16 (struct coding_system *coding,
1527                       struct coding_detection_info *detect_info)
1528 {
1529   const unsigned char *src = coding->source;
1530   const unsigned char *src_end = coding->source + coding->src_bytes;
1531   bool multibytep = coding->src_multibyte;
1532   int c1, c2;
1533
1534   detect_info->checked |= CATEGORY_MASK_UTF_16;
1535   if (coding->mode & CODING_MODE_LAST_BLOCK
1536       && (coding->src_chars & 1))
1537     {
1538       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1539       return 0;
1540     }
1541
1542   TWO_MORE_BYTES (c1, c2);
1543   if ((c1 == 0xFF) && (c2 == 0xFE))
1544     {
1545       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1546                              | CATEGORY_MASK_UTF_16_AUTO);
1547       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1548                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1549                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1550     }
1551   else if ((c1 == 0xFE) && (c2 == 0xFF))
1552     {
1553       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1554                              | CATEGORY_MASK_UTF_16_AUTO);
1555       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1556                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1557                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1558     }
1559   else if (c2 < 0)
1560     {
1561       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1562       return 0;
1563     }
1564   else
1565     {
1566       /* We check the dispersion of Eth and Oth bytes where E is even and
1567          O is odd.  If both are high, we assume binary data.*/
1568       unsigned char e[256], o[256];
1569       unsigned e_num = 1, o_num = 1;
1570
1571       memset (e, 0, 256);
1572       memset (o, 0, 256);
1573       e[c1] = 1;
1574       o[c2] = 1;
1575
1576       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1577                                 |CATEGORY_MASK_UTF_16_BE
1578                                 | CATEGORY_MASK_UTF_16_LE);
1579
1580       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1581              != CATEGORY_MASK_UTF_16)
1582         {
1583           TWO_MORE_BYTES (c1, c2);
1584           if (c2 < 0)
1585             break;
1586           if (! e[c1])
1587             {
1588               e[c1] = 1;
1589               e_num++;
1590               if (e_num >= 128)
1591                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1592             }
1593           if (! o[c2])
1594             {
1595               o[c2] = 1;
1596               o_num++;
1597               if (o_num >= 128)
1598                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1599             }
1600         }
1601       return 0;
1602     }
1603
1604  no_more_source:
1605   return 1;
1606 }
1607 /* Decode the UTF-16 bytes of CODING->source into CODING->charbuf: skip a leading BOM if one is expected, join surrogate pairs, and record the progress made in CODING.  */
1608 static void
1609 decode_coding_utf_16 (struct coding_system *coding)
1610 {
1611   const unsigned char *src = coding->source + coding->consumed;
1612   const unsigned char *src_end = coding->source + coding->src_bytes;
1613   const unsigned char *src_base;
1614   int *charbuf = coding->charbuf + coding->charbuf_used;
1615   /* We may produce at most 3 chars in one loop, so stop while 3 slots remain.  */
1616   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1617   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1618   bool multibytep = coding->src_multibyte;
1619   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1620   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1621   int surrogate = CODING_UTF_16_SURROGATE (coding);  /* High surrogate left pending in CODING, or 0.  */
1622   bool eol_dos
1623     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1624   int byte_after_cr1 = -1, byte_after_cr2 = -1;  /* Two bytes read ahead after a CR when EOL is DOS; -1 if none pending.  */
1625
1626   if (bom == utf_with_bom)
1627     {
1628       int c, c1, c2;
1629
1630       src_base = src;
1631       ONE_MORE_BYTE (c1);
1632       ONE_MORE_BYTE (c2);
1633       c = (c1 << 8) | c2;  /* Always assembled big-endian; the test below swaps the constant instead.  */
1634
1635       if (endian == utf_16_big_endian
1636           ? c != 0xFEFF : c != 0xFFFE)
1637         {
1638           /* The first two bytes are not BOM.  Treat them as bytes
1639              for a normal character.  */
1640           src = src_base;  /* NOTE(review): consumed_chars is not rewound along with SRC -- confirm that is intended.  */
1641         }
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;  /* Do not look for a BOM again on later calls.  */
1643     }
1644   else if (bom == utf_detect_bom)
1645     {
1646       /* We have already tried to detect BOM and failed in
1647          detect_coding.  */
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650
1651   while (1)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;  /* Only the source before SRC_BASE is reported as consumed on exit.  */
1656       consumed_chars_base = consumed_chars;
1657
1658       if (charbuf >= charbuf_end)
1659         {
1660           if (byte_after_cr1 >= 0)
1661             src_base -= 2;  /* Leave the two lookahead bytes unconsumed.  */
1662           break;
1663         }
1664
1665       if (byte_after_cr1 >= 0)
1666         c1 = byte_after_cr1, byte_after_cr1 = -1;
1667       else
1668         ONE_MORE_BYTE (c1);
1669       if (c1 < 0)  /* Presumably a non-byte character of a multibyte source, negated -- confirm at ONE_MORE_BYTE.  */
1670         {
1671           *charbuf++ = -c1;
1672           continue;
1673         }
1674       if (byte_after_cr2 >= 0)
1675         c2 = byte_after_cr2, byte_after_cr2 = -1;
1676       else
1677         ONE_MORE_BYTE (c2);
1678       if (c2 < 0)
1679         {
1680           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);  /* C1 has no partner: emit it as a lone byte.  */
1681           *charbuf++ = -c2;
1682           continue;
1683         }
1684       c = (endian == utf_16_big_endian
1685            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1686
1687       if (surrogate)  /* A high surrogate is pending.  */
1688         {
1689           if (! UTF_16_LOW_SURROGATE_P (c))
1690             {
1691               if (endian == utf_16_big_endian)  /* Unpaired: emit the pending surrogate's two bytes in source order.  */
1692                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1693               else
1694                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1695               *charbuf++ = c1;
1696               *charbuf++ = c2;
1697               if (UTF_16_HIGH_SURROGATE_P (c))  /* C itself becomes the new pending high surrogate.  */
1698                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1699               else
1700                 *charbuf++ = c;
1701             }
1702           else
1703             {
1704               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);  /* 10 bits from each half of the pair.  */
1705               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1706               *charbuf++ = 0x10000 + c;
1707             }
1708         }
1709       else
1710         {
1711           if (UTF_16_HIGH_SURROGATE_P (c))
1712             CODING_UTF_16_SURROGATE (coding) = surrogate = c;  /* Hold it until the next unit is seen.  */
1713           else
1714             {
1715               if (eol_dos && c == '\r')  /* Presumably so that a CR is not stored until its successor is available -- confirm.  */
1716                 {
1717                   ONE_MORE_BYTE (byte_after_cr1);
1718                   ONE_MORE_BYTE (byte_after_cr2);
1719                 }
1720               *charbuf++ = c;
1721             }
1722         }
1723     }
1724
1725  no_more_source:  /* No goto to this label is visible here; presumably ONE_MORE_BYTE jumps to it at end of source.  */
1726   coding->consumed_char += consumed_chars_base;
1727   coding->consumed = src_base - coding->source;
1728   coding->charbuf_used = charbuf - coding->charbuf;
1729 }
1730 /* Encode the code points in CODING->charbuf as UTF-16 into CODING->destination, emitting a BOM first unless the BOM type is utf_without_bom.  Always return 0.  */
1731 static bool
1732 encode_coding_utf_16 (struct coding_system *coding)
1733 {
1734   bool multibytep = coding->dst_multibyte;
1735   int *charbuf = coding->charbuf;
1736   int *charbuf_end = charbuf + coding->charbuf_used;
1737   unsigned char *dst = coding->destination + coding->produced;
1738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1739   int safe_room = 8;  /* Room requested through ASSURE_DESTINATION before the BOM and before each character.  */
1740   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1741   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1742   ptrdiff_t produced_chars = 0;
1743   int c;
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;  /* Emit the BOM only once for this CODING.  */
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)  /* Beyond MAX_UNICODE_CHAR: substitute CODING->default_char.  */
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)  /* Fits in a single 16-bit unit.  */
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
1771           int c1, c2;
1772
1773           c -= 0x10000;  /* Split the remaining bits into a 0xD800-based and a 0xDC00-based unit.  */
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xFF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in a data sent to a
1825    process).  But, it is possible to have a text externally in this
1826    format (i.e. by encoding by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of the composition methods (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded by two bytes of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form but in this special format:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861 /* Indexed by a byte value.  The code below treats the entry as the total byte length (1 to 4) of the emacs-mule sequence which that byte starts.  */
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Return false after marking the emacs-mule category rejected in DETECT_INFO; otherwise return true, having added the category to DETECT_INFO->found if a multibyte sequence or a composition was seen.  */
1867
1868 static bool
1869 detect_coding_emacs_mule (struct coding_system *coding,
1870                           struct coding_detection_info *detect_info)
1871 {
1872   const unsigned char *src = coding->source, *src_base;
1873   const unsigned char *src_end = coding->source + coding->src_bytes;
1874   bool multibytep = coding->src_multibyte;
1875   ptrdiff_t consumed_chars = 0;
1876   int c;
1877   int found = 0;
1878
1879   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1880   /* A coding system of this category is always ASCII compatible.  */
1881   src += coding->head_ascii;
1882
1883   while (1)
1884     {
1885       src_base = src;  /* Start of the item being examined; compared with SRC at no_more_source.  */
1886       ONE_MORE_BYTE (c);
1887       if (c < 0)  /* Presumably a non-byte character of a multibyte source -- confirm at ONE_MORE_BYTE.  */
1888         continue;
1889       if (c == 0x80)
1890         {
1891           /* Perhaps the start of composite character.  We simply skip
1892              it because analyzing it is too heavy for detecting.  But,
1893              at least, we check that the composite character
1894              consists of more than 4 bytes.  */
1895           const unsigned char *src_start;
1896
1897         repeat:
1898           src_start = src;
1899           do
1900             {
1901               ONE_MORE_BYTE (c);
1902             }
1903           while (c >= 0xA0);  /* Skip the run of bytes >= 0xA0; C ends up as the first byte below 0xA0.  */
1904
1905           if (src - src_start <= 4)  /* Too short: leave the loop and reject.  */
1906             break;
1907           found = CATEGORY_MASK_EMACS_MULE;
1908           if (c == 0x80)  /* Another composition follows immediately.  */
1909             goto repeat;
1910         }
1911
1912       if (c < 0x80)
1913         {
1914           if (c < 0x20
1915               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))  /* These three controls reject the category (presumably they indicate ISO 2022 text).  */
1916             break;
1917         }
1918       else
1919         {
1920           int more_bytes = emacs_mule_bytes[c] - 1;  /* Bytes expected after the leading byte C.  */
1921
1922           while (more_bytes > 0)
1923             {
1924               ONE_MORE_BYTE (c);
1925               if (c < 0xA0)
1926                 {
1927                   src--;        /* Unread the last byte.  */
1928                   break;
1929                 }
1930               more_bytes--;
1931             }
1932           if (more_bytes != 0)  /* Sequence cut short by a byte below 0xA0, or a table entry of 0: reject.  */
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935         }
1936     }
1937   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938   return 0;
1939
1940  no_more_source:
1941   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)  /* Bytes of an unfinished item were read and CODING_MODE_LAST_BLOCK is set: reject.  */
1942     {
1943       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1944       return 0;
1945     }
1946   detect_info->found |= found;
1947   return 1;
1948 }
1949
1950
1951 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1952    character, storing the number of bytes consumed in *NBYTES, the consumed-character count in *NCHARS and, if ID is non-null, the charset id in *ID.
1953    If CMP_STATUS indicates that we must expect MSEQ or RULE described above, decode it and return the negative value of
1954    the decoded character or rule (*ID is not stored for a RULE).  If an invalid byte is found, return
1955    -1.  If SRC is too short, return -2.  In those two cases no output parameter is stored.  */
1956
1957 static int
1958 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1959                  int *nbytes, int *nchars, int *id,
1960                  struct composition_status *cmp_status)
1961 {
1962   const unsigned char *src_end = coding->source + coding->src_bytes;
1963   const unsigned char *src_base = src;
1964   bool multibytep = coding->src_multibyte;
1965   int charset_ID;
1966   unsigned code;
1967   int c;
1968   ptrdiff_t consumed_chars = 0;
1969   bool mseq_found = 0;  /* Set when an old-form MSEQ was read, so the result is returned negated.  */
1970
1971   ONE_MORE_BYTE (c);
1972   if (c < 0)  /* Presumably a non-byte character of a multibyte source -- confirm at ONE_MORE_BYTE.  */
1973     {
1974       c = -c;
1975       charset_ID = emacs_mule_charset[0];
1976     }
1977   else
1978     {
1979       if (c >= 0xA0)  /* Accepted only as an element of an old-form composition; otherwise invalid.  */
1980         {
1981           if (cmp_status->state != COMPOSING_NO
1982               && cmp_status->old_form)
1983             {
1984               if (cmp_status->state == COMPOSING_CHAR)
1985                 {
1986                   if (c == 0xA0)  /* MSEQ for ASCII: 0xA0 ASCII_CODE+0x80.  */
1987                     {
1988                       ONE_MORE_BYTE (c);
1989                       c -= 0x80;
1990                       if (c < 0)
1991                         goto invalid_code;
1992                     }
1993                   else
1994                     c -= 0x20;  /* MSEQ for others: recover the leading code from LEADING_CODE+0x20.  */
1995                   mseq_found = 1;
1996                 }
1997               else
1998                 {
1999                   *nbytes = src - src_base;  /* Not expecting a character: return the raw byte negated as a RULE.  */
2000                   *nchars = consumed_chars;
2001                   return -c;
2002                 }
2003             }
2004           else
2005             goto invalid_code;
2006         }
2007
2008       switch (emacs_mule_bytes[c])  /* Dispatch on the total sequence length for leading byte C.  */
2009         {
2010         case 2:  /* Leading code and one position code.  */
2011           if ((charset_ID = emacs_mule_charset[c]) < 0)
2012             goto invalid_code;
2013           ONE_MORE_BYTE (c);
2014           if (c < 0xA0)
2015             goto invalid_code;
2016           code = c & 0x7F;
2017           break;
2018
2019         case 3:
2020           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2021               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2022             {
2023               ONE_MORE_BYTE (c);  /* The second byte selects the charset; one position code follows.  */
2024               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2025                 goto invalid_code;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code = c & 0x7F;
2030             }
2031           else
2032             {
2033               if ((charset_ID = emacs_mule_charset[c]) < 0)  /* Leading code and two position codes.  */
2034                 goto invalid_code;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code = (c & 0x7F) << 8;
2039               ONE_MORE_BYTE (c);
2040               if (c < 0xA0)
2041                 goto invalid_code;
2042               code |= c & 0x7F;
2043             }
2044           break;
2045
2046         case 4:  /* The second byte selects the charset; two position codes follow.  */
2047           ONE_MORE_BYTE (c);
2048           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)  /* NOTE(review): tests C < 0 where the private 3-byte case tests C < 0xA0 -- confirm that is intended.  */
2049             goto invalid_code;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code = (c & 0x7F) << 8;
2054           ONE_MORE_BYTE (c);
2055           if (c < 0xA0)
2056             goto invalid_code;
2057           code |= c & 0x7F;
2058           break;
2059
2060         case 1:  /* A single byte stands for itself.  */
2061           code = c;
2062           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2063           break;
2064
2065         default:  /* Any other table value is treated as a fatal internal error.  */
2066           emacs_abort ();
2067         }
2068       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2069                           CHARSET_FROM_ID (charset_ID), code, c);
2070       if (c < 0)  /* CODE could not be decoded in that charset.  */
2071         goto invalid_code;
2072     }
2073   *nbytes = src - src_base;
2074   *nchars = consumed_chars;
2075   if (id)
2076     *id = charset_ID;
2077   return (mseq_found ? -c : c);
2078
2079  no_more_source:
2080   return -2;
2081
2082  invalid_code:
2083   return -1;
2084 }
2085
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2088
2089 /* Handle these composition sequences ('|': the end of header elements,
2090    BYTES and CHARS >= 0xA0):
2091
2092    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2093    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2094    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2095
2096    and these old forms:
2097
2098    (4) relative composition: 0x80 | MSEQ ... MSEQ
2099    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2100
2101    When the starter 0x80 and the following header elements are found,
2102    this annotation header is produced.
2103
2104         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2105
2106    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108
2109    Then, upon reading the following elements, these codes are produced
2110    until the composition end is found:
2111
2112    (1) CHAR ... CHAR
2113    (2) ALT ... ALT CHAR ... CHAR
2114    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2115    (4) CHAR ... CHAR
2116    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2117
2118    When the composition end is found, LENGTH and NCHARS in the
2119    annotation header are updated as below:
2120
2121    (1) LENGTH: unchanged, NCHARS: unchanged
2122    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2125    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2126
2127    If an error is found while composing, the annotation header is
2128    changed to the original composition header (plus filler -1s) as
2129    below:
2130
2131    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2132    (5)          [ 0x80 0xFF -1 -1 -1 ]
2133
2134    and the sequence [ -2 DECODED-RULE ] is changed to the original
2135    byte sequence as below:
2136         o the original byte sequence is B: [ B -1 ]
2137         o the original byte sequence is B1 B2: [ B1 B2 ]
2138
2139    Most of the routines are implemented by macros because many
2140    variables and labels in the caller decode_coding_emacs_mule must be
2141    accessible, and they are usually called just once (thus they don't
2142    increase the size of the compiled object).  */
2143
2144 /* Decode a composition rule represented by C as a component of
2145    composition sequence of Emacs 20 style.  Set RULE to the decoded
2146    rule.  C is modified; jump to invalid_code unless C is in 0xA0..0xF0.  */
2147
2148 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2149   do {                                                  \
2150     int gref, nref;                                     \
2151                                                         \
2152     c -= 0xA0;                                          \
2153     if (c < 0 || c >= 81)                               \
2154       goto invalid_code;                                \
2155     gref = c / 9, nref = c % 9;  /* C = 9 * GREF + NREF */ \
2156     if (gref == 4) gref = 10;                           \
2157     if (nref == 4) nref = 10;                           \
2158     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2159   } while (0)
2160
2161
2162 /* Decode a composition rule represented by C and the following byte
2163    at SRC as a component of composition sequence of Emacs 21 style.
2164    Set RULE to the decoded rule.  C ends up holding the second byte; jump to invalid_code unless both bytes are in 0x20..0x70.  */
2165
2166 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2167   do {                                                  \
2168     int gref, nref;                                     \
2169                                                         \
2170     gref = c - 0x20;                                    \
2171     if (gref < 0 || gref >= 81)                         \
2172       goto invalid_code;                                \
2173     ONE_MORE_BYTE (c);                                  \
2174     nref = c - 0x20;                                    \
2175     if (nref < 0 || nref >= 81)                         \
2176       goto invalid_code;                                \
2177     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2178   } while (0)
2179
2180
2181 /* Start of Emacs 21 style format.  On entry C holds the METHOD byte
2182    (0xF2 + method); the next two bytes at SRC are BYTES (0xA0 + the
2183    byte length of this composition information) and CHARS (0xA0 + the number of
2184    characters composed by this composition).  Jump to invalid_code on a bad header; otherwise set up CMP_STATUS and add the annotation header to CHARBUF.  */
2185
2186 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2187   do {                                                                  \
2188     enum composition_method method = c - 0xF2;                          \
2189     int nbytes, nchars;                                                 \
2190                                                                         \
2191     ONE_MORE_BYTE (c);                                                  \
2192     if (c < 0)                                                          \
2193       goto invalid_code;                                                \
2194     nbytes = c - 0xA0;                                                  \
2195     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2196       goto invalid_code;                                                \
2197     ONE_MORE_BYTE (c);                                                  \
2198     nchars = c - 0xA0;                                                  \
2199     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2200       goto invalid_code;                                                \
2201     cmp_status->old_form = 0;                                           \
2202     cmp_status->method = method;                                        \
2203     if (method == COMPOSITION_RELATIVE)                                 \
2204       cmp_status->state = COMPOSING_CHAR;                               \
2205     else                                                                \
2206       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2207     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2208     cmp_status->nchars = nchars;                                        \
2209     cmp_status->ncomps = nbytes - 4;  /* What BYTES counts beyond the 4 header bytes.  */ \
2210     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2211   } while (0)
2212
2213
2214 /* Start of Emacs 20 style format for relative composition: mark CMP_STATUS as old form expecting a character, and add an annotation header with NCHARS and NBYTES of 0 to CHARBUF.  */
2215
2216 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2217   do {                                                          \
2218     cmp_status->old_form = 1;                                   \
2219     cmp_status->method = COMPOSITION_RELATIVE;                  \
2220     cmp_status->state = COMPOSING_CHAR;                         \
2221     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2222     cmp_status->nchars = cmp_status->ncomps = 0;                \
2223     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2224   } while (0)
2225
2226
2227 /* Start of Emacs 20 style format for rule-base composition: same setup as the relative case except that the method is COMPOSITION_WITH_RULE.  */
2228
2229 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2230   do {                                                          \
2231     cmp_status->old_form = 1;                                   \
2232     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2233     cmp_status->state = COMPOSING_CHAR;                         \
2234     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2235     cmp_status->nchars = cmp_status->ncomps = 0;                \
2236     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2237   } while (0)
2238
2239 /* Read the byte after a composition starter and begin the matching composition: 0xF2 plus a composition method starts the Emacs 21 form, 0xA0..0xBF an Emacs 20 relative composition, 0xFF an Emacs 20 rule-base composition.  Any other byte jumps to invalid_code.  */
2240 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2241   do {                                                  \
2242     const unsigned char *current_src = src;             \
2243                                                         \
2244     ONE_MORE_BYTE (c);                                  \
2245     if (c < 0)                                          \
2246       goto invalid_code;                                \
2247     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2248         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2249       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2250     else if (c < 0xA0)                                  \
2251       goto invalid_code;                                \
2252     else if (c < 0xC0)                                  \
2253       {                                                 \
2254         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2255         /* Re-read C as a composition component.  */    \
2256         src = current_src;                              \
2257       }                                                 \
2258     else if (c == 0xFF)                                 \
2259       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2260     else                                                \
2261       goto invalid_code;                                \
2262   } while (0)
2263 /* Close the current composition: patch the annotation header that starts CMP_STATUS->length elements before CHARBUF (NCHARS for the old form; LENGTH becomes NCHARS minus the length for methods above COMPOSITION_RELATIVE), then set the state to COMPOSING_NO.  */
2264 #define EMACS_MULE_COMPOSITION_END()                            \
2265   do {                                                          \
2266     int idx = - cmp_status->length;                             \
2267                                                                 \
2268     if (cmp_status->old_form)                                   \
2269       charbuf[idx + 2] = cmp_status->nchars;                    \
2270     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2271       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2272     cmp_status->state = COMPOSING_NO;                           \
2273   } while (0)
2274
2275 /* Terminate the composition in progress whose annotation header starts CMP_STATUS->length elements before CHARBUF.  An old-form composition that already has characters is closed as it stands; otherwise the header is rewritten as the original header bytes.  Return the count the caller adds to its character offset.  */
2276 static int
2277 emacs_mule_finish_composition (int *charbuf,
2278                                struct composition_status *cmp_status)
2279 {
2280   int idx = - cmp_status->length;  /* Offset of the header's first element relative to CHARBUF.  */
2281   int new_chars;
2282
2283   if (cmp_status->old_form && cmp_status->nchars > 0)
2284     {
2285       charbuf[idx + 2] = cmp_status->nchars;  /* Store the final NCHARS in the header.  */
2286       new_chars = 0;
2287       if (cmp_status->method == COMPOSITION_WITH_RULE
2288           && cmp_status->state == COMPOSING_CHAR)
2289         {
2290           /* The last rule was invalid.  */
2291           int rule = charbuf[-1] + 0xA0;
2292
2293           charbuf[-2] = BYTE8_TO_CHAR (rule);  /* Turn the trailing [ -2 DECODED-RULE ] into [ B -1 ].  */
2294           charbuf[-1] = -1;
2295           new_chars = 1;
2296         }
2297     }
2298   else
2299     {
2300       charbuf[idx++] = BYTE8_TO_CHAR (0x80);  /* From here IDX points one element past the header's first.  */
2301
2302       if (cmp_status->method == COMPOSITION_WITH_RULE)
2303         {
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2305           charbuf[idx++] = -3;  /* NOTE(review): the format comment earlier in this file shows -1 fillers for this case; -3 and 0 are what the code stores -- confirm which is intended.  */
2306           charbuf[idx++] = 0;
2307           new_chars = 1;
2308         }
2309       else
2310         {
2311           int nchars = charbuf[idx + 1] + 0xA0;  /* Read both header slots before they are overwritten below.  */
2312           int nbytes = charbuf[idx + 2] + 0xA0;
2313
2314           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2317           charbuf[idx++] = -1;
2318           new_chars = 4;
2319         }
2320     }
2321   cmp_status->state = COMPOSING_NO;
2322   return new_chars;
2323 }
2324 /* If a composition is in progress, terminate it with emacs_mule_finish_composition and add its return value to char_offset.  Uses the charbuf, cmp_status and char_offset of the expansion site.  */
2325 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2326   do {                                                                    \
2327     if (cmp_status->state != COMPOSING_NO)                                \
2328       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2329   } while (0)
2330
2331
2332 static void
2333 decode_coding_emacs_mule (struct coding_system *coding)
2334 {
2335   const unsigned char *src = coding->source + coding->consumed;
2336   const unsigned char *src_end = coding->source + coding->src_bytes;
2337   const unsigned char *src_base;
2338   int *charbuf = coding->charbuf + coding->charbuf_used;
2339   /* We may produce two annotations (charset and composition) in one
2340      loop and one more charset annotation at the end.  */
2341   int *charbuf_end
2342     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2343       /* We can produce up to 2 characters in a loop.  */
2344       - 1;
2345   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2346   bool multibytep = coding->src_multibyte;
2347   ptrdiff_t char_offset = coding->produced_char;
2348   ptrdiff_t last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   bool eol_dos
2351     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   if (cmp_status->state != COMPOSING_NO)
2356     {
2357       int i;
2358
2359       if (charbuf_end - charbuf < cmp_status->length)
2360         emacs_abort ();
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c;
2369       int id UNINIT;
2370
2371       src_base = src;
2372       consumed_chars_base = consumed_chars;
2373
2374       if (charbuf >= charbuf_end)
2375         {
2376           if (byte_after_cr >= 0)
2377             src_base--;
2378           break;
2379         }
2380
2381       if (byte_after_cr >= 0)
2382         c = byte_after_cr, byte_after_cr = -1;
2383       else
2384         ONE_MORE_BYTE (c);
2385
2386       if (c < 0 || c == 0x80)
2387         {
2388           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2389           if (c < 0)
2390             {
2391               *charbuf++ = -c;
2392               char_offset++;
2393             }
2394           else
2395             DECODE_EMACS_MULE_COMPOSITION_START ();
2396           continue;
2397         }
2398
2399       if (c < 0x80)
2400         {
2401           if (eol_dos && c == '\r')
2402             ONE_MORE_BYTE (byte_after_cr);
2403           id = charset_ascii;
2404           if (cmp_status->state != COMPOSING_NO)
2405             {
2406               if (cmp_status->old_form)
2407                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2409                 cmp_status->ncomps--;
2410             }
2411         }
2412       else
2413         {
2414           int nchars UNINIT, nbytes UNINIT;
2415           /* emacs_mule_char can load a charset map from a file, which
2416              allocates a large structure and might cause buffer text
2417              to be relocated as result.  Thus, we need to remember the
2418              original pointer to buffer text, and fix up all related
2419              pointers after the call.  */
2420           const unsigned char *orig = coding->source;
2421           ptrdiff_t offset;
2422
2423           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2424                                cmp_status);
2425           offset = coding->source - orig;
2426           if (offset)
2427             {
2428               src += offset;
2429               src_base += offset;
2430               src_end += offset;
2431             }
2432           if (c < 0)
2433             {
2434               if (c == -1)
2435                 goto invalid_code;
2436               if (c == -2)
2437                 break;
2438             }
2439           src = src_base + nbytes;
2440           consumed_chars = consumed_chars_base + nchars;
2441           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2442             cmp_status->ncomps -= nchars;
2443         }
2444
2445       /* Now if C >= 0, we found a normally encoded character, if C <
2446          0, we found an old-style composition component character or
2447          rule.  */
2448
2449       if (cmp_status->state == COMPOSING_NO)
2450         {
2451           if (last_id != id)
2452             {
2453               if (last_id != charset_ascii)
2454                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2455                                   last_id);
2456               last_id = id;
2457               last_offset = char_offset;
2458             }
2459           *charbuf++ = c;
2460           char_offset++;
2461         }
2462       else if (cmp_status->state == COMPOSING_CHAR)
2463         {
2464           if (cmp_status->old_form)
2465             {
2466               if (c >= 0)
2467                 {
2468                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2469                   *charbuf++ = c;
2470                   char_offset++;
2471                 }
2472               else
2473                 {
2474                   *charbuf++ = -c;
2475                   cmp_status->nchars++;
2476                   cmp_status->length++;
2477                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2478                     EMACS_MULE_COMPOSITION_END ();
2479                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2480                     cmp_status->state = COMPOSING_RULE;
2481                 }
2482             }
2483           else
2484             {
2485               *charbuf++ = c;
2486               cmp_status->length++;
2487               cmp_status->nchars--;
2488               if (cmp_status->nchars == 0)
2489                 EMACS_MULE_COMPOSITION_END ();
2490             }
2491         }
2492       else if (cmp_status->state == COMPOSING_RULE)
2493         {
2494           int rule;
2495
2496           if (c >= 0)
2497             {
2498               EMACS_MULE_COMPOSITION_END ();
2499               *charbuf++ = c;
2500               char_offset++;
2501             }
2502           else
2503             {
2504               c = -c;
2505               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2506               if (rule < 0)
2507                 goto invalid_code;
2508               *charbuf++ = -2;
2509               *charbuf++ = rule;
2510               cmp_status->length += 2;
2511               cmp_status->state = COMPOSING_CHAR;
2512             }
2513         }
2514       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2515         {
2516           *charbuf++ = c;
2517           cmp_status->length++;
2518           if (cmp_status->ncomps == 0)
2519             cmp_status->state = COMPOSING_CHAR;
2520           else if (cmp_status->ncomps > 0)
2521             {
2522               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2523                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2524             }
2525           else
2526             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2527         }
2528       else                      /* COMPOSING_COMPONENT_RULE */
2529         {
2530           int rule;
2531
2532           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2533           if (rule < 0)
2534             goto invalid_code;
2535           *charbuf++ = -2;
2536           *charbuf++ = rule;
2537           cmp_status->length += 2;
2538           cmp_status->ncomps--;
2539           if (cmp_status->ncomps > 0)
2540             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2541           else
2542             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543         }
2544       continue;
2545
2546     invalid_code:
2547       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2548       src = src_base;
2549       consumed_chars = consumed_chars_base;
2550       ONE_MORE_BYTE (c);
2551       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2552       char_offset++;
2553     }
2554
2555  no_more_source:
2556   if (cmp_status->state != COMPOSING_NO)
2557     {
2558       if (coding->mode & CODING_MODE_LAST_BLOCK)
2559         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560       else
2561         {
2562           int i;
2563
2564           charbuf -= cmp_status->length;
2565           for (i = 0; i < cmp_status->length; i++)
2566             cmp_status->carryover[i] = charbuf[i];
2567         }
2568     }
2569   if (last_id != charset_ascii)
2570     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2571   coding->consumed_char += consumed_chars_base;
2572   coding->consumed = src_base - coding->source;
2573   coding->charbuf_used = charbuf - coding->charbuf;
2574 }
2575
2576
/* Store in CODES[0] and CODES[1] the leading code(s) that precede the
   code bytes of a character whose charset has emacs-mule id ID.

   An ID below 0xA0 is its own leading code, and CODES[1] is set to 0
   to say that there is no second leading byte.  A larger ID gets a
   two-byte prefix: 0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for
   0xF0..0xF4 and 0x9D for anything above, each followed by ID itself.

   The expansion is one statement without a trailing semicolon, so
   `EMACS_MULE_LEADING_CODES (id, codes);' can be used as the body of
   an if/else.  Both arguments are parenthesized in the expansion, but
   they are evaluated more than once: do not pass expressions with
   side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2590
2591
/* Encode the elements of CODING->charbuf in the emacs-mule format,
   writing the bytes to CODING->destination from offset
   CODING->produced onwards.

   A non-negative element is a character.  A negative element -N opens
   an annotation that takes up N elements in total: a charset
   annotation selects the charset to try first for the characters that
   follow, a composition annotation is skipped, and any other kind
   aborts.

   On return CODING->produced_char has been increased by the local
   count produced_chars, CODING->produced is the new end offset, and
   CODING_RESULT_SUCCESS has been recorded.  The return value is
   always 0.  */
2592 static bool
2593 encode_coding_emacs_mule (struct coding_system *coding)
2594 {
       /* NOTE(review): multibytep and produced_chars are never named in
          the statements below; presumably the EMIT_* macros read and
          update them -- confirm before renaming or removing either.  */
2595   bool multibytep = coding->dst_multibyte;
2596   int *charbuf = coding->charbuf;
2597   int *charbuf_end = charbuf + coding->charbuf_used;
2598   unsigned char *dst = coding->destination + coding->produced;
2599   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each element.
          NOTE(review): presumably an upper bound on the bytes one
          iteration can emit -- confirm.  */
2600   int safe_room = 8;
2601   ptrdiff_t produced_chars = 0;
2602   Lisp_Object attrs, charset_list;
2603   int c;
       /* Charset id taken from the latest charset annotation, or -1 when
          there is none to prefer.  */
2604   int preferred_charset_id = -1;
2605
2606   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list; store
          it in ATTRS if the coding system carried a different list.  */
2607   if (! EQ (charset_list, Vemacs_mule_charset_list))
2608     {
2609       charset_list = Vemacs_mule_charset_list;
2610       ASET (attrs, coding_attr_charset_list, charset_list);
2611     }
2612
2613   while (charbuf < charbuf_end)
2614     {
2615       ASSURE_DESTINATION (safe_room);
2616       c = *charbuf++;
2617
2618       if (c < 0)
2619         {
2620           /* Handle an annotation.  */
2621           switch (*charbuf)
2622             {
2623             case CODING_ANNOTATE_COMPOSITION_MASK:
2624               /* Not yet implemented.  */
2625               break;
2626             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF was already stepped past the
                      length element, so charbuf[3] is the fifth element
                      counted from the start of the annotation.  When -c
                      is 4 that element lies after the ones skipped by
                      `charbuf += -c - 1' below.  Check the index against
                      the layout that ADD_CHARSET_DATA writes.  */
2627               preferred_charset_id = charbuf[3];
                   /* A charset that is not in CHARSET_LIST is not
                      preferred.  */
2628               if (preferred_charset_id >= 0
2629                   && NILP (Fmemq (make_number (preferred_charset_id),
2630                                   charset_list)))
2631                 preferred_charset_id = -1;
2632               break;
2633             default:
2634               emacs_abort ();
2635             }
               /* Skip the rest of the annotation: -c elements in all, one
                  of which (the length) has been consumed already.  */
2636           charbuf += -c - 1;
2637           continue;
2638         }
2639
2640       if (ASCII_CHAR_P (c))
2641         EMIT_ONE_ASCII_BYTE (c);
2642       else if (CHAR_BYTE8_P (c))
2643         {
2644           c = CHAR_TO_BYTE8 (c);
2645           EMIT_ONE_BYTE (c);
2646         }
2647       else
2648         {
2649           struct charset *charset;
2650           unsigned code;
2651           int dimension;
2652           int emacs_mule_id;
2653           unsigned char leading_codes[2];
2654
               /* Try the preferred charset first; if there is none, or it
                  does not contain C, search CHARSET_LIST.  */
2655           if (preferred_charset_id >= 0)
2656             {
2657               bool result;
2658
2659               charset = CHARSET_FROM_ID (preferred_charset_id);
2660               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2661               if (result)
2662                 code = ENCODE_CHAR (charset, c);
2663               else
2664                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2665                                      &code, charset);
2666             }
2667           else
2668             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669                                  &code, charset);
               /* No charset in the list contains C: encode
                  CODING->default_char in its place.  */
2670           if (! charset)
2671             {
2672               c = coding->default_char;
2673               if (ASCII_CHAR_P (c))
2674                 {
2675                   EMIT_ONE_ASCII_BYTE (c);
2676                   continue;
2677                 }
                   /* NOTE(review): CHARSET is not tested again after this
                      lookup; the code below assumes default_char is always
                      found in CHARSET_LIST -- TODO confirm.  */
2678               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2679                                    &code, charset);
2680             }
2681           dimension = CHARSET_DIMENSION (charset);
2682           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
               /* Emit one or two leading codes (the second only when it is
                  non-zero), then the code point with the top bit of each
                  byte set: one byte for dimension 1, two bytes otherwise.  */
2683           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2684           EMIT_ONE_BYTE (leading_codes[0]);
2685           if (leading_codes[1])
2686             EMIT_ONE_BYTE (leading_codes[1]);
2687           if (dimension == 1)
2688             EMIT_ONE_BYTE (code | 0x80);
2689           else
2690             {
2691               code |= 0x8080;
2692               EMIT_ONE_BYTE (code >> 8);
2693               EMIT_ONE_BYTE (code & 0xFF);
2694             }
2695         }
2696     }
2697   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2698   coding->produced_char += produced_chars;
2699   coding->produced = dst - coding->destination;
2700   return 0;
2701 }
2702
2703 \f
2704 /*** 7. ISO2022 handlers ***/
2705
2706 /* The following note describes the coding system ISO2022 briefly.
2707    Since the intention of this note is to help understand the
2708    functions in this file, some parts are NOT ACCURATE or are OVERLY
2709    SIMPLIFIED.  For thorough understanding, please refer to the
2710    original document of ISO2022.  This is equivalent to the standard
2711    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2712
2713    ISO2022 provides many mechanisms to encode several character sets
2714    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2715    is encoded using bytes less than 128.  This may make the encoded
2716    text a little bit longer, but the text passes more easily through
2717    several types of gateway, some of which strip off the MSB (Most
2718    Significant Bit).
2719
2720    There are two kinds of character sets: control character sets and
2721    graphic character sets.  The former contain control characters such
2722    as `newline' and `escape' to provide control functions (control
2723    functions are also provided by escape sequences).  The latter
2724    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2725    two control character sets and many graphic character sets.
2726
2727    Graphic character sets are classified into one of the following
2728    four classes, according to the number of bytes (DIMENSION) and
2729    number of characters in one dimension (CHARS) of the set:
2730    - DIMENSION1_CHARS94
2731    - DIMENSION1_CHARS96
2732    - DIMENSION2_CHARS94
2733    - DIMENSION2_CHARS96
2734
2735    In addition, each character set is assigned an identification tag,
2736    unique for each set, called the "final character" (denoted as <F>
2737    hereafter).  The <F> of each character set is decided by ECMA(*)
2738    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2739    (0x30..0x3F are for private use only).
2740
2741    Note (*): ECMA = European Computer Manufacturers Association
2742
2743    Here are examples of graphic character sets [NAME(<F>)]:
2744         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2745         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2746         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2747         o DIMENSION2_CHARS96 -- none for the moment
2748
2749    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2750         C0 [0x00..0x1F] -- control character plane 0
2751         GL [0x20..0x7F] -- graphic character plane 0
2752         C1 [0x80..0x9F] -- control character plane 1
2753         GR [0xA0..0xFF] -- graphic character plane 1
2754
2755    A control character set is directly designated and invoked to C0 or
2756    C1 by an escape sequence.  The most common case is that:
2757    - ISO646's  control character set is designated/invoked to C0, and
2758    - ISO6429's control character set is designated/invoked to C1,
2759    and usually these designations/invocations are omitted in encoded
2760    text.  In a 7-bit environment, only C0 can be used, and a control
2761    character for C1 is encoded by an appropriate escape sequence to
2762    fit into the environment.  All control characters for C1 are
2763    defined to have corresponding escape sequences.
2764
2765    A graphic character set is at first designated to one of four
2766    graphic registers (G0 through G3), then these graphic registers are
2767    invoked to GL or GR.  These designations and invocations can be
2768    done independently.  The most common case is that G0 is invoked to
2769    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2770    these invocations and designations are omitted in encoded text.
2771    In a 7-bit environment, only GL can be used.
2772
2773    When a graphic character set of CHARS94 is invoked to GL, codes
2774    0x20 and 0x7F of the GL area work as control characters SPACE and
2775    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2776    be used.
2777
2778    There are two ways of invocation: locking-shift and single-shift.
2779    With locking-shift, the invocation lasts until the next different
2780    invocation, whereas with single-shift, the invocation affects the
2781    following character only and doesn't affect the locking-shift
2782    state.  Invocations are done by the following control characters or
2783    escape sequences:
2784
2785    ----------------------------------------------------------------------
2786    abbrev  function                  cntrl escape seq   description
2787    ----------------------------------------------------------------------
2788    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2789    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2790    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2791    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2792    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2793    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2794    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2795    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2796    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2797    ----------------------------------------------------------------------
2798    (*) These are not used by any known coding system.
2799
2800    Control characters for these functions are defined by macros
2801    ISO_CODE_XXX in `coding.h'.
2802
2803    Designations are done by the following escape sequences:
2804    ----------------------------------------------------------------------
2805    escape sequence      description
2806    ----------------------------------------------------------------------
2807    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2808    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2809    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2810    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2811    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2812    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2813    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2814    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2815    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2816    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2817    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2818    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2819    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2820    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2821    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2822    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2823    ----------------------------------------------------------------------
2824
2825    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2826    of dimension 1, chars 94, and final character <F>, etc...
2827
2828    Note (*): Although these designations are not allowed in ISO2022,
2829    Emacs accepts them on decoding, and produces them on encoding
2830    CHARS96 character sets in a coding system which is characterized as
2831    7-bit environment, non-locking-shift, and non-single-shift.
2832
2833    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2834    '(' must be omitted.  We refer to this as "short-form" hereafter.
2835
2836    Now you may notice that there are a lot of ways of encoding the
2837    same multilingual text in ISO2022.  Actually, there exist many
2838    coding systems such as Compound Text (used in X11's inter-client
2839    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2840    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2841    localized platforms), and all of these are variants of ISO2022.
2842
2843    In addition to the above, Emacs handles two more kinds of escape
2844    sequences: ISO6429's direction specification and Emacs' private
2845    sequence for specifying character composition.
2846
2847    ISO6429's direction specification takes the following form:
2848         o CSI ']'      -- end of the current direction
2849         o CSI '0' ']'  -- end of the current direction
2850         o CSI '1' ']'  -- start of left-to-right text
2851         o CSI '2' ']'  -- start of right-to-left text
2852    The control character CSI (0x9B: control sequence introducer) is
2853    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2854
2855    Character composition specification takes the following form:
2856         o ESC '0' -- start relative composition
2857         o ESC '1' -- end composition
2858         o ESC '2' -- start rule-base composition (*)
2859         o ESC '3' -- start relative composition with alternate chars  (**)
2860         o ESC '4' -- start rule-base composition with alternate chars  (**)
2861   Since these are not standard escape sequences of any ISO standard,
2862   the use of them with these meanings is restricted to Emacs only.
2863
2864   (*) This form is used only in Emacs 20.7 and older versions,
2865   but newer versions can safely decode it.
2866   (**) This form is used only in Emacs 21.1 and newer versions,
2867   and older versions can't decode it.
2868
2869   Here's a list of example usages of these composition escape
2870   sequences (categorized by `enum composition_method').
2871
2872   COMPOSITION_RELATIVE:
2873         ESC 0 CHAR [ CHAR ] ESC 1
2874   COMPOSITION_WITH_RULE:
2875         ESC 2 CHAR [ RULE CHAR ] ESC 1
2876   COMPOSITION_WITH_ALTCHARS:
2877         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2878   COMPOSITION_WITH_RULE_ALTCHARS:
2879         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2880
/* Table of ISO code classes, 256 entries.  NOTE(review): presumably
   indexed by byte value; it is neither filled nor read in this part
   of the file -- confirm against its users.  */
2881 static enum iso_code_class_type iso_code_class[256];
2882
/* True if the charset with id ID is usable with CODING, i.e. ID lies
   in 0 .. CODING->max_charset_id and its byte in
   CODING->safe_charsets is something other than 255 (the filler that
   setup_iso_safe_charsets stores before assigning values).

   A negative ID is reported as not safe instead of indexing in front
   of the table: detect_coding_iso_2022 can pass an ID taken straight
   from iso_charset_table, whose entries may be negative.  ID is
   evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)              \
  ((id) >= 0                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[id] != 255)
2886
2887 static void
2888 setup_iso_safe_charsets (Lisp_Object attrs)
2889 {
2890   Lisp_Object charset_list, safe_charsets;
2891   Lisp_Object request;
2892   Lisp_Object reg_usage;
2893   Lisp_Object tail;
2894   EMACS_INT reg94, reg96;
2895   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2896   int max_charset_id;
2897
2898   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2899   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2900       && ! EQ (charset_list, Viso_2022_charset_list))
2901     {
2902       charset_list = Viso_2022_charset_list;
2903       ASET (attrs, coding_attr_charset_list, charset_list);
2904       ASET (attrs, coding_attr_safe_charsets, Qnil);
2905     }
2906
2907   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2908     return;
2909
2910   max_charset_id = 0;
2911   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2912     {
2913       int id = XINT (XCAR (tail));
2914       if (max_charset_id < id)
2915         max_charset_id = id;
2916     }
2917
2918   safe_charsets = make_uninit_string (max_charset_id + 1);
2919   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2920   request = AREF (attrs, coding_attr_iso_request);
2921   reg_usage = AREF (attrs, coding_attr_iso_usage);
2922   reg94 = XINT (XCAR (reg_usage));
2923   reg96 = XINT (XCDR (reg_usage));
2924
2925   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2926     {
2927       Lisp_Object id;
2928       Lisp_Object reg;
2929       struct charset *charset;
2930
2931       id = XCAR (tail);
2932       charset = CHARSET_FROM_ID (XINT (id));
2933       reg = Fcdr (Fassq (id, request));
2934       if (! NILP (reg))
2935         SSET (safe_charsets, XINT (id), XINT (reg));
2936       else if (charset->iso_chars_96)
2937         {
2938           if (reg96 < 4)
2939             SSET (safe_charsets, XINT (id), reg96);
2940         }
2941       else
2942         {
2943           if (reg94 < 4)
2944             SSET (safe_charsets, XINT (id), reg94);
2945         }
2946     }
2947   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2948 }
2949
2950
2951 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2952    Return true if a text is encoded in one of ISO-2022 based coding
2953    systems.

        The scan covers CODING->source, starting after the first
        CODING->head_ascii bytes, and collects two masks of ISO
        categories: FOUND (evidence for a category was seen) and REJECTED
        (the category is ruled out).  CATEGORY_MASK_ISO is always added to
        DETECT_INFO->checked.

        If REJECTED comes to equal CATEGORY_MASK_ISO the scan stops,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise, at `no_more_source', REJECTED is merged into
        DETECT_INFO->rejected, the found categories that were not rejected
        are merged into DETECT_INFO->found, and 1 is returned.  */
2954
2955 static bool
2956 detect_coding_iso_2022 (struct coding_system *coding,
2957                         struct coding_detection_info *detect_info)
2958 {
2959   const unsigned char *src = coding->source, *src_base = src;
2960   const unsigned char *src_end = coding->source + coding->src_bytes;
2961   bool multibytep = coding->src_multibyte;
2962   bool single_shifting = 0;
2963   int id;
2964   int c, c1;
2965   ptrdiff_t consumed_chars = 0;
2966   int i;
2967   int rejected = 0;
2968   int found = 0;
       /* -1 outside a composition; otherwise the number of characters
          counted since the composition start sequence.  */
2969   int composition_count = -1;
2970
2971   detect_info->checked |= CATEGORY_MASK_ISO;
2972
       /* For each category from coding_category_iso_7 through
          coding_category_iso_8_else whose coding system id is not
          negative, load max_charset_id and safe_charsets from the
          safe-charsets string in its attributes; SAFE_CHARSET_P reads
          them below.  */
2973   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2974     {
2975       struct coding_system *this = &(coding_categories[i]);
2976       Lisp_Object attrs, val;
2977
2978       if (this->id < 0)
2979         continue;
2980       attrs = CODING_ID_ATTRS (this->id);
2981       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2982           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2983         setup_iso_safe_charsets (attrs);
2984       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2985       this->max_charset_id = SCHARS (val) - 1;
2986       this->safe_charsets = SDATA (val);
2987     }
2988
2989   /* A coding system of this category is always ASCII compatible.  */
2990   src += coding->head_ascii;
2991
2992   while (rejected != CATEGORY_MASK_ISO)
2993     {
2994       src_base = src;
2995       ONE_MORE_BYTE (c);
2996       switch (c)
2997         {
2998         case ISO_CODE_ESC:
2999           if (inhibit_iso_escape_detection)
3000             break;
3001           single_shifting = 0;
3002           ONE_MORE_BYTE (c);
3003           if (c == 'N' || c == 'O')
3004             {
3005               /* ESC <Fe> for SS2 or SS3.  */
3006               single_shifting = 1;
3007               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3008             }
3009           else if (c == '1')
3010             {
3011               /* End of composition.  */
3012               if (composition_count < 0
3013                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3014                 /* Invalid */
3015                 break;
3016               composition_count = -1;
3017               found |= CATEGORY_MASK_ISO;
3018             }
3019           else if (c >= '0' && c <= '4')
3020             {
3021               /* ESC <Fp> for start/end composition.  */
                   /* '1' was handled above, so this is one of the start
                      sequences: begin counting composed characters.  */
3022               composition_count = 0;
3023             }
3024           else
3025             {
3026               if (c >= '(' && c <= '/')
3027                 {
3028                   /* Designation sequence for a charset of dimension 1.  */
                       /* C is the intermediate byte.  In the designation
                          table of the ISO2022 note above, ',' through '/'
                          designate CHARS96 sets, which is what the
                          `c >= ','' index selects.  */
3029                   ONE_MORE_BYTE (c1);
3030                   if (c1 < ' ' || c1 >= 0x80
3031                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3032                     {
3033                       /* Invalid designation sequence.  Just ignore.  */
3034                       if (c1 >= 0x80)
3035                         rejected |= (CATEGORY_MASK_ISO_7BIT
3036                                      | CATEGORY_MASK_ISO_7_ELSE);
3037                       break;
3038                     }
3039                 }
3040               else if (c == '$')
3041                 {
3042                   /* Designation sequence for a charset of dimension 2.  */
3043                   ONE_MORE_BYTE (c);
3044                   if (c >= '@' && c <= 'B')
3045                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other two lookups, ID is
                            not tested for a negative value here before it
                            reaches SAFE_CHARSET_P below.  */
3046                     id = iso_charset_table[1][0][c];
3047                   else if (c >= '(' && c <= '/')
3048                     {
3049                       ONE_MORE_BYTE (c1);
3050                       if (c1 < ' ' || c1 >= 0x80
3051                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3052                         {
3053                           /* Invalid designation sequence.  Just ignore.  */
3054                           if (c1 >= 0x80)
3055                             rejected |= (CATEGORY_MASK_ISO_7BIT
3056                                          | CATEGORY_MASK_ISO_7_ELSE);
3057                           break;
3058                         }
3059                     }
3060                   else
3061                     {
3062                       /* Invalid designation sequence.  Just ignore it.  */
3063                       if (c >= 0x80)
3064                         rejected |= (CATEGORY_MASK_ISO_7BIT
3065                                      | CATEGORY_MASK_ISO_7_ELSE);
3066                       break;
3067                     }
3068                 }
3069               else
3070                 {
3071                   /* Invalid escape sequence.  Just ignore it.  */
3072                   if (c >= 0x80)
3073                     rejected |= (CATEGORY_MASK_ISO_7BIT
3074                                  | CATEGORY_MASK_ISO_7_ELSE);
3075                   break;
3076                 }
3077
3078               /* We found a valid designation sequence for CHARSET.  */
                   /* That rules out the categories in
                      CATEGORY_MASK_ISO_8BIT.  Each of the four categories
                      tested below is found if the charset is safe for its
                      coding system and rejected otherwise.  */
3079               rejected |= CATEGORY_MASK_ISO_8BIT;
3080               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3081                                   id))
3082                 found |= CATEGORY_MASK_ISO_7;
3083               else
3084                 rejected |= CATEGORY_MASK_ISO_7;
3085               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3086                                   id))
3087                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3088               else
3089                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3090               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3091                                   id))
3092                 found |= CATEGORY_MASK_ISO_7_ELSE;
3093               else
3094                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3095               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3096                                   id))
3097                 found |= CATEGORY_MASK_ISO_8_ELSE;
3098               else
3099                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3100             }
3101           break;
3102
3103         case ISO_CODE_SO:
3104         case ISO_CODE_SI:
3105           /* Locking shift out/in.  */
3106           if (inhibit_iso_escape_detection)
3107             break;
3108           single_shifting = 0;
3109           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3110           break;
3111
3112         case ISO_CODE_CSI:
3113           /* Control sequence introducer.  */
               /* NOTE(review): unlike the ESC, SO/SI and SS2/SS3 cases this
                  one does not test inhibit_iso_escape_detection -- confirm
                  that is intended.  */
3114           single_shifting = 0;
3115           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3116           found |= CATEGORY_MASK_ISO_8_ELSE;
3117           goto check_extra_latin;
3118
3119         case ISO_CODE_SS2:
3120         case ISO_CODE_SS3:
3121           /* Single shift.   */
3122           if (inhibit_iso_escape_detection)
3123             break;
3124           single_shifting = 0;
3125           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3126           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3127               & CODING_ISO_FLAG_SINGLE_SHIFT)
3128             {
3129               found |= CATEGORY_MASK_ISO_8_1;
3130               single_shifting = 1;
3131             }
3132           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3133               & CODING_ISO_FLAG_SINGLE_SHIFT)
3134             {
3135               found |= CATEGORY_MASK_ISO_8_2;
3136               single_shifting = 1;
3137             }
               /* If neither 8-bit category uses single shift, the byte is
                  judged as a possible extra Latin code instead.  */
3138           if (single_shifting)
3139             break;
3140           goto check_extra_latin;
3141
3142         default:
               /* NOTE(review): a negative C presumably comes from
                  ONE_MORE_BYTE for something other than a plain byte --
                  confirm.  It is skipped without changing the verdict.  */
3143           if (c < 0)
3144             continue;
3145           if (c < 0x80)
3146             {
3147               if (composition_count >= 0)
3148                 composition_count++;
3149               single_shifting = 0;
3150               break;
3151             }
3152           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3153           if (c >= 0xA0)
3154             {
3155               found |= CATEGORY_MASK_ISO_8_1;
3156               /* Check the length of succeeding codes of the range
3157                  0xA0..0xFF.  If the byte length is even, we include
3158                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3159                  only when we are not single shifting.  */
3160               if (! single_shifting
3161                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3162                 {
3163                   ptrdiff_t len = 1;
3164                   while (src < src_end)
3165                     {
3166                       src_base = src;
3167                       ONE_MORE_BYTE (c);
3168                       if (c < 0xA0)
3169                         {
                               /* Not part of the run: un-read it so the
                                  outer loop sees it again.  */
3170                           src = src_base;
3171                           break;
3172                         }
3173                       len++;
3174                     }
3175
                       /* An odd-length run that stopped before the end of
                          the source rejects CATEGORY_MASK_ISO_8_2.  A run
                          that reaches the end of the source is not
                          rejected, whatever its length.  */
3176                   if (len & 1 && src < src_end)
3177                     {
3178                       rejected |= CATEGORY_MASK_ISO_8_2;
3179                       if (composition_count >= 0)
3180                         composition_count += len;
3181                     }
3182                   else
3183                     {
3184                       found |= CATEGORY_MASK_ISO_8_2;
3185                       if (composition_count >= 0)
3186                         composition_count += len / 2;
3187                     }
3188                 }
3189               break;
3190             }
               /* Reached by falling through for bytes 0x80..0x9F, and by
                  goto from the CSI and SS2/SS3 cases.  Such a byte is
                  acceptable only if Vlatin_extra_code_table is a vector
                  with a non-nil entry for it; otherwise every ISO category
                  is rejected.  */
3191         check_extra_latin:
3192           if (! VECTORP (Vlatin_extra_code_table)
3193               || NILP (AREF (Vlatin_extra_code_table, c)))
3194             {
3195               rejected = CATEGORY_MASK_ISO;
3196               break;
3197             }
3198           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199               & CODING_ISO_FLAG_LATIN_EXTRA)
3200             found |= CATEGORY_MASK_ISO_8_1;
3201           else
3202             rejected |= CATEGORY_MASK_ISO_8_1;
3203           rejected |= CATEGORY_MASK_ISO_8_2;
3204           break;
3205         }
3206     }
       /* The loop ended because every ISO category was rejected.  */
3207   detect_info->rejected |= CATEGORY_MASK_ISO;
3208   return 0;
3209
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does so when the source is exhausted --
          confirm.  */
3210  no_more_source:
3211   detect_info->rejected |= rejected;
3212   detect_info->found |= (found & ~rejected);
3213   return 1;
3214 }
3215
3216
3217 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3218    escape sequence should be kept.

        REG is the designation register to update; DIM, CHARS_96 and FINAL
        (the final byte of the designation sequence) are the keys passed to
        ISO_CHARSET_TABLE to find the charset id.  CHARS_96 must be an
        lvalue, because it doubles as the "keep this sequence" result, and
        the expansion uses the variable `coding' of the expanding scope.

        When FINAL is outside '0'..127, the table yields a negative id, or
        the charset fails SAFE_CHARSET_P, the designation of REG is marked
        -2, CHARS_96 becomes -1, and nothing else is done.  Otherwise
        charset_jisx0201_roman is replaced by charset_ascii under
        CODING_ISO_FLAG_USE_ROMAN, and charset_jisx0208_1978 by
        charset_jisx0208 under CODING_ISO_FLAG_USE_OLDJIS, before the id is
        stored as the designation of REG.  */
3219 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3220   do {                                                                  \
3221     int id, prev;                                                       \
3222                                                                         \
3223     if (final < '0' || final >= 128                                     \
3224         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3225         || !SAFE_CHARSET_P (coding, id))                                \
3226       {                                                                 \
3227         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3228         chars_96 = -1;                                                  \
3229         break;                                                          \
3230       }                                                                 \
3231     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3232     if (id == charset_jisx0201_roman)                                   \
3233       {                                                                 \
3234         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3235           id = charset_ascii;                                           \
3236       }                                                                 \
3237     else if (id == charset_jisx0208_1978)                               \
3238       {                                                                 \
3239         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3240           id = charset_jisx0208;                                        \
3241       }                                                                 \
3242     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3243     /* If there was an invalid designation to REG previously, and this  \
3244        designation is ASCII to REG, we should keep this designation     \
3245        sequence.  */                                                    \
3246     if (prev == -2 && id == charset_ascii)                              \
3247       chars_96 = -1;                                                    \
3248   } while (0)
3249
3250
3251 /* Handle these composition sequences (ALT: alternate char):
3252
3253    (1) relative composition: ESC 0 CHAR ... ESC 1
3254    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3257
3258    When the start sequence (ESC 0/2/3/4) is found, this annotation
3259    header is produced.
3260
3261         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3262
3263    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264    produced until the end sequence (ESC 1) is found:
3265
3266    (1) CHAR ... CHAR
3267    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3270
3271    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3272    annotation header are updated as below:
3273
3274    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3276    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3277    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3278
3279    If an error is found while composing, the annotation header is
3280    changed to:
3281
3282         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3283
3284    and the sequence [ -2 DECODED-RULE ] is changed to the original
3285    byte sequence as below:
3286         o the original byte sequence is B: [ B -1 ]
3287         o the original byte sequence is B1 B2: [ B1 B2 ]
3288    and the sequence [ -1 -1 ] is changed to the original byte
3289    sequence:
3290         [ ESC '0' ]
3291 */
3292
3293 /* Decode a composition rule C1 and maybe one more byte from the
3294    source, and set RULE to the encoded composition rule.  If the rule
3295    is invalid, goto invalid_code.

        C1 is taken from the variable `c1' of the expanding scope, which
        must also provide the label `invalid_code' and whatever
        ONE_MORE_BYTE uses.  RULE must be an int lvalue.

        C1 - 32 below 81 is the old one-byte format: it is split into
        GREF * 9 + NREF, and a reference value of 4 is translated to 10
        before encoding.  Any larger value is the new two-byte format: the
        two references are C1 - 32 - 81 and the next source byte minus 32;
        they are checked with COMPOSITION_ENCODE_RULE_VALID, and 0x100 is
        added to the encoded value to mark it as new format.  */
3296
3297 #define DECODE_COMPOSITION_RULE(rule)                                   \
3298   do {                                                                  \
3299     rule = c1 - 32;                                                     \
3300     if (rule < 0)                                                       \
3301       goto invalid_code;                                                \
3302     if (rule < 81)              /* old format (before ver.21) */        \
3303       {                                                                 \
3304         int gref = (rule) / 9;                                          \
3305         int nref = (rule) % 9;                                          \
3306         if (gref == 4) gref = 10;                                       \
3307         if (nref == 4) nref = 10;                                       \
3308         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3309       }                                                                 \
3310     else                        /* new format (after ver.21) */         \
3311       {                                                                 \
3312         int b;                                                          \
3313                                                                         \
3314         ONE_MORE_BYTE (b);                                              \
3315         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3316           goto invalid_code;                                            \
3317         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3318         rule += 0x100;   /* Distinguish it from the old format.  */     \
3319       }                                                                 \
3320   } while (0)
3321
     /* Turn the decoded composition rule RULE back into the code(s) it was
        read from, storing them in charbuf[idx] and charbuf[idx + 1].  This
        reverses what DECODE_COMPOSITION_RULE did.  The expansion uses the
        variables `charbuf' and `idx' of the expanding scope and adds to
        its `new_chars'.

        RULE % 0x100 is split into GREF * 12 + NREF.  If RULE is below
        0x100 (old format), a reference value of 10 is translated back to
        4, the single code 32 + GREF * 9 + NREF is stored followed by -1,
        and new_chars grows by one.  Otherwise (new format) the two codes
        32 + 81 + GREF and 32 + NREF are stored and new_chars grows by two.

        RULE is expanded several times and without parentheses, so pass a
        simple expression free of side effects.  It is read only before the
        two slots are written, so it may itself be charbuf[idx + 1].  */
3322 #define ENCODE_COMPOSITION_RULE(rule)                           \
3323   do {                                                          \
3324     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3325                                                                 \
3326     if (rule < 0x100)           /* old format */                \
3327       {                                                         \
3328         if (gref == 10) gref = 4;                               \
3329         if (nref == 10) nref = 4;                               \
3330         charbuf[idx] = 32 + gref * 9 + nref;                    \
3331         charbuf[idx + 1] = -1;                                  \
3332         new_chars++;                                            \
3333       }                                                         \
3334     else                                /* new format */        \
3335       {                                                         \
3336         charbuf[idx] = 32 + 81 + gref;                          \
3337         charbuf[idx + 1] = 32 + nref;                           \
3338         new_chars += 2;                                         \
3339       }                                                         \
3340   } while (0)
3341
3342 /* Finish the current composition as invalid.

        CHARBUF must point just past the data stored so far for the
        composition, so that the annotation header starts at
        CHARBUF[-CMP_STATUS->length].  The header slots are overwritten
        with ESC, the digit of the start sequence matching
        CMP_STATUS->method, -2, 0 and -1.  When the method is
        COMPOSITION_WITH_RULE or greater, the remaining data up to CHARBUF
        is scanned as well: each -2 marker and the rule after it are
        rewritten by ENCODE_COMPOSITION_RULE, and each -1 marker and the
        slot after it become ESC '0'.

        CMP_STATUS->state is set to COMPOSING_NO.  The value returned is
        CMP_STATUS->nchars plus what the rule rewrites added (one or two
        per rule) plus two for every -1 marker replaced.  */
3343
3344 static int
3345 finish_composition (int *charbuf, struct composition_status *cmp_status)
3346 {
3347   int idx = - cmp_status->length;
3348   int new_chars;
3349
3350   /* Recover the original ESC sequence */
3351   charbuf[idx++] = ISO_CODE_ESC;
3352   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3353                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3354                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3355                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3356                     : '4');
       /* Overwrite the rest of the annotation header.  */
3357   charbuf[idx++] = -2;
3358   charbuf[idx++] = 0;
3359   charbuf[idx++] = -1;
3360   new_chars = cmp_status->nchars;
       /* Only these methods store rule or separator markers in the data.  */
3361   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3362     for (; idx < 0; idx++)
3363       {
3364         int elt = charbuf[idx];
3365
3366         if (elt == -2)
3367           {
                 /* The macro refers to `charbuf', `idx' and `new_chars' by
                    name; it fills both this slot and the next one.  */
3368             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
                 /* Step over the second slot; the loop steps over the first.  */
3369             idx++;
3370           }
3371         else if (elt == -1)
3372           {
                 /* Replace this marker and the following slot by ESC '0'.  */
3373             charbuf[idx++] = ISO_CODE_ESC;
3374             charbuf[idx] = '0';
3375             new_chars += 2;
3376           }
3377       }
3378   cmp_status->state = COMPOSING_NO;
3379   return new_chars;
3380 }
3381
3382 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition (which treats the composition as invalid) and
        add its return value to char_offset.  The expansion uses the
        variables `cmp_status', `charbuf' and `char_offset' of the
        expanding scope.  */
3383 #define MAYBE_FINISH_COMPOSITION()                              \
3384   do {                                                          \
3385     if (cmp_status->state != COMPOSING_NO)                      \
3386       char_offset += finish_composition (charbuf, cmp_status);  \
3387   } while (0)
3388
3389 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3390
3391    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3392    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3393    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3394    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3395
3396    Produce this annotation sequence now:
3397
3398    [ -LENGTH CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        The header is written by ADD_COMPOSITION_DATA (defined elsewhere);
        the slot order shown follows the format comment on composition
        sequences earlier in this file.  cmp_status->length is then set to
        MAX_ANNOTATION_LENGTH, nchars and ncomps are reset to 0, and
        coding->annotated is set.  Any composition still in progress is
        first finished as invalid.  C1 '0' and '2' start in state
        COMPOSING_CHAR, the others in COMPOSING_COMPONENT_CHAR.

        Exception: when C1 is '0' and the component part of an altchar
        composition is being read (state COMPOSING_COMPONENT_CHAR with
        COMPOSITION_WITH_ALTCHARS, or COMPOSING_COMPONENT_RULE with
        COMPOSITION_WITH_RULE_ALTCHARS), no new composition is started.
        Instead [ -1 -1 ] is stored, the state becomes COMPOSING_CHAR and
        cmp_status->length grows by 2.

        The expansion uses `cmp_status', `charbuf', `coding' and (through
        MAYBE_FINISH_COMPOSITION) `char_offset' of the expanding scope.
3399 */
3400
3401 #define DECODE_COMPOSITION_START(c1)                                       \
3402   do {                                                                     \
3403     if (c1 == '0'                                                          \
3404         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3405              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3406             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3407                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3408       {                                                                    \
3409         *charbuf++ = -1;                                                   \
3410         *charbuf++= -1;                                                    \
3411         cmp_status->state = COMPOSING_CHAR;                                \
3412         cmp_status->length += 2;                                           \
3413       }                                                                    \
3414     else                                                                   \
3415       {                                                                    \
3416         MAYBE_FINISH_COMPOSITION ();                                       \
3417         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3418                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3419                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3420                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3421         cmp_status->state                                                  \
3422           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3423         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3424         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3425         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3426         coding->annotated = 1;                                             \
3427       }                                                                    \
3428   } while (0)
3429
3430
3431 /* Handle composition end sequence ESC 1.

        The sequence is rejected -- the composition in progress is finished
        as invalid and control jumps to `invalid_code' -- when no composed
        character has been stored (nchars == 0) or when the state does not
        fit the method: with COMPOSITION_WITH_RULE the state must differ
        from COMPOSING_CHAR, with every other method it must be
        COMPOSING_CHAR.

        Otherwise the annotation header at charbuf[-cmp_status->length] is
        completed.  Its first slot is decreased by ncomps + 2 for
        COMPOSITION_WITH_ALTCHARS, or by ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS.  Its third slot receives nchars.
        char_offset advances by nchars and the state returns to
        COMPOSING_NO.

        The expansion uses `cmp_status', `charbuf' and `char_offset' of the
        expanding scope, which must also provide the label `invalid_code'.  */
3432
3433 #define DECODE_COMPOSITION_END()                                        \
3434   do {                                                                  \
3435     if (cmp_status->nchars == 0                                         \
3436         || ((cmp_status->state == COMPOSING_CHAR)                       \
3437             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3438       {                                                                 \
3439         MAYBE_FINISH_COMPOSITION ();                                    \
3440         goto invalid_code;                                              \
3441       }                                                                 \
3442     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3443       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3444     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3445       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3446     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3447     char_offset += cmp_status->nchars;                                  \
3448     cmp_status->state = COMPOSING_NO;                                   \
3449   } while (0)
3450
3451 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is stored as the pair [ -2 RULE ], cmp_status->length
        grows by 2, and cmp_status->state is decremented.
        NOTE(review): the decrement presumably steps from a rule-expecting
        state back to the matching char-expecting state, which depends on
        the order of the COMPOSING_* values defined elsewhere -- confirm.
        The expansion uses `charbuf' and `cmp_status' of the expanding
        scope.  */
3452
3453 #define STORE_COMPOSITION_RULE(rule)    \
3454   do {                                  \
3455     *charbuf++ = -2;                    \
3456     *charbuf++ = rule;                  \
3457     cmp_status->length += 2;            \
3458     cmp_status->state--;                \
3459   } while (0)
3460
3461 /* Store a composed char or a component char C in charbuf, and update
3462    cmp_status.

        cmp_status->length grows by 1.  The char is counted in nchars when
        the state is COMPOSING_CHAR and in ncomps otherwise.  With
        COMPOSITION_WITH_RULE, or with COMPOSITION_WITH_RULE_ALTCHARS while
        in state COMPOSING_COMPONENT_CHAR, the state is then incremented.
        NOTE(review): the increment presumably moves to the matching
        rule-expecting state, which depends on the order of the COMPOSING_*
        values defined elsewhere -- confirm.
        The expansion uses `charbuf' and `cmp_status' of the expanding
        scope.  */
3463
3464 #define STORE_COMPOSITION_CHAR(c)                                       \
3465   do {                                                                  \
3466     *charbuf++ = (c);                                                   \
3467     cmp_status->length++;                                               \
3468     if (cmp_status->state == COMPOSING_CHAR)                            \
3469       cmp_status->nchars++;                                             \
3470     else                                                                \
3471       cmp_status->ncomps++;                                             \
3472     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3473         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3474             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3475       cmp_status->state++;                                              \
3476   } while (0)
3477
3478
3479 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3480
3481 static void
3482 decode_coding_iso_2022 (struct coding_system *coding)
3483 {
3484   const unsigned char *src = coding->source + coding->consumed;
3485   const unsigned char *src_end = coding->source + coding->src_bytes;
3486   const unsigned char *src_base;
3487   int *charbuf = coding->charbuf + coding->charbuf_used;
3488   /* We may produce two annotations (charset and composition) in one
3489      loop and one more charset annotation at the end.  */
3490   int *charbuf_end
3491     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3492   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3493   bool multibytep = coding->src_multibyte;
3494   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3495   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3496   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3497   int charset_id_2, charset_id_3;
3498   struct charset *charset;
3499   int c;
3500   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3501   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3502   ptrdiff_t char_offset = coding->produced_char;
3503   ptrdiff_t last_offset = char_offset;
3504   int last_id = charset_ascii;
3505   bool eol_dos
3506     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3507   int byte_after_cr = -1;
3508   int i;
3509
3510   setup_iso_safe_charsets (attrs);
3511   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3512
3513   if (cmp_status->state != COMPOSING_NO)
3514     {
3515       if (charbuf_end - charbuf < cmp_status->length)
3516         emacs_abort ();
3517       for (i = 0; i < cmp_status->length; i++)
3518         *charbuf++ = cmp_status->carryover[i];
3519       coding->annotated = 1;
3520     }
3521
3522   while (1)
3523     {
3524       int c1, c2, c3;
3525
3526       src_base = src;
3527       consumed_chars_base = consumed_chars;
3528
3529       if (charbuf >= charbuf_end)
3530         {
3531           if (byte_after_cr >= 0)
3532             src_base--;
3533           break;
3534         }
3535
3536       if (byte_after_cr >= 0)
3537         c1 = byte_after_cr, byte_after_cr = -1;
3538       else
3539         ONE_MORE_BYTE (c1);
3540       if (c1 < 0)
3541         goto invalid_code;
3542
3543       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3544         {
3545           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546           char_offset++;
3547           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3548           continue;
3549         }
3550
3551       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3552         {
3553           if (c1 == ISO_CODE_ESC)
3554             {
3555               if (src + 1 >= src_end)
3556                 goto no_more_source;
3557               *charbuf++ = ISO_CODE_ESC;
3558               char_offset++;
3559               if (src[0] == '%' && src[1] == '@')
3560                 {
3561                   src += 2;
3562                   consumed_chars += 2;
3563                   char_offset += 2;
3564                   /* We are sure charbuf can contain two more chars. */
3565                   *charbuf++ = '%';
3566                   *charbuf++ = '@';
3567                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3568                 }
3569             }
3570           else
3571             {
3572               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3573               char_offset++;
3574             }
3575           continue;
3576         }
3577
3578       if ((cmp_status->state == COMPOSING_RULE
3579            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3580           && c1 != ISO_CODE_ESC)
3581         {
3582           int rule;
3583
3584           DECODE_COMPOSITION_RULE (rule);
3585           STORE_COMPOSITION_RULE (rule);
3586           continue;
3587         }
3588
3589       /* We produce at most one character.  */
3590       switch (iso_code_class [c1])
3591         {
3592         case ISO_0x20_or_0x7F:
3593           if (charset_id_0 < 0
3594               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3595             /* This is SPACE or DEL.  */
3596             charset = CHARSET_FROM_ID (charset_ascii);
3597           else
3598             charset = CHARSET_FROM_ID (charset_id_0);
3599           break;
3600
3601         case ISO_graphic_plane_0:
3602           if (charset_id_0 < 0)
3603             charset = CHARSET_FROM_ID (charset_ascii);
3604           else
3605             charset = CHARSET_FROM_ID (charset_id_0);
3606           break;
3607
3608         case ISO_0xA0_or_0xFF:
3609           if (charset_id_1 < 0
3610               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3611               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3612             goto invalid_code;
3613           /* This is a graphic character, we fall down ... */
3614
3615         case ISO_graphic_plane_1:
3616           if (charset_id_1 < 0)
3617             goto invalid_code;
3618           charset = CHARSET_FROM_ID (charset_id_1);
3619           break;
3620
3621         case ISO_control_0:
3622           if (eol_dos && c1 == '\r')
3623             ONE_MORE_BYTE (byte_after_cr);
3624           MAYBE_FINISH_COMPOSITION ();
3625           charset = CHARSET_FROM_ID (charset_ascii);
3626           break;
3627
3628         case ISO_control_1:
3629           goto invalid_code;
3630
3631         case ISO_shift_out:
3632           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3633               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3634             goto invalid_code;
3635           CODING_ISO_INVOCATION (coding, 0) = 1;
3636           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3637           continue;
3638
3639         case ISO_shift_in:
3640           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3641             goto invalid_code;
3642           CODING_ISO_INVOCATION (coding, 0) = 0;
3643           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644           continue;
3645
3646         case ISO_single_shift_2_7:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3648             goto invalid_code;
3649         case ISO_single_shift_2:
3650           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3651             goto invalid_code;
3652           /* SS2 is handled as an escape sequence of ESC 'N' */
3653           c1 = 'N';
3654           goto label_escape_sequence;
3655
3656         case ISO_single_shift_3:
3657           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3658             goto invalid_code;
3659           /* SS2 is handled as an escape sequence of ESC 'O' */
3660           c1 = 'O';
3661           goto label_escape_sequence;
3662
3663         case ISO_control_sequence_introducer:
3664           /* CSI is handled as an escape sequence of ESC '[' ...  */
3665           c1 = '[';
3666           goto label_escape_sequence;
3667
3668         case ISO_escape:
3669           ONE_MORE_BYTE (c1);
3670         label_escape_sequence:
3671           /* Escape sequences handled here are invocation,
3672              designation, direction specification, and character
3673              composition specification.  */
3674           switch (c1)
3675             {
3676             case '&':           /* revision of following character set */
3677               ONE_MORE_BYTE (c1);
3678               if (!(c1 >= '@' && c1 <= '~'))
3679                 goto invalid_code;
3680               ONE_MORE_BYTE (c1);
3681               if (c1 != ISO_CODE_ESC)
3682                 goto invalid_code;
3683               ONE_MORE_BYTE (c1);
3684               goto label_escape_sequence;
3685
3686             case '$':           /* designation of 2-byte character set */
3687               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3688                 goto invalid_code;
3689               {
3690                 int reg, chars96;
3691
3692                 ONE_MORE_BYTE (c1);
3693                 if (c1 >= '@' && c1 <= 'B')
3694                   {     /* designation of JISX0208.1978, GB2312.1980,
3695                            or JISX0208.1980 */
3696                     reg = 0, chars96 = 0;
3697                   }
3698                 else if (c1 >= 0x28 && c1 <= 0x2B)
3699                   { /* designation of DIMENSION2_CHARS94 character set */
3700                     reg = c1 - 0x28, chars96 = 0;
3701                     ONE_MORE_BYTE (c1);
3702                   }
3703                 else if (c1 >= 0x2C && c1 <= 0x2F)
3704                   { /* designation of DIMENSION2_CHARS96 character set */
3705                     reg = c1 - 0x2C, chars96 = 1;
3706                     ONE_MORE_BYTE (c1);
3707                   }
3708                 else
3709                   goto invalid_code;
3710                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3711                 /* We must update these variables now.  */
3712                 if (reg == 0)
3713                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3714                 else if (reg == 1)
3715                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3716                 if (chars96 < 0)
3717                   goto invalid_code;
3718               }
3719               continue;
3720
3721             case 'n':           /* invocation of locking-shift-2 */
3722               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3723                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3724                 goto invalid_code;
3725               CODING_ISO_INVOCATION (coding, 0) = 2;
3726               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3727               continue;
3728
3729             case 'o':           /* invocation of locking-shift-3 */
3730               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3731                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3732                 goto invalid_code;
3733               CODING_ISO_INVOCATION (coding, 0) = 3;
3734               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3735               continue;
3736
3737             case 'N':           /* invocation of single-shift-2 */
3738               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3739                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3740                 goto invalid_code;
3741               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3742               if (charset_id_2 < 0)
3743                 charset = CHARSET_FROM_ID (charset_ascii);
3744               else
3745                 charset = CHARSET_FROM_ID (charset_id_2);
3746               ONE_MORE_BYTE (c1);
3747               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3748                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3749                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3750                           ? c1 >= 0x80 : c1 < 0x80)))
3751                 goto invalid_code;
3752               break;
3753
3754             case 'O':           /* invocation of single-shift-3 */
3755               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3756                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3757                 goto invalid_code;
3758               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3759               if (charset_id_3 < 0)
3760                 charset = CHARSET_FROM_ID (charset_ascii);
3761               else
3762                 charset = CHARSET_FROM_ID (charset_id_3);
3763               ONE_MORE_BYTE (c1);
3764               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3765                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3766                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3767                           ? c1 >= 0x80 : c1 < 0x80)))
3768                 goto invalid_code;
3769               break;
3770
3771             case '0': case '2': case '3': case '4': /* start composition */
3772               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3773                 goto invalid_code;
3774               if (last_id != charset_ascii)
3775                 {
3776                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3777                   last_id = charset_ascii;
3778                   last_offset = char_offset;
3779                 }
3780               DECODE_COMPOSITION_START (c1);
3781               continue;
3782
3783             case '1':           /* end composition */
3784               if (cmp_status->state == COMPOSING_NO)
3785                 goto invalid_code;
3786               DECODE_COMPOSITION_END ();
3787               continue;
3788
3789             case '[':           /* specification of direction */
3790               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3791                 goto invalid_code;
3792               /* For the moment, nested direction is not supported.
3793                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3794                  left-to-right, and nonzero means right-to-left.  */
3795               ONE_MORE_BYTE (c1);
3796               switch (c1)
3797                 {
3798                 case ']':       /* end of the current direction */
3799                   coding->mode &= ~CODING_MODE_DIRECTION;
3800
3801                 case '0':       /* end of the current direction */
3802                 case '1':       /* start of left-to-right direction */
3803                   ONE_MORE_BYTE (c1);
3804                   if (c1 == ']')
3805                     coding->mode &= ~CODING_MODE_DIRECTION;
3806                   else
3807                     goto invalid_code;
3808                   break;
3809
3810                 case '2':       /* start of right-to-left direction */
3811                   ONE_MORE_BYTE (c1);
3812                   if (c1 == ']')
3813                     coding->mode |= CODING_MODE_DIRECTION;
3814                   else
3815                     goto invalid_code;
3816                   break;
3817
3818                 default:
3819                   goto invalid_code;
3820                 }
3821               continue;
3822
3823             case '%':
3824               ONE_MORE_BYTE (c1);
3825               if (c1 == '/')
3826                 {
3827                   /* CTEXT extended segment:
3828                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3829                      We keep these bytes as is for the moment.
3830                      They may be decoded by post-read-conversion.  */
3831                   int dim, M, L;
3832                   int size;
3833
3834                   ONE_MORE_BYTE (dim);
3835                   if (dim < '0' || dim > '4')
3836                     goto invalid_code;
3837                   ONE_MORE_BYTE (M);
3838                   if (M < 128)
3839                     goto invalid_code;
3840                   ONE_MORE_BYTE (L);
3841                   if (L < 128)
3842                     goto invalid_code;
3843                   size = ((M - 128) * 128) + (L - 128);
3844                   if (charbuf + 6 > charbuf_end)
3845                     goto break_loop;
3846                   *charbuf++ = ISO_CODE_ESC;
3847                   *charbuf++ = '%';
3848                   *charbuf++ = '/';
3849                   *charbuf++ = dim;
3850                   *charbuf++ = BYTE8_TO_CHAR (M);
3851                   *charbuf++ = BYTE8_TO_CHAR (L);
3852                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3853                 }
3854               else if (c1 == 'G')
3855                 {
3856                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3857                      ESC % G --UTF-8-BYTES-- ESC % @
3858                      We keep these bytes as is for the moment.
3859                      They may be decoded by post-read-conversion.  */
3860                   if (charbuf + 3 > charbuf_end)
3861                     goto break_loop;
3862                   *charbuf++ = ISO_CODE_ESC;
3863                   *charbuf++ = '%';
3864                   *charbuf++ = 'G';
3865                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3866                 }
3867               else
3868                 goto invalid_code;
3869               continue;
3870               break;
3871
3872             default:
3873               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3874                 goto invalid_code;
3875               {
3876                 int reg, chars96;
3877
3878                 if (c1 >= 0x28 && c1 <= 0x2B)
3879                   { /* designation of DIMENSION1_CHARS94 character set */
3880                     reg = c1 - 0x28, chars96 = 0;
3881                     ONE_MORE_BYTE (c1);
3882                   }
3883                 else if (c1 >= 0x2C && c1 <= 0x2F)
3884                   { /* designation of DIMENSION1_CHARS96 character set */
3885                     reg = c1 - 0x2C, chars96 = 1;
3886                     ONE_MORE_BYTE (c1);
3887                   }
3888                 else
3889                   goto invalid_code;
3890                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3891                 /* We must update these variables now.  */
3892                 if (reg == 0)
3893                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3894                 else if (reg == 1)
3895                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3896                 if (chars96 < 0)
3897                   goto invalid_code;
3898               }
3899               continue;
3900             }
3901           break;
3902
3903         default:
3904           emacs_abort ();
3905         }
3906
3907       if (cmp_status->state == COMPOSING_NO
3908           && charset->id != charset_ascii
3909           && last_id != charset->id)
3910         {
3911           if (last_id != charset_ascii)
3912             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3913           last_id = charset->id;
3914           last_offset = char_offset;
3915         }
3916
3917       /* Now we know CHARSET and 1st position code C1 of a character.
3918          Produce a decoded character while getting 2nd and 3rd
3919          position codes C2, C3 if necessary.  */
3920       if (CHARSET_DIMENSION (charset) > 1)
3921         {
3922           ONE_MORE_BYTE (c2);
3923           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3924               || ((c1 & 0x80) != (c2 & 0x80)))
3925             /* C2 is not in a valid range.  */
3926             goto invalid_code;
3927           if (CHARSET_DIMENSION (charset) == 2)
3928             c1 = (c1 << 8) | c2;
3929           else
3930             {
3931               ONE_MORE_BYTE (c3);
3932               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3933                   || ((c1 & 0x80) != (c3 & 0x80)))
3934                 /* C3 is not in a valid range.  */
3935                 goto invalid_code;
3936               c1 = (c1 << 16) | (c2 << 8) | c2;
3937             }
3938         }
3939       c1 &= 0x7F7F7F;
3940       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3941       if (c < 0)
3942         {
3943           MAYBE_FINISH_COMPOSITION ();
3944           for (; src_base < src; src_base++, char_offset++)
3945             {
3946               if (ASCII_CHAR_P (*src_base))
3947                 *charbuf++ = *src_base;
3948               else
3949                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3950             }
3951         }
3952       else if (cmp_status->state == COMPOSING_NO)
3953         {
3954           *charbuf++ = c;
3955           char_offset++;
3956         }
3957       else if ((cmp_status->state == COMPOSING_CHAR
3958                 ? cmp_status->nchars
3959                 : cmp_status->ncomps)
3960                >= MAX_COMPOSITION_COMPONENTS)
3961         {
3962           /* Too long composition.  */
3963           MAYBE_FINISH_COMPOSITION ();
3964           *charbuf++ = c;
3965           char_offset++;
3966         }
3967       else
3968         STORE_COMPOSITION_CHAR (c);
3969       continue;
3970
3971     invalid_code:
3972       MAYBE_FINISH_COMPOSITION ();
3973       src = src_base;
3974       consumed_chars = consumed_chars_base;
3975       ONE_MORE_BYTE (c);
3976       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3977       char_offset++;
3978       /* Reset the invocation and designation status to the safest
3979          one; i.e. designate ASCII to the graphic register 0, and
3980          invoke that register to the graphic plane 0.  This typically
3981          helps the case that an designation sequence for ASCII "ESC (
3982          B" is somehow broken (e.g. broken by a newline).  */
3983       CODING_ISO_INVOCATION (coding, 0) = 0;
3984       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3985       charset_id_0 = charset_ascii;
3986       continue;
3987
3988     break_loop:
3989       break;
3990     }
3991
3992  no_more_source:
3993   if (cmp_status->state != COMPOSING_NO)
3994     {
3995       if (coding->mode & CODING_MODE_LAST_BLOCK)
3996         MAYBE_FINISH_COMPOSITION ();
3997       else
3998         {
3999           charbuf -= cmp_status->length;
4000           for (i = 0; i < cmp_status->length; i++)
4001             cmp_status->carryover[i] = charbuf[i];
4002         }
4003     }
4004   else if (last_id != charset_ascii)
4005     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4006   coding->consumed_char += consumed_chars_base;
4007   coding->consumed = src_base - coding->source;
4008   coding->charbuf_used = charbuf - coding->charbuf;
4009 }
4010
4011
4012 /* ISO2022 encoding stuff.  */
4013
/*
   Just saying "ISO2022" is not enough when encoding; we have to
   specify more details.  In Emacs, each coding system of the ISO2022
   variant has the following specifications:
        1. Initial designation to G0 thru G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use Single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JIS0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
   defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
   details.
*/
4032
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.

   If CODING has CODING_ISO_FLAG_REVISION set and CHARSET's revision
   number is non-negative, "ESC & <'@' + revision>" is emitted first.
   A one-byte charset is designated by "ESC <I> <F>"; a multi-byte
   charset by "ESC $ <I> <F>", where <I> is the intermediate character
   selected by REG and <F> is the final character of CHARSET.  For a
   multi-byte 94-charset designated to register 0 whose final
   character is '@', 'A', or 'B', <I> is omitted (the short form)
   unless CODING_ISO_FLAG_LONG_FORM is set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *inter94 = "()*+";                                       \
    const char *inter96 = ",-./";                                       \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int rev = -1;                                                       \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      rev = CHARSET_ISO_REVISION (charset);                             \
    if (rev >= 0)                                                       \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev);                                      \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (inter96[reg]);                           \
        else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM  \
                 || reg != 0                                            \
                 || final_byte < '@' || final_byte > 'B')               \
          EMIT_ONE_ASCII_BYTE (inter94[reg]);                           \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int inter;                                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          inter = inter96[reg];                                         \
        else                                                            \
          inter = inter94[reg];                                         \
        EMIT_ONE_ASCII_BYTE (inter);                                    \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4080
4081
4082 /* The following two macros produce codes (control character or escape
4083    sequence) for ISO2022 single-shift functions (single-shift-2 and
4084    single-shift-3).  */
4085
/* Single-shift-2: the ISO_CODE_SS2 control byte, or "ESC N" when
   CODING is restricted to seven bits.  Sets the single-shifting
   state of CODING.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4094
4095
/* Single-shift-3: the ISO_CODE_SS3 control byte, or "ESC O" when
   CODING is restricted to seven bits.  Sets the single-shifting
   state of CODING.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4104
4105
4106 /* The following four macros produce codes (control character or
4107    escape sequence) for ISO2022 locking-shift functions (shift-in,
4108    shift-out, locking-shift-2, and locking-shift-3).  */
4109
/* Shift-in: emit ISO_CODE_SI and record that graphic register 0 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                         \
  do                                            \
    {                                           \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);        \
      CODING_ISO_INVOCATION (coding, 0) = 0;    \
    }                                           \
  while (0)
4115
4116
/* Shift-out: emit ISO_CODE_SO and record that graphic register 1 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                        \
  do                                            \
    {                                           \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);        \
      CODING_ISO_INVOCATION (coding, 0) = 1;    \
    }                                           \
  while (0)
4122
4123
/* Locking-shift-2: emit "ESC n" and record that graphic register 2
   is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                  \
  do                                            \
    {                                           \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
      CODING_ISO_INVOCATION (coding, 0) = 2;    \
    }                                           \
  while (0)
4129
4130
/* Locking-shift-3: emit "ESC o" and record that graphic register 3
   is now invoked to graphic plane 0.  The final byte must be 'o';
   "ESC n" is locking-shift-2 and would make a decoder invoke
   register 2 instead.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4136
4137
/* Emit the position code C1 of a character belonging to the
   one-dimensional charset CHARSET.  If CHARSET is not currently
   invoked to graphic plane 0 or 1, the necessary designation and
   invocation sequences are emitted first.

   When CODING has CODING_ISO_FLAG_USE_ROMAN set, ASCII is replaced
   by JISX0201-Roman; in that case CHARSET itself is reassigned, so
   it must be an lvalue.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_ascii                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        cs_id = charset_jisx0201_roman;                                 \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift is pending; it applies to this one           \
           character only, so clear it after emitting.  */              \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_ONE_BYTE (c1 | 0x80);                                    \
        else                                                            \
          EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        /* Invoked to graphic plane 0: emit with the high bit off.  */  \
        EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        /* Invoked to graphic plane 1: emit with the high bit on.  */   \
        EMIT_ONE_BYTE (c1 | 0x80);                                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke     \
       it (designating it to a graphic register first if needed),      \
       then go around the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4180
4181
/* Emit the two position codes C1 and C2 of a character belonging to
   the two-dimensional charset CHARSET.  If CHARSET is not currently
   invoked to graphic plane 0 or 1, the necessary designation and
   invocation sequences are emitted first.

   When CODING has CODING_ISO_FLAG_USE_OLDJIS set, JISX0208 is
   replaced by JISX0208-1978; in that case CHARSET itself is
   reassigned, so it must be an lvalue.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift is pending; it applies to this one           \
           character only, so clear it after emitting.  */              \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        /* Invoked to graphic plane 0: emit with the high bits off.  */ \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        /* Invoked to graphic plane 1: emit with the high bits on.  */  \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke     \
       it (designating it to a graphic register first if needed),      \
       then go around the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4224
4225
/* Encode character C in CHARSET: obtain its code point with
   CODING_ENCODE_CHAR, then emit it as one position code if CHARSET
   is one-dimensional, or otherwise as two (the code point shifted
   right by 8, and its low 8 bits).  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned pos_code;                                                     \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), pos_code);   \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), pos_code >> 8,           \
                                       pos_code & 0xFF);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), pos_code);               \
  } while (0)
4236
4237
4238 /* Produce designation and invocation codes at a place pointed by DST
4239    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4240    Return new DST.  */
4241
4242 static unsigned char *
4243 encode_invocation_designation (struct charset *charset,
4244                                struct coding_system *coding,
4245                                unsigned char *dst, ptrdiff_t *p_nchars)
4246 {
4247   bool multibytep = coding->dst_multibyte;
4248   ptrdiff_t produced_chars = *p_nchars;
4249   int reg;                      /* graphic register number */
4250   int id = CHARSET_ID (charset);
4251
4252   /* At first, check designations.  */
4253   for (reg = 0; reg < 4; reg++)
4254     if (id == CODING_ISO_DESIGNATION (coding, reg))
4255       break;
4256
4257   if (reg >= 4)
4258     {
4259       /* CHARSET is not yet designated to any graphic registers.  */
4260       /* At first check the requested designation.  */
4261       reg = CODING_ISO_REQUEST (coding, id);
4262       if (reg < 0)
4263         /* Since CHARSET requests no special designation, designate it
4264            to graphic register 0.  */
4265         reg = 0;
4266
4267       ENCODE_DESIGNATION (charset, reg, coding);
4268     }
4269
4270   if (CODING_ISO_INVOCATION (coding, 0) != reg
4271       && CODING_ISO_INVOCATION (coding, 1) != reg)
4272     {
4273       /* Since the graphic register REG is not invoked to any graphic
4274          planes, invoke it to graphic plane 0.  */
4275       switch (reg)
4276         {
4277         case 0:                 /* graphic register 0 */
4278           ENCODE_SHIFT_IN;
4279           break;
4280
4281         case 1:                 /* graphic register 1 */
4282           ENCODE_SHIFT_OUT;
4283           break;
4284
4285         case 2:                 /* graphic register 2 */
4286           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4287             ENCODE_SINGLE_SHIFT_2;
4288           else
4289             ENCODE_LOCKING_SHIFT_2;
4290           break;
4291
4292         case 3:                 /* graphic register 3 */
4293           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4294             ENCODE_SINGLE_SHIFT_3;
4295           else
4296             ENCODE_LOCKING_SHIFT_3;
4297           break;
4298
4299         default:
4300           break;
4301         }
4302     }
4303
4304   *p_nchars = produced_chars;
4305   return dst;
4306 }
4307
4308
/* Emit the codes that bring CODING back to its initial ISO 2022
   state: a shift-in if graphic plane 0 does not have register 0
   invoked, then a designation for every graphic register whose
   current charset differs from its initial one.  Registers whose
   initial charset is negative are left untouched.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int greg = 0;                                                       \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (greg < 4)                                                    \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, greg) >= 0                      \
            && (CODING_ISO_INITIAL (coding, greg)                       \
                != CODING_ISO_DESIGNATION (coding, greg)))              \
          {                                                             \
            struct charset *initial_cs                                  \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, greg));    \
            ENCODE_DESIGNATION (initial_cs, greg, coding);              \
          }                                                             \
        greg++;                                                         \
      }                                                                 \
  } while (0)
4327
4328
4329 /* Produce designation sequences of charsets in the line started from
4330    CHARBUF to a place pointed by DST, and return the number of
4331    produced bytes.  DST should not directly point a buffer text area
4332    which may be relocated by char_charset call.
4333
4334    If the current block ends before any end-of-line, we may fail to
4335    find all the necessary designations.  */
4336
4337 static ptrdiff_t
4338 encode_designation_at_bol (struct coding_system *coding,
4339                            int *charbuf, int *charbuf_end,
4340                            unsigned char *dst)
4341 {
4342   unsigned char *orig = dst;
4343   struct charset *charset;
4344   /* Table of charsets to be designated to each graphic register.  */
4345   int r[4];
4346   int c, found = 0, reg;
4347   ptrdiff_t produced_chars = 0;
4348   bool multibytep = coding->dst_multibyte;
4349   Lisp_Object attrs;
4350   Lisp_Object charset_list;
4351
4352   attrs = CODING_ID_ATTRS (coding->id);
4353   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4354   if (EQ (charset_list, Qiso_2022))
4355     charset_list = Viso_2022_charset_list;
4356
4357   for (reg = 0; reg < 4; reg++)
4358     r[reg] = -1;
4359
4360   while (charbuf < charbuf_end && found < 4)
4361     {
4362       int id;
4363
4364       c = *charbuf++;
4365       if (c == '\n')
4366         break;
4367       charset = char_charset (c, charset_list, NULL);
4368       id = CHARSET_ID (charset);
4369       reg = CODING_ISO_REQUEST (coding, id);
4370       if (reg >= 0 && r[reg] < 0)
4371         {
4372           found++;
4373           r[reg] = id;
4374         }
4375     }
4376
4377   if (found)
4378     {
4379       for (reg = 0; reg < 4; reg++)
4380         if (r[reg] >= 0
4381             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4382           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4383     }
4384
4385   return dst - orig;
4386 }
4387
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the characters and annotations in CODING->charbuf (the
   first CODING->charbuf_used elements) as ISO 2022 bytes, storing
   them at CODING->destination starting at offset CODING->produced.
   On exit, CODING->produced, CODING->produced_char, and the
   beginning-of-line state CODING_ISO_BOL (coding) are updated, and
   CODING_RESULT_SUCCESS is recorded.  Always return 0.  */

static bool
encode_coding_iso_2022 (struct coding_system *coding)
{
  /* MULTIBYTEP is not referenced by name below; presumably the EMIT_*
     macros use it.  */
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Amount of room requested through ASSURE_DESTINATION before each
     step.  */
  int safe_room = 16;
  /* True if designation sequences must be emitted before the next
     character.  */
  bool bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  ptrdiff_t produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  bool ascii_compatible;
  int c;
  /* Charset id taken from the most recent charset annotation, or -1
     if there is none or it is not in CHARSET_LIST.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  /* ASCII characters can be emitted directly only if the coding
     system is marked ASCII-compatible and uses neither designation
     nor locking-shift.  */
  ascii_compatible
    = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
       && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
                                          | CODING_ISO_FLAG_LOCKING_SHIFT)));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          /* We have to produce designation sequences if any now.  */
          /* NOTE(review): encode_designation_at_bol may emit up to
             four designations, and ENCODE_DESIGNATION emits up to
             seven bytes each when CODING_ISO_FLAG_REVISION is in
             effect, which would exceed this buffer -- confirm
             whether that combination can occur.  */
          unsigned char desig_buf[16];
          ptrdiff_t nbytes;
          ptrdiff_t offset;

          /* The sequences are built in DESIG_BUF rather than at DST
             because the destination may move if a charset map gets
             loaded during the call; DST and DST_END are adjusted by
             the reported offset in that case.  */
          charset_map_loaded = 0;
          nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
                                              desig_buf);
          if (charset_map_loaded
              && (offset = coding_change_destination (coding)))
            {
              dst += offset;
              dst_end += offset;
            }
          memcpy (dst, desig_buf, nbytes);
          dst += nbytes;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += nbytes;
          bol_designation = 0;
          ASSURE_DESTINATION (safe_room);
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  */
          /* C is the negated length of the annotation; CHARBUF now
             points at the element holding its type.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              /* Remember the annotated charset, but only if this
                 coding system's charset list contains it.  */
              preferred_charset_id = charbuf[2];
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              emacs_abort ();
            }
          /* Skip the rest of the annotation.  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* Control character.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              /* End of line.  */
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  /* Restore the recorded designations to the initial
                     ones without emitting anything.  */
                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              bol_designation = ((CODING_ISO_FLAGS (coding)
                                  & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
                                 != 0);
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              /* ASCII may need to be designated or invoked first.  */
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          if (preferred_charset_id >= 0)
            {
              bool result;

              /* Try the annotated charset first, and fall back to
                 searching CHARSET_LIST if it cannot encode C.  */
              charset = CHARSET_FROM_ID (preferred_charset_id);
              CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
              if (! result)
                CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                     NULL, charset);
            }
          else
            CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                 NULL, charset);
          if (!charset)
            {
              /* No charset was found for C.  Substitute another
                 character for it.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  CODING_CHAR_CHARSET (coding, dst, dst_end, c,
                                       charset_list, NULL, charset);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the state if this coding
     system resets it at each end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4561
4562 \f
/*** 8. SJIS and BIG5 handlers ***/
4564
4565 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4566    quite widely.  So, for the moment, Emacs supports them in the bare
4567    C code.  But, in the future, they may be supported only by CCL.  */
4568
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
4575
4576    --- CODE RANGE of SJIS ---
4577    (character set)      (range)
4578    ASCII                0x00 .. 0x7F
4579    KATAKANA-JISX0201    0xA0 .. 0xDF
4580    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4581             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4582    -------------------------------
4583
4584 */
4585
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.
4589
4590    --- CODE RANGE of BIG5 ---
4591    (character set)      (range)
4592    ASCII                0x00 .. 0x7F
4593    Big5 (1st byte)      0xA1 .. 0xFE
4594         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4595    --------------------------
4596
4597   */
4598
4599 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4600    Return true if a text is encoded in SJIS.  */
4601
4602 static bool
4603 detect_coding_sjis (struct coding_system *coding,
4604                     struct coding_detection_info *detect_info)
4605 {
4606   const unsigned char *src = coding->source, *src_base;
4607   const unsigned char *src_end = coding->source + coding->src_bytes;
4608   bool multibytep = coding->src_multibyte;
4609   ptrdiff_t consumed_chars = 0;
4610   int found = 0;
4611   int c;
4612   Lisp_Object attrs, charset_list;
4613   int max_first_byte_of_2_byte_code;
4614
4615   CODING_GET_INFO (coding, attrs, charset_list);
4616   max_first_byte_of_2_byte_code
4617     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4618
4619   detect_info->checked |= CATEGORY_MASK_SJIS;
4620   /* A coding system of this category is always ASCII compatible.  */
4621   src += coding->head_ascii;
4622
4623   while (1)
4624     {
4625       src_base = src;
4626       ONE_MORE_BYTE (c);
4627       if (c < 0x80)
4628         continue;
4629       if ((c >= 0x81 && c <= 0x9F)
4630           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4631         {
4632           ONE_MORE_BYTE (c);
4633           if (c < 0x40 || c == 0x7F || c > 0xFC)
4634             break;
4635           found = CATEGORY_MASK_SJIS;
4636         }
4637       else if (c >= 0xA0 && c < 0xE0)
4638         found = CATEGORY_MASK_SJIS;
4639       else
4640         break;
4641     }
4642   detect_info->rejected |= CATEGORY_MASK_SJIS;
4643   return 0;
4644
4645  no_more_source:
4646   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4647     {
4648       detect_info->rejected |= CATEGORY_MASK_SJIS;
4649       return 0;
4650     }
4651   detect_info->found |= found;
4652   return 1;
4653 }
4654
4655 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4656    Return true if a text is encoded in BIG5.  */
4657
4658 static bool
4659 detect_coding_big5 (struct coding_system *coding,
4660                     struct coding_detection_info *detect_info)
4661 {
4662   const unsigned char *src = coding->source, *src_base;
4663   const unsigned char *src_end = coding->source + coding->src_bytes;
4664   bool multibytep = coding->src_multibyte;
4665   ptrdiff_t consumed_chars = 0;
4666   int found = 0;
4667   int c;
4668
4669   detect_info->checked |= CATEGORY_MASK_BIG5;
4670   /* A coding system of this category is always ASCII compatible.  */
4671   src += coding->head_ascii;
4672
4673   while (1)
4674     {
4675       src_base = src;
4676       ONE_MORE_BYTE (c);
4677       if (c < 0x80)
4678         continue;
4679       if (c >= 0xA1)
4680         {
4681           ONE_MORE_BYTE (c);
4682           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4683             return 0;
4684           found = CATEGORY_MASK_BIG5;
4685         }
4686       else
4687         break;
4688     }
4689   detect_info->rejected |= CATEGORY_MASK_BIG5;
4690   return 0;
4691
4692  no_more_source:
4693   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4694     {
4695       detect_info->rejected |= CATEGORY_MASK_BIG5;
4696       return 0;
4697     }
4698   detect_info->found |= found;
4699   return 1;
4700 }
4701
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode the SJIS bytes of CODING->source, starting at offset
   CODING->consumed, into characters appended to CODING->charbuf.
   The first three elements of the coding system's charset list are
   used as the roman, kana and kanji charsets, in that order; an
   optional fourth element is a second kanji charset used for lead
   bytes in 0xF0..0xFC.  A byte that does not start a valid code is
   stored on its own via BYTE8_TO_CHAR.  On return,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   reflect what was processed.  */

static void
decode_coding_sjis (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
     LAST_ID describe the start and charset of the current run of
     non-ASCII-charset characters awaiting a charset annotation.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_DOS is set, or -1 when no
     such byte is pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* Pick the charsets from the charset list; the fourth one is
     optional.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember where this code starts so that it can be re-read
	 (invalid code) or left unconsumed (source exhausted).  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
	{
	  /* Output buffer is full.  A pending look-ahead byte was
	     already fetched from SRC, so step SRC_BASE back to report
	     it as not yet consumed.  */
	  if (byte_after_cr >= 0)
	    src_base--;
	  break;
	}

      /* Use the pending look-ahead byte first, if any.  */
      if (byte_after_cr >= 0)
	c = byte_after_cr, byte_after_cr = -1;
      else
	ONE_MORE_BYTE (c);
      if (c < 0)
	goto invalid_code;
      if (c < 0x80)
	{
	  /* With DOS EOL, read the byte following a CR right away and
	     keep it for the next iteration.  */
	  if (eol_dos && c == '\r')
	    ONE_MORE_BYTE (byte_after_cr);
	  charset = charset_roman;
	}
      else if (c == 0x80 || c == 0xA0)
	goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
	{
	  /* SJIS -> JISX0201-Kana */
	  c &= 0x7F;
	  charset = charset_kana;
	}
      else if (c <= 0xEF)
	{
	  /* SJIS -> JISX0208 */
	  /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
	  ONE_MORE_BYTE (c1);
	  if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
	    goto invalid_code;
	  c = (c << 8) | c1;
	  SJIS_TO_JIS (c);
	  charset = charset_kanji;
	}
      else if (c <= 0xFC && charset_kanji2)
	{
	  /* SJIS -> JISX0213-2 */
	  ONE_MORE_BYTE (c1);
	  if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
	    goto invalid_code;
	  c = (c << 8) | c1;
	  SJIS_TO_JIS2 (c);
	  charset = charset_kanji2;
	}
      else
	goto invalid_code;
      /* On a switch to a different non-ASCII charset, annotate the
	 run that just ended (if it was non-ASCII) and start a new
	 one.  */
      if (charset->id != charset_ascii
	  && last_id != charset->id)
	{
	  if (last_id != charset_ascii)
	    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
	  last_id = charset->id;
	  last_offset = char_offset;
	}
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending sequence and emit only
	 its first byte; the following bytes are examined again in the
	 next iteration.  A negative C is stored negated (presumably
	 the value ONE_MORE_BYTE yields for a multibyte character in a
	 multibyte source -- confirm against the macro).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
    }

 no_more_source:
  /* Flush the annotation for the last non-ASCII run, then report
     progress up to the start of the last incomplete code.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4820
/* Decode the BIG5 bytes of CODING->source, starting at offset
   CODING->consumed, into characters appended to CODING->charbuf.
   The first two elements of the coding system's charset list are
   used as the roman and Big5 charsets.  A byte that does not start a
   valid code is stored on its own via BYTE8_TO_CHAR.  On return,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   reflect what was processed.  */

static void
decode_coding_big5 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
     LAST_ID describe the start and charset of the current run of
     non-ASCII-charset characters awaiting a charset annotation.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_DOS is set, or -1 when no
     such byte is pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember where this code starts so that it can be re-read
	 (invalid code) or left unconsumed (source exhausted).  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
	{
	  /* Output buffer is full.  A pending look-ahead byte was
	     already fetched from SRC, so step SRC_BASE back to report
	     it as not yet consumed.  */
	  if (byte_after_cr >= 0)
	    src_base--;
	  break;
	}

      /* Use the pending look-ahead byte first, if any.  */
      if (byte_after_cr >= 0)
	c = byte_after_cr, byte_after_cr = -1;
      else
	ONE_MORE_BYTE (c);

      if (c < 0)
	goto invalid_code;
      if (c < 0x80)
	{
	  /* With DOS EOL, read the byte following a CR right away and
	     keep it for the next iteration.  */
	  if (eol_dos && c == '\r')
	    ONE_MORE_BYTE (byte_after_cr);
	  charset = charset_roman;
	}
      else
	{
	  /* BIG5 -> Big5 */
	  /* First byte must be in 0xA1..0xFE; second byte in
	     0x40..0x7E or 0xA1..0xFE.  */
	  if (c < 0xA1 || c > 0xFE)
	    goto invalid_code;
	  ONE_MORE_BYTE (c1);
	  if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
	    goto invalid_code;
	  c = c << 8 | c1;
	  charset = charset_big5;
	}
      /* On a switch to a different non-ASCII charset, annotate the
	 run that just ended (if it was non-ASCII) and start a new
	 one.  */
      if (charset->id != charset_ascii
	  && last_id != charset->id)
	{
	  if (last_id != charset_ascii)
	    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
	  last_id = charset->id;
	  last_offset = char_offset;
	}
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending sequence and emit only
	 its first byte; the following bytes are examined again in the
	 next iteration.  A negative C is stored negated (presumably
	 the value ONE_MORE_BYTE yields for a multibyte character in a
	 multibyte source -- confirm against the macro).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
    }

 no_more_source:
  /* Flush the annotation for the last non-ASCII run, then report
     progress up to the start of the last incomplete code.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4915
4916 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4917    This function can encode charsets `ascii', `katakana-jisx0201',
4918    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4919    are sure that all these charsets are registered as official charset
4920    (i.e. do not have extended leading-codes).  Characters of other
4921    charsets are produced without any encoding.  */
4922
4923 static bool
4924 encode_coding_sjis (struct coding_system *coding)
4925 {
4926   bool multibytep = coding->dst_multibyte;
4927   int *charbuf = coding->charbuf;
4928   int *charbuf_end = charbuf + coding->charbuf_used;
4929   unsigned char *dst = coding->destination + coding->produced;
4930   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4931   int safe_room = 4;
4932   ptrdiff_t produced_chars = 0;
4933   Lisp_Object attrs, charset_list, val;
4934   bool ascii_compatible;
4935   struct charset *charset_kanji, *charset_kana;
4936   struct charset *charset_kanji2;
4937   int c;
4938
4939   CODING_GET_INFO (coding, attrs, charset_list);
4940   val = XCDR (charset_list);
4941   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4942   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4943   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4944
4945   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4946
4947   while (charbuf < charbuf_end)
4948     {
4949       ASSURE_DESTINATION (safe_room);
4950       c = *charbuf++;
4951       /* Now encode the character C.  */
4952       if (ASCII_CHAR_P (c) && ascii_compatible)
4953         EMIT_ONE_ASCII_BYTE (c);
4954       else if (CHAR_BYTE8_P (c))
4955         {
4956           c = CHAR_TO_BYTE8 (c);
4957           EMIT_ONE_BYTE (c);
4958         }
4959       else
4960         {
4961           unsigned code;
4962           struct charset *charset;
4963           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4964                                &code, charset);
4965
4966           if (!charset)
4967             {
4968               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4969                 {
4970                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4971                   charset = CHARSET_FROM_ID (charset_ascii);
4972                 }
4973               else
4974                 {
4975                   c = coding->default_char;
4976                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4977                                        charset_list, &code, charset);
4978                 }
4979             }
4980           if (code == CHARSET_INVALID_CODE (charset))
4981             emacs_abort ();
4982           if (charset == charset_kanji)
4983             {
4984               int c1, c2;
4985               JIS_TO_SJIS (code);
4986               c1 = code >> 8, c2 = code & 0xFF;
4987               EMIT_TWO_BYTES (c1, c2);
4988             }
4989           else if (charset == charset_kana)
4990             EMIT_ONE_BYTE (code | 0x80);
4991           else if (charset_kanji2 && charset == charset_kanji2)
4992             {
4993               int c1, c2;
4994
4995               c1 = code >> 8;
4996               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4997                   || c1 == 0x28
4998                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4999                 {
5000                   JIS_TO_SJIS2 (code);
5001                   c1 = code >> 8, c2 = code & 0xFF;
5002                   EMIT_TWO_BYTES (c1, c2);
5003                 }
5004               else
5005                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5006             }
5007           else
5008             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5009         }
5010     }
5011   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5012   coding->produced_char += produced_chars;
5013   coding->produced = dst - coding->destination;
5014   return 0;
5015 }
5016
5017 static bool
5018 encode_coding_big5 (struct coding_system *coding)
5019 {
5020   bool multibytep = coding->dst_multibyte;
5021   int *charbuf = coding->charbuf;
5022   int *charbuf_end = charbuf + coding->charbuf_used;
5023   unsigned char *dst = coding->destination + coding->produced;
5024   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5025   int safe_room = 4;
5026   ptrdiff_t produced_chars = 0;
5027   Lisp_Object attrs, charset_list, val;
5028   bool ascii_compatible;
5029   struct charset *charset_big5;
5030   int c;
5031
5032   CODING_GET_INFO (coding, attrs, charset_list);
5033   val = XCDR (charset_list);
5034   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5035   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5036
5037   while (charbuf < charbuf_end)
5038     {
5039       ASSURE_DESTINATION (safe_room);
5040       c = *charbuf++;
5041       /* Now encode the character C.  */
5042       if (ASCII_CHAR_P (c) && ascii_compatible)
5043         EMIT_ONE_ASCII_BYTE (c);
5044       else if (CHAR_BYTE8_P (c))
5045         {
5046           c = CHAR_TO_BYTE8 (c);
5047           EMIT_ONE_BYTE (c);
5048         }
5049       else
5050         {
5051           unsigned code;
5052           struct charset *charset;
5053           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5054                                &code, charset);
5055
5056           if (! charset)
5057             {
5058               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5059                 {
5060                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5061                   charset = CHARSET_FROM_ID (charset_ascii);
5062                 }
5063               else
5064                 {
5065                   c = coding->default_char;
5066                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5067                                        charset_list, &code, charset);
5068                 }
5069             }
5070           if (code == CHARSET_INVALID_CODE (charset))
5071             emacs_abort ();
5072           if (charset == charset_big5)
5073             {
5074               int c1, c2;
5075
5076               c1 = code >> 8, c2 = code & 0xFF;
5077               EMIT_TWO_BYTES (c1, c2);
5078             }
5079           else
5080             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5081         }
5082     }
5083   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5084   coding->produced_char += produced_chars;
5085   coding->produced = dst - coding->destination;
5086   return 0;
5087 }
5088
5089 \f
/*** 9. CCL handlers ***/
5091
5092 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5093    Return true if a text is encoded in a coding system of which
5094    encoder/decoder are written in CCL program.  */
5095
5096 static bool
5097 detect_coding_ccl (struct coding_system *coding,
5098                    struct coding_detection_info *detect_info)
5099 {
5100   const unsigned char *src = coding->source, *src_base;
5101   const unsigned char *src_end = coding->source + coding->src_bytes;
5102   bool multibytep = coding->src_multibyte;
5103   ptrdiff_t consumed_chars = 0;
5104   int found = 0;
5105   unsigned char *valids;
5106   ptrdiff_t head_ascii = coding->head_ascii;
5107   Lisp_Object attrs;
5108
5109   detect_info->checked |= CATEGORY_MASK_CCL;
5110
5111   coding = &coding_categories[coding_category_ccl];
5112   valids = CODING_CCL_VALIDS (coding);
5113   attrs = CODING_ID_ATTRS (coding->id);
5114   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5115     src += head_ascii;
5116
5117   while (1)
5118     {
5119       int c;
5120
5121       src_base = src;
5122       ONE_MORE_BYTE (c);
5123       if (c < 0 || ! valids[c])
5124         break;
5125       if ((valids[c] > 1))
5126         found = CATEGORY_MASK_CCL;
5127     }
5128   detect_info->rejected |= CATEGORY_MASK_CCL;
5129   return 0;
5130
5131  no_more_source:
5132   detect_info->found |= found;
5133   return 1;
5134 }
5135
/* Decode CODING->source, starting at offset CODING->consumed, by
   running the coding system's CCL program, and append the produced
   characters to CODING->charbuf.  The source is fed to ccl_driver in
   batches of at most 1024 elements.  On return the conversion result
   is recorded according to the CCL status, and CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  */

static void
decode_coding_ccl (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0;
  bool multibytep = coding->src_multibyte;
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  /* One batch of input for ccl_driver.  */
  int source_charbuf[1024];
  /* For a multibyte source, SOURCE_BYTEIDX[I] is the byte offset from
     SRC of element I of SOURCE_CHARBUF; one extra slot holds the
     offset just past the last element.  */
  int source_byteidx[1025];
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);

  while (1)
    {
      const unsigned char *p = src;
      ptrdiff_t offset;
      int i = 0;

      /* Fill the batch: characters for a multibyte source, raw bytes
	 otherwise.  */
      if (multibytep)
	{
	  while (i < 1024 && p < src_end)
	    {
	      source_byteidx[i] = p - src;
	      source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
	    }
	  source_byteidx[i] = p - src;
	}
      else
	while (i < 1024 && p < src_end)
	  source_charbuf[i++] = *p++;

      /* Tell the CCL program when this batch reaches the end of the
	 last block.  */
      if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
	ccl->last_block = true;
      /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
      charset_map_loaded = 0;
      ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
		  charset_list);
      /* If a charset map was loaded and the source moved, shift the
	 source pointers by the reported offset.  */
      if (charset_map_loaded
	  && (offset = coding_change_source (coding)))
	{
	  p += offset;
	  src += offset;
	  src_end += offset;
	}
      charbuf += ccl->produced;
      /* Advance SRC past what the CCL program consumed; for a
	 multibyte source the element count is mapped back to a byte
	 offset.  */
      if (multibytep)
	src += source_byteidx[ccl->consumed];
      else
	src += ccl->consumed;
      consumed_chars += ccl->consumed;
      /* Stop when the whole source was loaded into this batch, or
	 when the program stopped for any reason other than running
	 out of input.  */
      if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
	break;
    }

  /* Translate the CCL status into a conversion result.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }
  coding->consumed_char += consumed_chars;
  coding->consumed = src - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5214
5215 static bool
5216 encode_coding_ccl (struct coding_system *coding)
5217 {
5218   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5219   bool multibytep = coding->dst_multibyte;
5220   int *charbuf = coding->charbuf;
5221   int *charbuf_end = charbuf + coding->charbuf_used;
5222   unsigned char *dst = coding->destination + coding->produced;
5223   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5224   int destination_charbuf[1024];
5225   ptrdiff_t produced_chars = 0;
5226   int i;
5227   Lisp_Object attrs, charset_list;
5228
5229   CODING_GET_INFO (coding, attrs, charset_list);
5230   if (coding->consumed_char == coding->src_chars
5231       && coding->mode & CODING_MODE_LAST_BLOCK)
5232     ccl->last_block = true;
5233
5234   do
5235     {
5236       ptrdiff_t offset;
5237
5238       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5239       charset_map_loaded = 0;
5240       ccl_driver (ccl, charbuf, destination_charbuf,
5241                   charbuf_end - charbuf, 1024, charset_list);
5242       if (charset_map_loaded
5243           && (offset = coding_change_destination (coding)))
5244         dst += offset;
5245       if (multibytep)
5246         {
5247           ASSURE_DESTINATION (ccl->produced * 2);
5248           for (i = 0; i < ccl->produced; i++)
5249             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5250         }
5251       else
5252         {
5253           ASSURE_DESTINATION (ccl->produced);
5254           for (i = 0; i < ccl->produced; i++)
5255             *dst++ = destination_charbuf[i] & 0xFF;
5256           produced_chars += ccl->produced;
5257         }
5258       charbuf += ccl->consumed;
5259       if (ccl->status == CCL_STAT_QUIT
5260           || ccl->status == CCL_STAT_INVALID_CMD)
5261         break;
5262     }
5263   while (charbuf < charbuf_end);
5264
5265   switch (ccl->status)
5266     {
5267     case CCL_STAT_SUSPEND_BY_SRC:
5268       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5269       break;
5270     case CCL_STAT_SUSPEND_BY_DST:
5271       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5272       break;
5273     case CCL_STAT_QUIT:
5274     case CCL_STAT_INVALID_CMD:
5275       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5276       break;
5277     default:
5278       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5279       break;
5280     }
5281
5282   coding->produced_char += produced_chars;
5283   coding->produced = dst - coding->destination;
5284   return 0;
5285 }
5286
5287 \f
5288 /*** 10, 11. no-conversion handlers ***/
5289
5290 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5291
5292 static void
5293 decode_coding_raw_text (struct coding_system *coding)
5294 {
5295   bool eol_dos
5296     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5297
5298   coding->chars_at_source = 1;
5299   coding->consumed_char = coding->src_chars;
5300   coding->consumed = coding->src_bytes;
5301   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5302     {
5303       coding->consumed_char--;
5304       coding->consumed--;
5305       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5306     }
5307   else
5308     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5309 }
5310
5311 static bool
5312 encode_coding_raw_text (struct coding_system *coding)
5313 {
5314   bool multibytep = coding->dst_multibyte;
5315   int *charbuf = coding->charbuf;
5316   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5317   unsigned char *dst = coding->destination + coding->produced;
5318   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5319   ptrdiff_t produced_chars = 0;
5320   int c;
5321
5322   if (multibytep)
5323     {
5324       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5325
5326       if (coding->src_multibyte)
5327         while (charbuf < charbuf_end)
5328           {
5329             ASSURE_DESTINATION (safe_room);
5330             c = *charbuf++;
5331             if (ASCII_CHAR_P (c))
5332               EMIT_ONE_ASCII_BYTE (c);
5333             else if (CHAR_BYTE8_P (c))
5334               {
5335                 c = CHAR_TO_BYTE8 (c);
5336                 EMIT_ONE_BYTE (c);
5337               }
5338             else
5339               {
5340                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5341
5342                 CHAR_STRING_ADVANCE (c, p1);
5343                 do
5344                   {
5345                     EMIT_ONE_BYTE (*p0);
5346                     p0++;
5347                   }
5348                 while (p0 < p1);
5349               }
5350           }
5351       else
5352         while (charbuf < charbuf_end)
5353           {
5354             ASSURE_DESTINATION (safe_room);
5355             c = *charbuf++;
5356             EMIT_ONE_BYTE (c);
5357           }
5358     }
5359   else
5360     {
5361       if (coding->src_multibyte)
5362         {
5363           int safe_room = MAX_MULTIBYTE_LENGTH;
5364
5365           while (charbuf < charbuf_end)
5366             {
5367               ASSURE_DESTINATION (safe_room);
5368               c = *charbuf++;
5369               if (ASCII_CHAR_P (c))
5370                 *dst++ = c;
5371               else if (CHAR_BYTE8_P (c))
5372                 *dst++ = CHAR_TO_BYTE8 (c);
5373               else
5374                 CHAR_STRING_ADVANCE (c, dst);
5375             }
5376         }
5377       else
5378         {
5379           ASSURE_DESTINATION (charbuf_end - charbuf);
5380           while (charbuf < charbuf_end && dst < dst_end)
5381             *dst++ = *charbuf++;
5382         }
5383       produced_chars = dst - (coding->destination + coding->produced);
5384     }
5385   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5386   coding->produced_char += produced_chars;
5387   coding->produced = dst - coding->destination;
5388   return 0;
5389 }
5390
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in a charset-based coding system.

   The check is made against the coding system registered for the
   charset category, not against CODING as passed in.  Each leading
   byte is looked up in that coding system's `charset_valids' vector:
   nil rejects the category, an integer names a single charset, and a
   list names several candidate charsets.  For a multi-dimensional
   charset the following bytes must fall inside the charset's code
   space.  Running out of source between the bytes of one code
   rejects the category.  */

static bool
detect_coding_charset (struct coding_system *coding,
		       struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  Lisp_Object attrs, valids, name;
  int found = 0;
  ptrdiff_t head_ascii = coding->head_ascii;
  /* True if bytes 0x80..0x9F must additionally be allowed by
     Vlatin_extra_code_table.  */
  bool check_latin_extra = 0;

  detect_info->checked |= CATEGORY_MASK_CHARSET;

  /* From here on CODING refers to the charset category's coding
     system.  */
  coding = &coding_categories[coding_category_charset];
  attrs = CODING_ID_ATTRS (coding->id);
  valids = AREF (attrs, coding_attr_charset_valids);
  name = CODING_ID_NAME (coding->id);
  /* The extra check applies to coding systems whose name starts with
     "iso-8859-" or "iso-latin-".  */
  if (strncmp (SSDATA (SYMBOL_NAME (name)),
	       "iso-8859-", sizeof ("iso-8859-") - 1) == 0
      || strncmp (SSDATA (SYMBOL_NAME (name)),
		  "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
    check_latin_extra = 1;

  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim, idx;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* A negative C is skipped without being looked up.  */
      if (c < 0)
	continue;
      val = AREF (valids, c);
      if (NILP (val))
	break;
      if (c >= 0x80)
	{
	  /* Reject 0x80..0x9F unless the latin extra code table is a
	     vector that allows this byte.  */
	  if (c < 0xA0
	      && check_latin_extra
	      && (!VECTORP (Vlatin_extra_code_table)
		  || NILP (AREF (Vlatin_extra_code_table, c))))
	    break;
	  found = CATEGORY_MASK_CHARSET;
	}
      if (INTEGERP (val))
	{
	  /* Single candidate charset: check each remaining byte of the
	     code against the charset's code space bounds.  */
	  charset = CHARSET_FROM_ID (XFASTINT (val));
	  dim = CHARSET_DIMENSION (charset);
	  for (idx = 1; idx < dim; idx++)
	    {
	      if (src == src_end)
		goto too_short;
	      ONE_MORE_BYTE (c);
	      if (c < charset->code_space[(dim - 1 - idx) * 4]
		  || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
		break;
	    }
	  /* The inner loop stopped early: a byte was out of range.  */
	  if (idx < dim)
	    break;
	}
      else
	{
	  /* Several candidate charsets: try each in turn.  IDX is not
	     reset between candidates, so bytes already read for one
	     candidate are not read again for the next.  */
	  idx = 1;
	  for (; CONSP (val); val = XCDR (val))
	    {
	      charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
	      dim = CHARSET_DIMENSION (charset);
	      while (idx < dim)
		{
		  if (src == src_end)
		    goto too_short;
		  ONE_MORE_BYTE (c);
		  if (c < charset->code_space[(dim - 1 - idx) * 4]
		      || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
		    break;
		  idx++;
		}
	      if (idx == dim)
		{
		  /* This candidate matched; VAL set to nil marks
		     success for the test below.  */
		  val = Qnil;
		  break;
		}
	    }
	  /* VAL is still a cons only when the candidate loop was left
	     without a match being recorded.  */
	  if (CONSP (val))
	    break;
	}
    }
 too_short:
  detect_info->rejected |= CATEGORY_MASK_CHARSET;
  return 0;

 no_more_source:
  detect_info->found |= found;
  return 1;
}
5496
/* Decode the source bytes of CODING, starting at coding->source +
   coding->consumed, with the charset-based coding system described by
   CODING's attributes, and store the decoded characters in
   coding->charbuf starting at index coding->charbuf_used.  On exit,
   coding->consumed, coding->consumed_char and coding->charbuf_used are
   updated.  A byte sequence that cannot be decoded is stored one byte
   at a time (see the invalid_code label).

   NOTE(review): ONE_MORE_BYTE, CODING_DECODE_CHAR and ADD_CHARSET_DATA
   are macros defined elsewhere in this file.  They appear to use the
   locals of this function (src, src_end, consumed_chars, multibytep,
   ...) by name, and ONE_MORE_BYTE presumably jumps to no_more_source
   when the source is exhausted -- confirm against their definitions
   before renaming or reordering anything here.  */
5497 static void
5498 decode_coding_charset (struct coding_system *coding)
5499 {
5500   const unsigned char *src = coding->source + coding->consumed;
5501   const unsigned char *src_end = coding->source + coding->src_bytes;
5502   const unsigned char *src_base;
5503   int *charbuf = coding->charbuf + coding->charbuf_used;
5504   /* We may produce one charset annotation in one loop and one more at
5505      the end.  */
5506   int *charbuf_end
5507     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5508   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5509   bool multibytep = coding->src_multibyte;
5510   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5511   Lisp_Object valids;
5512   ptrdiff_t char_offset = coding->produced_char;
5513   ptrdiff_t last_offset = char_offset;
5514   int last_id = charset_ascii;
       /* True when EOL conversion is not inhibited and the EOL type of
          this coding system is `dos'.  */
5515   bool eol_dos
5516     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Look-ahead byte read right after a '\r' when EOL_DOS is true;
          -1 when no such byte is pending.  */
5517   int byte_after_cr = -1;
5518
       /* VALIDS is indexed by the first byte of a sequence.  Each element
          is either a charset ID, a list of charset IDs, or something else
          (which marks the byte as invalid).  */
5519   valids = AREF (attrs, coding_attr_charset_valids);
5520
5521   while (1)
5522     {
5523       int c;
5524       Lisp_Object val;
5525       struct charset *charset;
5526       int dim;
5527       int len = 1;
5528       unsigned code;
5529
           /* Remember where this character starts so that we can rewind
              on invalid input, and so that the final `consumed' values
              never include a partially read character.  */
5530       src_base = src;
5531       consumed_chars_base = consumed_chars;
5532
5533       if (charbuf >= charbuf_end)
5534         {
               /* Out of room in CHARBUF.  A pending look-ahead byte has
                  already been read from the source, so back SRC_BASE up
                  by one; coding->consumed is computed from SRC_BASE
                  below.  */
5535           if (byte_after_cr >= 0)
5536             src_base--;
5537           break;
5538         }
5539
5540       if (byte_after_cr >= 0)
5541         {
               /* Use the pending look-ahead byte instead of reading.  */
5542           c = byte_after_cr;
5543           byte_after_cr = -1;
5544         }
5545       else
5546         {
5547           ONE_MORE_BYTE (c);
5548           if (eol_dos && c == '\r')
5549             ONE_MORE_BYTE (byte_after_cr);
5550         }
           /* NOTE(review): a negative C presumably comes from
              ONE_MORE_BYTE on a multibyte source -- confirm against the
              macro definition.  */
5551       if (c < 0)
5552         goto invalid_code;
5553       code = c;
5554
5555       val = AREF (valids, c);
5556       if (! INTEGERP (val) && ! CONSP (val))
5557         goto invalid_code;
5558       if (INTEGERP (val))
5559         {
               /* Exactly one charset can start with this byte.  Collect
                  the remaining bytes of the code point, most significant
                  byte first.  */
5560           charset = CHARSET_FROM_ID (XFASTINT (val));
5561           dim = CHARSET_DIMENSION (charset);
5562           while (len < dim)
5563             {
5564               ONE_MORE_BYTE (c);
5565               code = (code << 8) | c;
5566               len++;
5567             }
5568           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5569                               charset, code, c);
5570         }
5571       else
5572         {
5573           /* VAL is a list of charset IDs.  It is assured that the
5574              list is sorted by charset dimensions (smaller one
5575              comes first).  */
5576           while (CONSP (val))
5577             {
5578               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5579               dim = CHARSET_DIMENSION (charset);
                   /* LEN and CODE carry over between candidates, so a
                      charset of a larger dimension only reads the bytes
                      that the previous candidates did not.  */
5580               while (len < dim)
5581                 {
5582                   ONE_MORE_BYTE (c);
5583                   code = (code << 8) | c;
5584                   len++;
5585                 }
5586               CODING_DECODE_CHAR (coding, src, src_base,
5587                                   src_end, charset, code, c);
                   /* A non-negative C is taken as the decoded character;
                      otherwise try the next charset in the list.  */
5588               if (c >= 0)
5589                 break;
5590               val = XCDR (val);
5591             }
5592         }
5593       if (c < 0)
5594         goto invalid_code;
           /* When the charset changes to a different non-ASCII charset,
              record the run that just ended (unless that run was ASCII)
              and start a new run at the current offset.  */
5595       if (charset->id != charset_ascii
5596           && last_id != charset->id)
5597         {
5598           if (last_id != charset_ascii)
5599             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5600           last_id = charset->id;
5601           last_offset = char_offset;
5602         }
5603
5604       *charbuf++ = c;
5605       char_offset++;
5606       continue;
5607
5608     invalid_code:
           /* Rewind to the start of the offending sequence and consume
              just one byte of it.  A negative value is stored negated, an
              ASCII byte is stored as is, and any other byte is stored as
              the corresponding raw 8-bit character.  */
5609       src = src_base;
5610       consumed_chars = consumed_chars_base;
5611       ONE_MORE_BYTE (c);
5612       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5613       char_offset++;
5614     }
5615
5616  no_more_source:
       /* Record the last pending non-ASCII run, then publish how much of
          the source was consumed (up to the start of the last complete
          character) and how much of CHARBUF is now used.  */
5617   if (last_id != charset_ascii)
5618     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5619   coding->consumed_char += consumed_chars_base;
5620   coding->consumed = src_base - coding->source;
5621   coding->charbuf_used = charbuf - coding->charbuf;
5622 }
5623
/* Encode the characters in coding->charbuf (coding->charbuf_used of
   them) with the charset-based coding system described by CODING's
   attributes, writing the bytes at coding->destination +
   coding->produced.  On exit, coding->produced and
   coding->produced_char are updated and the conversion result is
   recorded as CODING_RESULT_SUCCESS.  Always returns 0.

   NOTE(review): ASSURE_DESTINATION, the EMIT_* macros and
   CODING_CHAR_CHARSET are defined elsewhere in this file and appear to
   use the locals of this function (dst, dst_end, produced_chars,
   multibytep, ...) by name -- confirm against their definitions before
   renaming anything here.  */
5624 static bool
5625 encode_coding_charset (struct coding_system *coding)
5626 {
5627   bool multibytep = coding->dst_multibyte;
5628   int *charbuf = coding->charbuf;
5629   int *charbuf_end = charbuf + coding->charbuf_used;
5630   unsigned char *dst = coding->destination + coding->produced;
5631   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5632   int safe_room = MAX_MULTIBYTE_LENGTH;
5633   ptrdiff_t produced_chars = 0;
5634   Lisp_Object attrs, charset_list;
5635   bool ascii_compatible;
5636   int c;
5637
5638   CODING_GET_INFO (coding, attrs, charset_list);
5639   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5640
5641   while (charbuf < charbuf_end)
5642     {
5643       struct charset *charset;
5644       unsigned code;
5645
           /* NOTE(review): presumably makes sure at least SAFE_ROOM bytes
              are available at DST before anything is emitted -- confirm
              against the macro definition.  */
5646       ASSURE_DESTINATION (safe_room);
5647       c = *charbuf++;
5648       if (ascii_compatible && ASCII_CHAR_P (c))
             /* ASCII passes through unchanged in an ASCII-compatible
                coding system.  */
5649         EMIT_ONE_ASCII_BYTE (c);
5650       else if (CHAR_BYTE8_P (c))
5651         {
               /* A raw 8-bit character is written back as its byte.  */
5652           c = CHAR_TO_BYTE8 (c);
5653           EMIT_ONE_BYTE (c);
5654         }
5655       else
5656         {
               /* Look for a charset in CHARSET_LIST that can encode C;
                  the macro sets CHARSET and CODE.  */
5657           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5658                                &code, charset);
5659
5660           if (charset)
5661             {
                   /* Emit CODE most significant byte first, using as many
                      bytes as the dimension of CHARSET.  */
5662               if (CHARSET_DIMENSION (charset) == 1)
5663                 EMIT_ONE_BYTE (code);
5664               else if (CHARSET_DIMENSION (charset) == 2)
5665                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5666               else if (CHARSET_DIMENSION (charset) == 3)
5667                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5668               else
5669                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5670                                  (code >> 8) & 0xFF, code & 0xFF);
5671             }
5672           else
5673             {
                   /* No charset was found for C.  Emit a one-byte
                      substitute: the inhibit-substitution marker in
                      safe-encoding mode, otherwise the default character
                      of the coding system.  */
5674               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5675                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5676               else
5677                 c = coding->default_char;
5678               EMIT_ONE_BYTE (c);
5679             }
5680         }
5681     }
5682
5683   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5684   coding->produced_char += produced_chars;
5685   coding->produced = dst - coding->destination;
5686   return 0;
5687 }
5688
5689 \f
5690 /*** 10. C library functions ***/
5691
5692 /* Setup coding context CODING from information about CODING_SYSTEM.
5693    If CODING_SYSTEM is nil, `undecided' is assumed.  If
5694    CODING_SYSTEM is invalid, signal an error.  */
5695
5696 void
5697 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5698 {
5699   Lisp_Object attrs;
5700   Lisp_Object eol_type;
5701   Lisp_Object coding_type;
5702   Lisp_Object val;
5703
5704   if (NILP (coding_system))
5705     coding_system = Qundecided;
5706
5707   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5708
5709   attrs = CODING_ID_ATTRS (coding->id);
       /* When EOL conversion is inhibited, behave as if the EOL type
          were `unix' regardless of what the coding system says.  */
5710   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5711
5712   coding->mode = 0;
       /* Initial common flags depend only on the EOL type: a vector
          (EOL format not decided yet) needs decoding plus detection, a
          decided non-unix type needs decoding and encoding, and `unix'
          needs nothing by itself.  */
5713   if (VECTORP (eol_type))
5714     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5715                             | CODING_REQUIRE_DETECTION_MASK);
5716   else if (! EQ (eol_type, Qunix))
5717     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5718                             | CODING_REQUIRE_ENCODING_MASK);
5719   else
5720     coding->common_flags = 0;
       /* A post-read attribute forces decoding; a pre-write attribute
          forces encoding.  */
5721   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5722     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5723   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5724     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5725   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5726     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5727
       /* The safe-charsets attribute is accessed as a string here; its
          length determines the largest charset ID covered.  The
          iso-2022 and emacs-mule branches below may replace these two
          fields.  */
5728   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5729   coding->max_charset_id = SCHARS (val) - 1;
5730   coding->safe_charsets = SDATA (val);
5731   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5732   coding->carryover_bytes = 0;
5733   coding->raw_destination = 0;
5734
       /* Install the detector/decoder/encoder and the type-specific
          state according to the type of the coding system.  */
5735   coding_type = CODING_ATTR_TYPE (attrs);
5736   if (EQ (coding_type, Qundecided))
5737     {
           /* `undecided' converts like raw-text until detection picks
              something else, so detection is always required.  */
5738       coding->detector = NULL;
5739       coding->decoder = decode_coding_raw_text;
5740       coding->encoder = encode_coding_raw_text;
5741       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5742       coding->spec.undecided.inhibit_nbd
5743         = (encode_inhibit_flag
5744            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5745       coding->spec.undecided.inhibit_ied
5746         = (encode_inhibit_flag
5747            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5748       coding->spec.undecided.prefer_utf_8
5749         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5750     }
5751   else if (EQ (coding_type, Qiso_2022))
5752     {
5753       int i;
5754       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5755
5756       /* Invoke graphic register 0 to plane 0.  */
5757       CODING_ISO_INVOCATION (coding, 0) = 0;
5758       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5759       CODING_ISO_INVOCATION (coding, 1)
5760         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5761       /* Setup the initial status of designation.  */
5762       for (i = 0; i < 4; i++)
5763         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5764       /* Not single shifting initially.  */
5765       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5766       /* Beginning of buffer should also be regarded as bol. */
5767       CODING_ISO_BOL (coding) = 1;
5768       coding->detector = detect_coding_iso_2022;
5769       coding->decoder = decode_coding_iso_2022;
5770       coding->encoder = encode_coding_iso_2022;
5771       if (flags & CODING_ISO_FLAG_SAFE)
5772         coding->mode |= CODING_MODE_SAFE_ENCODING;
5773       coding->common_flags
5774         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5775             | CODING_REQUIRE_FLUSHING_MASK);
5776       if (flags & CODING_ISO_FLAG_COMPOSITION)
5777         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5778       if (flags & CODING_ISO_FLAG_DESIGNATION)
5779         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5780       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5781         {
               /* Re-read the safe-charsets attribute after the call
                  below, which presumably recomputes it.  */
5782           setup_iso_safe_charsets (attrs);
5783           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5784           coding->max_charset_id = SCHARS (val) - 1;
5785           coding->safe_charsets = SDATA (val);
5786         }
5787       CODING_ISO_FLAGS (coding) = flags;
5788       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5789       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5790       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5791       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5792     }
5793   else if (EQ (coding_type, Qcharset))
5794     {
5795       coding->detector = detect_coding_charset;
5796       coding->decoder = decode_coding_charset;
5797       coding->encoder = encode_coding_charset;
5798       coding->common_flags
5799         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5800     }
5801   else if (EQ (coding_type, Qutf_8))
5802     {
           /* BOM attribute: a cons means "detect the BOM", t means "with
              BOM", anything else means "without BOM".  */
5803       val = AREF (attrs, coding_attr_utf_bom);
5804       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5805                                    : EQ (val, Qt) ? utf_with_bom
5806                                    : utf_without_bom);
5807       coding->detector = detect_coding_utf_8;
5808       coding->decoder = decode_coding_utf_8;
5809       coding->encoder = encode_coding_utf_8;
5810       coding->common_flags
5811         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5812       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5813         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5814     }
5815   else if (EQ (coding_type, Qutf_16))
5816     {
           /* Same BOM handling as for utf-8 above.  */
5817       val = AREF (attrs, coding_attr_utf_bom);
5818       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5819                                     : EQ (val, Qt) ? utf_with_bom
5820                                     : utf_without_bom);
           /* Anything other than `big' is treated as little endian.  */
5821       val = AREF (attrs, coding_attr_utf_16_endian);
5822       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5823                                        : utf_16_little_endian);
5824       CODING_UTF_16_SURROGATE (coding) = 0;
5825       coding->detector = detect_coding_utf_16;
5826       coding->decoder = decode_coding_utf_16;
5827       coding->encoder = encode_coding_utf_16;
5828       coding->common_flags
5829         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5830       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5831         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5832     }
5833   else if (EQ (coding_type, Qccl))
5834     {
5835       coding->detector = detect_coding_ccl;
5836       coding->decoder = decode_coding_ccl;
5837       coding->encoder = encode_coding_ccl;
5838       coding->common_flags
5839         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5840             | CODING_REQUIRE_FLUSHING_MASK);
5841     }
5842   else if (EQ (coding_type, Qemacs_mule))
5843     {
5844       coding->detector = detect_coding_emacs_mule;
5845       coding->decoder = decode_coding_emacs_mule;
5846       coding->encoder = encode_coding_emacs_mule;
5847       coding->common_flags
5848         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5850           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5851         {
               /* Build a fresh safe-charsets string from
                  Vemacs_mule_charset_list: every byte starts out as 255
                  and the byte of each listed charset ID is set to 0.  */
5852           Lisp_Object tail, safe_charsets;
5853           int max_charset_id = 0;
5854
5855           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5856                tail = XCDR (tail))
5857             if (max_charset_id < XFASTINT (XCAR (tail)))
5858               max_charset_id = XFASTINT (XCAR (tail));
5859           safe_charsets = make_uninit_string (max_charset_id + 1);
5860           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5861           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5862                tail = XCDR (tail))
5863             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5864           coding->max_charset_id = max_charset_id;
5865           coding->safe_charsets = SDATA (safe_charsets);
5866         }
5867       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5868       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5869     }
5870   else if (EQ (coding_type, Qshift_jis))
5871     {
5872       coding->detector = detect_coding_sjis;
5873       coding->decoder = decode_coding_sjis;
5874       coding->encoder = encode_coding_sjis;
5875       coding->common_flags
5876         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5877     }
5878   else if (EQ (coding_type, Qbig5))
5879     {
5880       coding->detector = detect_coding_big5;
5881       coding->decoder = decode_coding_big5;
5882       coding->encoder = encode_coding_big5;
5883       coding->common_flags
5884         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5885     }
5886   else                          /* EQ (coding_type, Qraw_text) */
5887     {
5888       coding->detector = NULL;
5889       coding->decoder = decode_coding_raw_text;
5890       coding->encoder = encode_coding_raw_text;
           /* raw-text needs conversion only for EOL handling: decoding
              whenever the EOL type is not `unix', and encoding only
              when the EOL type is also already decided (not a
              vector).  */
5891       if (! EQ (eol_type, Qunix))
5892         {
5893           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5894           if (! VECTORP (eol_type))
5895             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5896         }
5897
5898     }
5899
5900   return;
5901 }
5902
5903 /* Return a list of charsets supported by CODING.  */
5904
5905 Lisp_Object
5906 coding_charset_list (struct coding_system *coding)
5907 {
5908   Lisp_Object attrs, charset_list;
5909
5910   CODING_GET_INFO (coding, attrs, charset_list);
5911   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5912     {
5913       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5914
5915       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5916         charset_list = Viso_2022_charset_list;
5917     }
5918   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5919     {
5920       charset_list = Vemacs_mule_charset_list;
5921     }
5922   return charset_list;
5923 }
5924
5925
5926 /* Return a list of charsets supported by CODING-SYSTEM.  */
5927
5928 Lisp_Object
5929 coding_system_charset_list (Lisp_Object coding_system)
5930 {
5931   ptrdiff_t id;
5932   Lisp_Object attrs, charset_list;
5933
5934   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5935   attrs = CODING_ID_ATTRS (id);
5936
5937   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5938     {
5939       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5940
5941       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5942         charset_list = Viso_2022_charset_list;
5943       else
5944         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5945     }
5946   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5947     {
5948       charset_list = Vemacs_mule_charset_list;
5949     }
5950   else
5951     {
5952       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5953     }
5954   return charset_list;
5955 }
5956
5957
5958 /* Return raw-text or one of its subsidiaries that has the same
5959    eol_type as CODING-SYSTEM.  */
5960
5961 Lisp_Object
5962 raw_text_coding_system (Lisp_Object coding_system)
5963 {
5964   Lisp_Object spec, attrs;
5965   Lisp_Object eol_type, raw_text_eol_type;
5966
5967   if (NILP (coding_system))
5968     return Qraw_text;
5969   spec = CODING_SYSTEM_SPEC (coding_system);
5970   attrs = AREF (spec, 0);
5971
5972   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5973     return coding_system;
5974
5975   eol_type = AREF (spec, 2);
5976   if (VECTORP (eol_type))
5977     return Qraw_text;
5978   spec = CODING_SYSTEM_SPEC (Qraw_text);
5979   raw_text_eol_type = AREF (spec, 2);
5980   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5981           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5982           : AREF (raw_text_eol_type, 2));
5983 }
5984
5985 /* Return true if CODING corresponds to raw-text coding-system.  */
5986
5987 bool
5988 raw_text_coding_system_p (struct coding_system *coding)
5989 {
5990   return (coding->decoder == decode_coding_raw_text
5991           && coding->encoder == encode_coding_raw_text) ? true : false;
5992 }
5993
5994
5995 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5996    the subsidiary that has the same eol-spec as PARENT (if it is not
5997    nil and specifies end-of-line format) or the system's setting
5998    (system_eol_type).  */
5999
6000 Lisp_Object
6001 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6002 {
6003   Lisp_Object spec, eol_type;
6004
6005   if (NILP (coding_system))
6006     coding_system = Qraw_text;
6007   else
6008     CHECK_CODING_SYSTEM (coding_system);
6009   spec = CODING_SYSTEM_SPEC (coding_system);
6010   eol_type = AREF (spec, 2);
6011   if (VECTORP (eol_type))
6012     {
6013       Lisp_Object parent_eol_type;
6014
6015       if (! NILP (parent))
6016         {
6017           Lisp_Object parent_spec;
6018
6019           CHECK_CODING_SYSTEM (parent);
6020           parent_spec = CODING_SYSTEM_SPEC (parent);
6021           parent_eol_type = AREF (parent_spec, 2);
6022           if (VECTORP (parent_eol_type))
6023             parent_eol_type = system_eol_type;
6024         }
6025       else
6026         parent_eol_type = system_eol_type;
6027       if (EQ (parent_eol_type, Qunix))
6028         coding_system = AREF (eol_type, 0);
6029       else if (EQ (parent_eol_type, Qdos))
6030         coding_system = AREF (eol_type, 1);
6031       else if (EQ (parent_eol_type, Qmac))
6032         coding_system = AREF (eol_type, 2);
6033     }
6034   return coding_system;
6035 }
6036
6037
6038 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6039    decided for writing to a process.  If not, complement them, and
6040    return a new coding system.  */
6041
6042 Lisp_Object
6043 complement_process_encoding_system (Lisp_Object coding_system)
6044 {
6045   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6046   Lisp_Object spec, attrs;
6047   int i;
6048
6049   for (i = 0; i < 3; i++)
6050     {
6051       if (i == 1)
6052         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6053       else if (i == 2)
6054         coding_system = preferred_coding_system ();
6055       spec = CODING_SYSTEM_SPEC (coding_system);
6056       if (NILP (spec))
6057         continue;
6058       attrs = AREF (spec, 0);
6059       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6060         coding_base = CODING_ATTR_BASE_NAME (attrs);
6061       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6062         eol_base = coding_system;
6063       if (! NILP (coding_base) && ! NILP (eol_base))
6064         break;
6065     }
6066
6067   if (i > 0)
6068     /* The original CODING_SYSTEM didn't specify text-conversion or
6069        eol-conversion.  Be sure that we return a fully complemented
6070        coding system.  */
6071     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6072   return coding_system;
6073 }
6074
6075
6076 /* Emacs has a mechanism to automatically detect a coding system if it
6077    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
6078    it's impossible to distinguish some coding systems accurately
6079    because they use the same range of codes.  So, at first, coding
6080    systems are grouped into the following categories:
6081
6082    o coding-category-emacs-mule
6083
6084         The category for a coding system which has the same code range
6085         as Emacs' internal format.  Assigned the coding-system (Lisp
6086         symbol) `emacs-mule' by default.
6087
6088    o coding-category-sjis
6089
6090         The category for a coding system which has the same code range
6091         as SJIS.  Assigned the coding-system (Lisp
6092         symbol) `japanese-shift-jis' by default.
6093
6094    o coding-category-iso-7
6095
6096         The category for a coding system which has the same code range
6097         as ISO2022 of 7-bit environment.  This doesn't use any locking
6098         shift and single shift functions.  This can encode/decode all
6099         charsets.  Assigned the coding-system (Lisp symbol)
6100         `iso-2022-7bit' by default.
6101
6102    o coding-category-iso-7-tight
6103
6104         Same as coding-category-iso-7 except that this can
6105         encode/decode only the specified charsets.
6106
6107    o coding-category-iso-8-1
6108
6109         The category for a coding system which has the same code range
6110         as ISO2022 of 8-bit environment and graphic plane 1 used only
6111         for DIMENSION1 charset.  This doesn't use any locking shift
6112         and single shift functions.  Assigned the coding-system (Lisp
6113         symbol) `iso-latin-1' by default.
6114
6115    o coding-category-iso-8-2
6116
6117         The category for a coding system which has the same code range
6118         as ISO2022 of 8-bit environment and graphic plane 1 used only
6119         for DIMENSION2 charset.  This doesn't use any locking shift
6120         and single shift functions.  Assigned the coding-system (Lisp
6121         symbol) `japanese-iso-8bit' by default.
6122
6123    o coding-category-iso-7-else
6124
6125         The category for a coding system which has the same code range
6126         as ISO2022 of 7-bit environment but uses locking shift or
6127         single shift functions.  Assigned the coding-system (Lisp
6128         symbol) `iso-2022-7bit-lock' by default.
6129
6130    o coding-category-iso-8-else
6131
6132         The category for a coding system which has the same code range
6133         as ISO2022 of 8-bit environment but uses locking shift or
6134         single shift functions.  Assigned the coding-system (Lisp
6135         symbol) `iso-2022-8bit-ss2' by default.
6136
6137    o coding-category-big5
6138
6139         The category for a coding system which has the same code range
6140         as BIG5.  Assigned the coding-system (Lisp symbol)
6141         `cn-big5' by default.
6142
6143    o coding-category-utf-8
6144
6145         The category for a coding system which has the same code range
6146         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6147         symbol) `utf-8' by default.
6148
6149    o coding-category-utf-16-be
6150
6151         The category for a coding system in which a text has an
6152         Unicode signature (cf. Unicode Standard) in the order of BIG
6153         endian at the head.  Assigned the coding-system (Lisp symbol)
6154         `utf-16-be' by default.
6155
6156    o coding-category-utf-16-le
6157
6158         The category for a coding system in which a text has an
6159         Unicode signature (cf. Unicode Standard) in the order of
6160         LITTLE endian at the head.  Assigned the coding-system (Lisp
6161         symbol) `utf-16-le' by default.
6162
6163    o coding-category-ccl
6164
6165         The category for a coding system of which encoder/decoder is
6166         written in CCL programs.  The default value is nil, i.e., no
6167         coding system is assigned.
6168
6169    o coding-category-binary
6170
6171         The category for a coding system not categorized in any of the
6172         above.  Assigned the coding-system (Lisp symbol)
6173         `no-conversion' by default.
6174
6175    Each of them is a Lisp symbol whose value is an actual
6176    `coding-system' (also a Lisp symbol) assigned by a user.
6177    What Emacs actually does is detect a category of coding system,
6178    and then use the `coding-system' assigned to it.  If Emacs can't
6179    narrow the choice down to a single category, it selects the
6180    category of the highest priority.  Priorities of categories are also
6181    specified by a user in a Lisp variable `coding-category-list'.
6182
6183 */
6184
6185 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6186                                            int eol_seen);
6187
6188
6189 /* Return the number of ASCII characters at the head of the source.
6190    By side effects, set coding->head_ascii and update
6191    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6192    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6193    reliable only when all the source bytes are ASCII.  */
6194
6195 static ptrdiff_t
6196 check_ascii (struct coding_system *coding)
6197 {
6198   const unsigned char *src, *end;
6199   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6200   int eol_seen = coding->eol_seen;
6201
6202   coding_set_source (coding);
6203   src = coding->source;
6204   end = src + coding->src_bytes;
6205
6206   if (inhibit_eol_conversion
6207       || SYMBOLP (eol_type))
6208     {
6209       /* We don't have to check EOL format.  */
6210       while (src < end && !( *src & 0x80))
6211         {
6212           if (*src++ == '\n')
6213             eol_seen |= EOL_SEEN_LF;
6214         }
6215     }
6216   else
6217     {
6218       end--;                /* We look ahead one byte for "CR LF".  */
6219       while (src < end)
6220         {
6221           int c = *src;
6222
6223           if (c & 0x80)
6224             break;
6225           src++;
6226           if (c == '\r')
6227             {
6228               if (*src == '\n')
6229                 {
6230                   eol_seen |= EOL_SEEN_CRLF;
6231                   src++;
6232                 }
6233               else
6234                 eol_seen |= EOL_SEEN_CR;
6235             }
6236           else if (c == '\n')
6237             eol_seen |= EOL_SEEN_LF;
6238         }
6239       if (src == end)
6240         {
6241           int c = *src;
6242
6243           /* All bytes but the last one C are ASCII.  */
6244           if (! (c & 0x80))
6245             {
6246               if (c == '\r')
6247                 eol_seen |= EOL_SEEN_CR;
6248               else if (c  == '\n')
6249                 eol_seen |= EOL_SEEN_LF;
6250               src++;
6251             }
6252         }
6253     }
6254   coding->head_ascii = src - coding->source;
6255   coding->eol_seen = eol_seen;
6256   return (coding->head_ascii);
6257 }
6258
6259
6260 /* Return the number of characters at the source if all the bytes are
6261    valid UTF-8 (of Unicode range).  Otherwise, return -1; that also
6262    happens when the last source byte is not ASCII, because that byte
6263    is only examined as the look-ahead for CR LF.  By side effects,
6264    update coding->eol_seen with the "logical or" of EOL_SEEN_LF,
6265    EOL_SEEN_CR, and EOL_SEEN_CRLF; it is reliable only on success.  */
6266
6267 static ptrdiff_t
6268 check_utf_8 (struct coding_system *coding)
6269 {
6270   const unsigned char *src, *end;
6271
6272   if (coding->head_ascii < 0)
6273     check_ascii (coding);
6274   else
6275     coding_set_source (coding);
6276   /* Read head_ascii only now: check_ascii above may have just stored
6277      it, and reading it earlier would start the count from a stale value.  */
6278   ptrdiff_t nchars = coding->head_ascii;
6279   src = coding->source + nchars;
6280   /* END is the last byte, as we look ahead one byte for CR LF.  */
6281   end = coding->source + coding->src_bytes - 1;
6282   int eol_seen = coding->eol_seen;
6283   while (src < end)
6284     {
6285       int c = *src;
6286       if (UTF_8_1_OCTET_P (*src))
6287         {
6288           src++;
6289           if (c < 0x20)
6290             {
6291               if (c == '\r')
6292                 {
6293                   if (*src == '\n')
6294                     {
6295                       eol_seen |= EOL_SEEN_CRLF;
6296                       src++;
6297                       nchars++;     /* CR LF counts as two characters.  */
6298                     }
6299                   else
6300                     eol_seen |= EOL_SEEN_CR;
6301                 }
6302               else if (c == '\n')
6303                 eol_seen |= EOL_SEEN_LF;
6304             }
6305         }
6306       else if (UTF_8_2_OCTET_LEADING_P (c))
6307         {
6308           if (c < 0xC2          /* overlong sequence */
6309               || src + 1 >= end /* would reach the look-ahead byte */
6310               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6311             return -1;
6312           src += 2;
6313         }
6314       else if (UTF_8_3_OCTET_LEADING_P (c))
6315         {
6316           if (src + 2 >= end
6317               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6318                     && UTF_8_EXTRA_OCTET_P (src[2])))
6319             return -1;
6320           c = (((c & 0xF) << 12)
6321                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6322           if (c < 0x800                       /* overlong sequence */
6323               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6324             return -1;
6325           src += 3;
6326         }
6327       else if (UTF_8_4_OCTET_LEADING_P (c))
6328         {
6329           if (src + 3 >= end
6330               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6331                     && UTF_8_EXTRA_OCTET_P (src[2])
6332                     && UTF_8_EXTRA_OCTET_P (src[3])))
6333             return -1;
6334           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6335                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6336           if (c < 0x10000       /* overlong sequence */
6337               || c >= 0x110000) /* non-Unicode character  */
6338             return -1;
6339           src += 4;
6340         }
6341       else
6342         return -1;
6343       nchars++;
6344     }
6345
6346   if (src == end)               /* The last byte is still unread.  */
6347     {
6348       if (! UTF_8_1_OCTET_P (*src))
6349         return -1;
6350       nchars++;
6351       if (*src == '\r')
6352         eol_seen |= EOL_SEEN_CR;
6353       else if (*src == '\n')
6354         eol_seen |= EOL_SEEN_LF;
6355     }
6356   coding->eol_seen = eol_seen;
6357   return nchars;
6358 }
6359
6360
6361 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6362    SOURCE is encoded, looking at no more than MAX_EOL_CHECK_COUNT
6363    end-of-lines.  If CATEGORY is one of coding_category_utf_16_XXXX,
6364    CR and LF are two-byte units in CATEGORY's byte order, else bytes.
6365    Return EOL_SEEN_NONE if none is found, else the EOL_SEEN_XXX seen;
6366    CR LF with stray CR gives EOL_SEEN_CRLF, other mixes EOL_SEEN_LF.  */
6367
6368 #define MAX_EOL_CHECK_COUNT 3
6369
6370 static int
6371 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6372             enum coding_category category)
6373 {
6374   const unsigned char *src = source, *src_end = src + src_bytes;
6375   unsigned char c;
6376   int total = 0;
6377   int eol_seen = EOL_SEEN_NONE;
6378
6379   if ((1 << category) & CATEGORY_MASK_UTF_16)
6380     {
6381       bool msb = (category == coding_category_utf_16_le
6382                   || category == coding_category_utf_16_le_nosig);
6383       bool lsb = !msb;
6384       /* MSB/LSB: offsets of the high/low byte within a 16-bit unit.  */
6385       while (src + 1 < src_end)
6386         {
6387           c = src[lsb];
6388           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6389             {
6390               int this_eol;
6391
6392               if (c == '\n')
6393                 this_eol = EOL_SEEN_LF;
6394               else if (src + 3 >= src_end
6395                        || src[msb + 2] != 0
6396                        || src[lsb + 2] != '\n')
6397                 this_eol = EOL_SEEN_CR;
6398               else
6399                 {
6400                   this_eol = EOL_SEEN_CRLF;
6401                   src += 2;
6402                 }
6403
6404               if (eol_seen == EOL_SEEN_NONE)
6405                 /* This is the first end-of-line.  */
6406                 eol_seen = this_eol;
6407               else if (eol_seen != this_eol)
6408                 {
6409                   /* The found type is different from what found before.
6410                      Allow for stray ^M characters in DOS EOL files.  */
6411                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6412                       || (eol_seen == EOL_SEEN_CRLF
6413                           && this_eol == EOL_SEEN_CR))
6414                     eol_seen = EOL_SEEN_CRLF;
6415                   else
6416                     {
6417                       eol_seen = EOL_SEEN_LF; /* Inconsistent text.  */
6418                       break;
6419                     }
6420                 }
6421               if (++total == MAX_EOL_CHECK_COUNT)
6422                 break;
6423             }
6424           src += 2;
6425         }
6426     }
6427   else
6428     while (src < src_end)
6429       {
6430         c = *src++;
6431         if (c == '\n' || c == '\r')
6432           {
6433             int this_eol;
6434
6435             if (c == '\n')
6436               this_eol = EOL_SEEN_LF;
6437             else if (src >= src_end || *src != '\n')
6438               this_eol = EOL_SEEN_CR;
6439             else
6440               this_eol = EOL_SEEN_CRLF, src++;
6441
6442             if (eol_seen == EOL_SEEN_NONE)
6443               /* This is the first end-of-line.  */
6444               eol_seen = this_eol;
6445             else if (eol_seen != this_eol)
6446               {
6447                 /* The found type is different from what found before.
6448                    Allow for stray ^M characters in DOS EOL files.  */
6449                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6450                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6451                   eol_seen = EOL_SEEN_CRLF;
6452                 else
6453                   {
6454                     eol_seen = EOL_SEEN_LF; /* Inconsistent text.  */
6455                     break;
6456                   }
6457               }
6458             if (++total == MAX_EOL_CHECK_COUNT)
6459               break;
6460           }
6461       }
6462   return eol_seen;
6463 }
6464
6465 /* Settle the eol type of CODING from EOL_SEEN and return it.  If the
6466    eol type is still a vector, switch coding->id to the coding system
6467    in slot 0, 1 or 2 for EOL_SEEN_LF, EOL_SEEN_CRLF or EOL_SEEN_CR
6468    (tested in that order) and return Qunix, Qdos or Qmac.  Otherwise,
6469    or if EOL_SEEN has none of those bits, return the eol type as is.  */
6470
6471 static Lisp_Object
6472 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6473 {
6474   Lisp_Object variants = CODING_ID_EOL_TYPE (coding->id);
6475   Lisp_Object chosen;
6476   int slot;
6477
6478   if (! VECTORP (variants))
6479     return variants;            /* Already adjusted.  */
6480   if (eol_seen & EOL_SEEN_LF)
6481     slot = 0, chosen = Qunix;
6482   else if (eol_seen & EOL_SEEN_CRLF)
6483     slot = 1, chosen = Qdos;
6484   else if (eol_seen & EOL_SEEN_CR)
6485     slot = 2, chosen = Qmac;
6486   else
6487     return variants;            /* Nothing seen: leave CODING alone.  */
6488
6489   coding->id = CODING_SYSTEM_ID (AREF (variants, slot));
6490   return chosen;
6491 }
6492
6493 /* Detect how a text specified in CODING is encoded.  If a coding
6494    system is detected, update fields of CODING by the detected coding
6495    system, but keep an eol type that was already fixed in CODING.
6496    Detection is done only if CODING has type Qundecided or category
6497    coding_category_utf_8_auto or coding_category_utf_16_auto.  */
6498
6499 static void
6500 detect_coding (struct coding_system *coding)
6501 {
6502   const unsigned char *src, *src_end;
6503   unsigned int saved_mode = coding->mode;
6504   Lisp_Object found = Qnil;
6505   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6506
6507   coding->consumed = coding->consumed_char = 0;
6508   coding->produced = coding->produced_char = 0;
6509   coding_set_source (coding);
6510   src_end = coding->source + coding->src_bytes;
6511
6512   coding->eol_seen = EOL_SEEN_NONE;
6513   /* If we have not yet decided the text encoding type, detect it
6514      now.  */
6515   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6516     {
6517       int c, i;
6518       struct coding_detection_info detect_info;
6519       bool null_byte_found = 0, eight_bit_found = 0;
6520       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6521                                        inhibit_null_byte_detection);
6522       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6523                                        inhibit_iso_escape_detection);
6524       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6525
6526       coding->head_ascii = 0;
6527       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6528       for (src = coding->source; src < src_end; src++)
6529         {
6530           c = *src;
6531           if (c & 0x80)
6532             {
6533               eight_bit_found = 1;
6534               if (null_byte_found)
6535                 break;
6536             }
6537           else if (c < 0x20)
6538             {
6539               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6540                   && ! inhibit_ied
6541                   && ! detect_info.checked)
6542                 {
6543                   if (detect_coding_iso_2022 (coding, &detect_info))
6544                     {
6545                       /* We have scanned the whole data.  */
6546                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6547                         {
6548                           /* We didn't find an 8-bit code.  We may
6549                              have found a null-byte, but it's very
6550                              rare that a binary file conforms to
6551                              ISO-2022.  */
6552                           src = src_end;
6553                           coding->head_ascii = src - coding->source;
6554                         }
6555                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6556                       break;
6557                     }
6558                 }
6559               else if (! c && !inhibit_nbd)
6560                 {
6561                   null_byte_found = 1;
6562                   if (eight_bit_found)
6563                     break;
6564                 }
6565               else if (! disable_ascii_optimization
6566                        && ! inhibit_eol_conversion)
6567                 {
6568                   if (c == '\r')
6569                     {
6570                       /* Peek at the next byte only if the text has one.  */
6571                       if (src + 1 < src_end && src[1] == '\n')
6572                         {
6573                           coding->eol_seen |= EOL_SEEN_CRLF;
6574                           src++;
6575                           if (! eight_bit_found)
6576                             coding->head_ascii++;
6577                         }
6578                       else
6579                         coding->eol_seen |= EOL_SEEN_CR;
6580                     }
6581                   else if (c == '\n')
6582                     coding->eol_seen |= EOL_SEEN_LF;
6583                 }
6584
6585               if (! eight_bit_found)
6586                 coding->head_ascii++;
6587             }
6588           else if (! eight_bit_found)
6589             coding->head_ascii++;
6590         }
6591
6592       if (null_byte_found || eight_bit_found
6593           || coding->head_ascii < coding->src_bytes
6594           || detect_info.found)
6595         {
6596           enum coding_category category;
6597           struct coding_system *this;
6598
6599           if (coding->head_ascii == coding->src_bytes)
6600             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6601             for (i = 0; i < coding_category_raw_text; i++)
6602               {
6603                 category = coding_priorities[i];
6604                 this = coding_categories + category;
6605                 if (detect_info.found & (1 << category))
6606                   break;
6607               }
6608           else
6609             {
6610               if (null_byte_found)
6611                 {
6612                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6613                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6614                 }
6615               else if (prefer_utf_8
6616                        && detect_coding_utf_8 (coding, &detect_info))
6617                 {
6618                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6619                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6620                 }
6621               for (i = 0; i < coding_category_raw_text; i++)
6622                 {
6623                   category = coding_priorities[i];
6624                   this = coding_categories + category;
6625                   /* Some of this->detector (e.g. detect_coding_sjis)
6626                      require this information.  */
6627                   coding->id = this->id;
6628                   if (this->id < 0)
6629                     {
6630                       /* No coding system of this category is defined.  */
6631                       detect_info.rejected |= (1 << category);
6632                     }
6633                   else if (category >= coding_category_raw_text)
6634                     continue;
6635                   else if (detect_info.checked & (1 << category))
6636                     {
6637                       if (detect_info.found & (1 << category))
6638                         break;
6639                     }
6640                   else if ((*(this->detector)) (coding, &detect_info)
6641                            && detect_info.found & (1 << category))
6642                     break;
6643                 }
6644             }
6645
6646           if (i < coding_category_raw_text)
6647             {
6648               if (category == coding_category_utf_8_auto)
6649                 {
6650                   Lisp_Object coding_systems;
6651
6652                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6653                                          coding_attr_utf_bom);
6654                   if (CONSP (coding_systems))
6655                     {
6656                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6657                         found = XCAR (coding_systems);
6658                       else
6659                         found = XCDR (coding_systems);
6660                     }
6661                   else
6662                     found = CODING_ID_NAME (this->id);
6663                 }
6664               else if (category == coding_category_utf_16_auto)
6665                 {
6666                   Lisp_Object coding_systems;
6667
6668                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6669                                          coding_attr_utf_bom);
6670                   if (CONSP (coding_systems))
6671                     {
6672                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6673                         found = XCAR (coding_systems);
6674                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6675                         found = XCDR (coding_systems);
6676                     }
6677                   else
6678                     found = CODING_ID_NAME (this->id);
6679                 }
6680               else
6681                 found = CODING_ID_NAME (this->id);
6682             }
6683           else if (null_byte_found)
6684             found = Qno_conversion;
6685           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6686                    == CATEGORY_MASK_ANY)
6687             found = Qraw_text;
6688           else if (detect_info.rejected)
6689             for (i = 0; i < coding_category_raw_text; i++)
6690               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6691                 {
6692                   this = coding_categories + coding_priorities[i];
6693                   found = CODING_ID_NAME (this->id);
6694                   break;
6695                 }
6696         }
6697     }
6698   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6699            == coding_category_utf_8_auto)
6700     {
6701       Lisp_Object coding_systems;
6702       struct coding_detection_info detect_info;
6703
6704       coding_systems
6705         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6706       detect_info.found = detect_info.rejected = 0;
6707       if (check_ascii (coding) == coding->src_bytes)
6708         {
6709           if (CONSP (coding_systems))
6710             found = XCDR (coding_systems);
6711         }
6712       else
6713         {
6714           if (CONSP (coding_systems)
6715               && detect_coding_utf_8 (coding, &detect_info))
6716             {
6717               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6718                 found = XCAR (coding_systems);
6719               else
6720                 found = XCDR (coding_systems);
6721             }
6722         }
6723     }
6724   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6725            == coding_category_utf_16_auto)
6726     {
6727       Lisp_Object coding_systems;
6728       struct coding_detection_info detect_info;
6729
6730       coding_systems
6731         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6732       detect_info.found = detect_info.rejected = 0;
6733       coding->head_ascii = 0;
6734       if (CONSP (coding_systems)
6735           && detect_coding_utf_16 (coding, &detect_info))
6736         {
6737           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6738             found = XCAR (coding_systems);
6739           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6740             found = XCDR (coding_systems);
6741         }
6742     }
6743
6744   if (! NILP (found))
6745     {
6746       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6747                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6748                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6749                            : EOL_SEEN_LF);
6750
6751       setup_coding_system (found, coding);
6752       if (specified_eol != EOL_SEEN_NONE)
6753         adjust_coding_eol_type (coding, specified_eol);
6754     }
6755
6756   coding->mode = saved_mode;
6757 }
6758
6759 /* Convert CR and CR LF to LF in the bytes CODING produced, in place.  */
6760 static void
6761 decode_eol (struct coding_system *coding)
6762 {
6763   Lisp_Object eol_type;
6764   unsigned char *p, *pbeg, *pend;
6765
6766   eol_type = CODING_ID_EOL_TYPE (coding->id);
6767   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6768     return;                     /* Nothing to convert.  */
6769
6770   if (NILP (coding->dst_object))
6771     pbeg = coding->destination;
6772   else
6773     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6774   pend = pbeg + coding->produced;
6775   /* With an eol type that is still a vector, scan the text to pick one.  */
6776   if (VECTORP (eol_type))
6777     {
6778       int eol_seen = EOL_SEEN_NONE;
6779
6780       for (p = pbeg; p < pend; p++)
6781         {
6782           if (*p == '\n')
6783             eol_seen |= EOL_SEEN_LF;
6784           else if (*p == '\r')
6785             {
6786               if (p + 1 < pend && *(p + 1) == '\n')
6787                 {
6788                   eol_seen |= EOL_SEEN_CRLF;
6789                   p++;
6790                 }
6791               else
6792                 eol_seen |= EOL_SEEN_CR;
6793             }
6794         }
6795       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6796       if ((eol_seen & EOL_SEEN_CRLF) != 0
6797           && (eol_seen & EOL_SEEN_CR) != 0
6798           && (eol_seen & EOL_SEEN_LF) == 0)
6799         eol_seen = EOL_SEEN_CRLF;
6800       else if (eol_seen != EOL_SEEN_NONE
6801           && eol_seen != EOL_SEEN_LF
6802           && eol_seen != EOL_SEEN_CRLF
6803           && eol_seen != EOL_SEEN_CR)
6804         eol_seen = EOL_SEEN_LF; /* Any other mix is treated as LF.  */
6805       if (eol_seen != EOL_SEEN_NONE)
6806         eol_type = adjust_coding_eol_type (coding, eol_seen);
6807     }
6808
6809   if (EQ (eol_type, Qmac))
6810     {
6811       for (p = pbeg; p < pend; p++)
6812         if (*p == '\r')
6813           *p = '\n';
6814     }
6815   else if (EQ (eol_type, Qdos))
6816     {
6817       ptrdiff_t n = 0;          /* Number of CRs deleted so far.  */
6818       ptrdiff_t pos = coding->dst_pos;
6819       ptrdiff_t pos_byte = coding->dst_pos_byte;
6820       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6821
6822       /* This assertion is here instead of code, now deleted, that
6823          handled the NILP case, which no longer happens with the
6824          current codebase.  */
6825       eassert (!NILP (coding->dst_object));
6826
6827       while (pos_byte < pos_end)
6828         {
6829           int incr;
6830
6831           p = BYTE_POS_ADDR (pos_byte);
6832           if (coding->dst_multibyte)
6833             incr = BYTES_BY_CHAR_HEAD (*p);
6834           else
6835             incr = 1;
6836           /* Delete the CR of each CR LF pair; a lone CR is kept.  */
6837           if (*p == '\r' && p[1] == '\n')
6838             {
6839               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6840               n++;
6841               pos_end--;        /* The text is now one byte shorter.  */
6842             }
6843           pos++;
6844           pos_byte += incr;
6845         }
6846       coding->produced -= n;
6847       coding->produced_char -= n;
6848     }
6849 }
6850
6851
6852 /* Cap on the *MAX_LOOKUP that get_translation_table stores.  That
6853    value is an int and so cannot exceed INT_MAX; it is also multiplied
6854    by sizeof (int) for alloca, so cannot exceed MAX_ALLOCA / sizeof (int).  */
6855 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6856
6857 /* Return the translation table (or a list of them) that coding system
6858    attribute vector ATTRS specifies for encoding (if ENCODEP) or for
6859    decoding (if not ENCODEP); Qnil if Venable_character_translation is
6860    nil.  If MAX_LOOKUP is non-null, store there 0 in the Qnil case,
6861    else the largest natural number found in extra slot 1 of the
6862    table(s), at least 1 and at most MAX_LOOKUP_MAX.  */
6863
6864 static Lisp_Object
6865 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6866 {
6867   Lisp_Object table, fallback, cell;
6868
6869   if (NILP (Venable_character_translation))
6870     {
6871       if (max_lookup)
6872         *max_lookup = 0;
6873       return Qnil;
6874     }
6875
6876   table = (encodep ? CODING_ATTR_ENCODE_TBL (attrs)
6877            : CODING_ATTR_DECODE_TBL (attrs));
6878   fallback = (encodep ? Vstandard_translation_table_for_encode
6879               : Vstandard_translation_table_for_decode);
6880   if (NILP (table))
6881     table = fallback;
6882   else
6883     {
6884       /* Resolve symbols through their Qtranslation_table property.  */
6885       if (SYMBOLP (table))
6886         table = Fget (table, Qtranslation_table);
6887       else if (CONSP (table))
6888         {
6889           table = Fcopy_sequence (table);
6890           for (cell = table; CONSP (cell); cell = XCDR (cell))
6891             if (SYMBOLP (XCAR (cell)))
6892               XSETCAR (cell, Fget (XCAR (cell), Qtranslation_table));
6893         }
6894       /* Put the standard table, if any, after the specific one(s).  */
6895       if (CHAR_TABLE_P (fallback))
6896         table = (CONSP (table) ? nconc2 (table, list1 (fallback))
6897                  : list2 (table, fallback));
6898     }
6899
6900   if (! max_lookup)
6901     return table;
6902
6903   *max_lookup = 1;
6904   if (CHAR_TABLE_P (table))
6905     {
6906       if (CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (table)) > 1)
6907         {
6908           Lisp_Object depth = XCHAR_TABLE (table)->extras[1];
6909           if (NATNUMP (depth) && *max_lookup < XFASTINT (depth))
6910             *max_lookup = min (XFASTINT (depth), MAX_LOOKUP_MAX);
6911         }
6912     }
6913   else
6914     for (cell = table; CONSP (cell); cell = XCDR (cell))
6915       {
6916         Lisp_Object elt = XCAR (cell);
6917
6918         if (! CHAR_TABLE_P (elt)
6919             || CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (elt)) <= 1)
6920           continue;
6921         Lisp_Object depth = XCHAR_TABLE (elt)->extras[1];
6922         if (NATNUMP (depth) && *max_lookup < XFASTINT (depth))
6923           *max_lookup = min (XFASTINT (depth), MAX_LOOKUP_MAX);
6924       }
6925   return table;
6926 }
6927 /* Translate C by TABLE in place; a non-character hit is left in TRANS.  */
6928 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6929   do {                                                          \
6930     trans = Qnil;                                               \
6931     if (CHAR_TABLE_P (table))                                   \
6932       {                                                         \
6933         trans = CHAR_TABLE_REF (table, c);                      \
6934         if (CHARACTERP (trans))     /* plain char-to-char map */ \
6935           c = XFASTINT (trans), trans = Qnil;                   \
6936       }                                                         \
6937     else if (CONSP (table))                                     \
6938       {                                                         \
6939         Lisp_Object tail;                                       \
6940         /* Each table sees C as mapped by the ones before it.  */ \
6941         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6942           if (CHAR_TABLE_P (XCAR (tail)))                       \
6943             {                                                   \
6944               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6945               if (CHARACTERP (trans))                           \
6946                 c = XFASTINT (trans), trans = Qnil;             \
6947               else if (! NILP (trans)) /* keep it in TRANS */   \
6948                 break;                                          \
6949             }                                                   \
6950       }                                                         \
6951   } while (0)
6952
6953
6954 /* Find the translation for the character(s) starting at BUF using
6955    TRANS, which is TO-CHAR, [TO-CHAR ...], or ((FROM . TO) ...) where
6956    FROM is [FROM-CHAR ...] and TO is TO-CHAR or [TO-CHAR ...].  Return
6957    TO-CHAR or [TO-CHAR ...] when a translation applies, Qnil when none
6958    does, or Qt when the characters before BUF_END are too few to be
6959    compared with some FROM.  When a translation is returned, *NCHARS
6960    is set to the number of source characters it replaces.  */
6961
6962 static Lisp_Object
6963 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6964 {
6965   if (INTEGERP (trans) || VECTORP (trans))
6966     {
6967       /* TRANS is itself the replacement of the one character at BUF.  */
6968       *nchars = 1;
6969       return trans;
6970     }
6971
6972   for (; CONSP (trans); trans = XCDR (trans))
6973     {
6974       Lisp_Object rule = XCAR (trans);
6975       Lisp_Object pattern = XCAR (rule);
6976       ptrdiff_t pattern_len = ASIZE (pattern);
6977       ptrdiff_t matched;
6978
6979       for (matched = 0; matched < pattern_len; matched++)
6980         if (buf + matched == buf_end)
6981           return Qt;
6982         else if (XINT (AREF (pattern, matched)) != buf[matched])
6983           break;
6984       if (matched < pattern_len)
6985         continue;               /* Mismatch; try the next rule.  */
6986
6987       *nchars = pattern_len;
6988       return XCDR (rule);
6989     }
6990   return Qnil;
6991 }
6992
6993
6994 static int
6995 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6996                bool last_block)
6997 {
6998   unsigned char *dst = coding->destination + coding->produced;
6999   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7000   ptrdiff_t produced;
7001   ptrdiff_t produced_chars = 0;
7002   int carryover = 0;
7003
7004   if (! coding->chars_at_source)
7005     {
7006       /* Source characters are in coding->charbuf.  */
7007       int *buf = coding->charbuf;
7008       int *buf_end = buf + coding->charbuf_used;
7009
7010       if (EQ (coding->src_object, coding->dst_object)
7011           && ! NILP (coding->dst_object))
7012         {
7013           eassert (growable_destination (coding));
7014           coding_set_source (coding);
7015           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7016         }
7017
7018       while (buf < buf_end)
7019         {
7020           int c = *buf;
7021           ptrdiff_t i;
7022
7023           if (c >= 0)
7024             {
7025               ptrdiff_t from_nchars = 1, to_nchars = 1;
7026               Lisp_Object trans = Qnil;
7027
7028               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7029               if (! NILP (trans))
7030                 {
7031                   trans = get_translation (trans, buf, buf_end, &from_nchars);
7032                   if (INTEGERP (trans))
7033                     c = XINT (trans);
7034                   else if (VECTORP (trans))
7035                     {
7036                       to_nchars = ASIZE (trans);
7037                       c = XINT (AREF (trans, 0));
7038                     }
7039                   else if (EQ (trans, Qt) && ! last_block)
7040                     break;
7041                 }
7042
7043               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7044                 {
7045                   eassert (growable_destination (coding));
7046                   ptrdiff_t dst_size;
7047                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7048                                           &dst_size)
7049                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7050                     memory_full (SIZE_MAX);
7051                   dst = alloc_destination (coding, dst_size, dst);
7052                   if (EQ (coding->src_object, coding->dst_object))
7053                     {
7054                       coding_set_source (coding);
7055                       dst_end = (((unsigned char *) coding->source)
7056                                  + coding->consumed);
7057                     }
7058                   else
7059                     dst_end = coding->destination + coding->dst_bytes;
7060                 }
7061
7062               for (i = 0; i < to_nchars; i++)
7063                 {
7064                   if (i > 0)
7065                     c = XINT (AREF (trans, i));
7066                   if (coding->dst_multibyte
7067                       || ! CHAR_BYTE8_P (c))
7068                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7069                   else
7070                     *dst++ = CHAR_TO_BYTE8 (c);
7071                 }
7072               produced_chars += to_nchars;
7073               buf += from_nchars;
7074             }
7075           else
7076             /* This is an annotation datum.  (-C) is the length.  */
7077             buf += -c;
7078         }
7079       carryover = buf_end - buf;
7080     }
7081   else
7082     {
7083       /* Source characters are at coding->source.  */
7084       const unsigned char *src = coding->source;
7085       const unsigned char *src_end = src + coding->consumed;
7086
7087       if (EQ (coding->dst_object, coding->src_object))
7088         {
7089           eassert (growable_destination (coding));
7090           dst_end = (unsigned char *) src;
7091         }
7092       if (coding->src_multibyte != coding->dst_multibyte)
7093         {
7094           if (coding->src_multibyte)
7095             {
7096               bool multibytep = 1;
7097               ptrdiff_t consumed_chars = 0;
7098
7099               while (1)
7100                 {
7101                   const unsigned char *src_base = src;
7102                   int c;
7103
7104                   ONE_MORE_BYTE (c);
7105                   if (dst == dst_end)
7106                     {
7107                       eassert (growable_destination (coding));
7108                       if (EQ (coding->src_object, coding->dst_object))
7109                         dst_end = (unsigned char *) src;
7110                       if (dst == dst_end)
7111                         {
7112                           ptrdiff_t offset = src - coding->source;
7113
7114                           dst = alloc_destination (coding, src_end - src + 1,
7115                                                    dst);
7116                           dst_end = coding->destination + coding->dst_bytes;
7117                           coding_set_source (coding);
7118                           src = coding->source + offset;
7119                           src_end = coding->source + coding->consumed;
7120                           if (EQ (coding->src_object, coding->dst_object))
7121                             dst_end = (unsigned char *) src;
7122                         }
7123                     }
7124                   *dst++ = c;
7125                   produced_chars++;
7126                 }
7127             no_more_source:
7128               ;
7129             }
7130           else
7131             while (src < src_end)
7132               {
7133                 bool multibytep = 1;
7134                 int c = *src++;
7135
7136                 if (dst >= dst_end - 1)
7137                   {
7138                     eassert (growable_destination (coding));
7139                     if (EQ (coding->src_object, coding->dst_object))
7140                       dst_end = (unsigned char *) src;
7141                     if (dst >= dst_end - 1)
7142                       {
7143                         ptrdiff_t offset = src - coding->source;
7144                         ptrdiff_t more_bytes;
7145
7146                         if (EQ (coding->src_object, coding->dst_object))
7147                           more_bytes = ((src_end - src) / 2) + 2;
7148                         else
7149                           more_bytes = src_end - src + 2;
7150                         dst = alloc_destination (coding, more_bytes, dst);
7151                         dst_end = coding->destination + coding->dst_bytes;
7152                         coding_set_source (coding);
7153                         src = coding->source + offset;
7154                         src_end = coding->source + coding->consumed;
7155                         if (EQ (coding->src_object, coding->dst_object))
7156                           dst_end = (unsigned char *) src;
7157                       }
7158                   }
7159                 EMIT_ONE_BYTE (c);
7160               }
7161         }
7162       else
7163         {
7164           if (!EQ (coding->src_object, coding->dst_object))
7165             {
7166               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7167
7168               if (require > 0)
7169                 {
7170                   ptrdiff_t offset = src - coding->source;
7171
7172                   dst = alloc_destination (coding, require, dst);
7173                   coding_set_source (coding);
7174                   src = coding->source + offset;
7175                   src_end = coding->source + coding->consumed;
7176                 }
7177             }
7178           produced_chars = coding->consumed_char;
7179           while (src < src_end)
7180             *dst++ = *src++;
7181         }
7182     }
7183
7184   produced = dst - (coding->destination + coding->produced);
7185   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7186     insert_from_gap (produced_chars, produced, 0);
7187   coding->produced += produced;
7188   coding->produced_char += produced_chars;
7189   return carryover;
7190 }
7191
7192 /* Compose text in CODING->object according to the annotation data at
7193    CHARBUF.  CHARBUF is an array:
7194      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7195  */
7196
7197 static void
7198 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7199 {
7200   int len;
7201   ptrdiff_t to;
7202   enum composition_method method;
7203   Lisp_Object components;
7204
7205   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7206   to = pos + charbuf[2];
7207   method = (enum composition_method) (charbuf[4]);
7208
7209   if (method == COMPOSITION_RELATIVE)
7210     components = Qnil;
7211   else
7212     {
7213       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7214       int i, j;
7215
7216       if (method == COMPOSITION_WITH_RULE)
7217         len = charbuf[2] * 3 - 2;
7218       charbuf += MAX_ANNOTATION_LENGTH;
7219       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7220       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7221         {
7222           if (charbuf[i] >= 0)
7223             args[j] = make_number (charbuf[i]);
7224           else
7225             {
7226               i++;
7227               args[j] = make_number (charbuf[i] % 0x100);
7228             }
7229         }
7230       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7231     }
7232   compose_text (pos, to, components, Qnil, coding->dst_object);
7233 }
7234
7235
7236 /* Put `charset' property on text in CODING->object according to
7237    the annotation data at CHARBUF.  CHARBUF is an array:
7238      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7239  */
7240
7241 static void
7242 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7243 {
7244   ptrdiff_t from = pos - charbuf[2];
7245   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7246
7247   Fput_text_property (make_number (from), make_number (pos),
7248                       Qcharset, CHARSET_NAME (charset),
7249                       coding->dst_object);
7250 }
7251
/* Upper bound on the number of `int' units that
   ALLOC_CONVERSION_WORK_AREA allocates for a charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the work area CODING->charbuf with SAFE_ALLOCA and record
   its size, in units of `int', in CODING->charbuf_size.  The area
   holds SIZE + MAX_CHARBUF_EXTRA_SIZE units, capped at
   MAX_CHARBUF_SIZE.

   CODING is an expression yielding a pointer to the coding system;
   it is evaluated twice, so it must not have side effects.  Because
   SAFE_ALLOCA is used, the expansion must appear in a function that
   has already executed USE_SAFE_ALLOCA.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7265
7266 static void
7267 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7268 {
7269   int *charbuf = coding->charbuf;
7270   int *charbuf_end = charbuf + coding->charbuf_used;
7271
7272   if (NILP (coding->dst_object))
7273     return;
7274
7275   while (charbuf < charbuf_end)
7276     {
7277       if (*charbuf >= 0)
7278         pos++, charbuf++;
7279       else
7280         {
7281           int len = -*charbuf;
7282
7283           if (len > 2)
7284             switch (charbuf[1])
7285               {
7286               case CODING_ANNOTATE_COMPOSITION_MASK:
7287                 produce_composition (coding, charbuf, pos);
7288                 break;
7289               case CODING_ANNOTATE_CHARSET_MASK:
7290                 produce_charset (coding, charbuf, pos);
7291                 break;
7292               default:
7293                 break;
7294               }
7295           charbuf += len;
7296         }
7297     }
7298 }
7299
7300 /* Decode the data at CODING->src_object into CODING->dst_object.
7301    CODING->src_object is a buffer, a string, or nil.
7302    CODING->dst_object is a buffer.
7303
7304    If CODING->src_object is a buffer, it must be the current buffer.
7305    In this case, if CODING->src_pos is positive, it is a position of
7306    the source text in the buffer, otherwise, the source text is in the
7307    gap area of the buffer, and CODING->src_pos specifies the offset of
7308    the text from GPT (which must be the same as PT).  If this is the
7309    same buffer as CODING->dst_object, CODING->src_pos must be
7310    negative.
7311
7312    If CODING->src_object is a string, CODING->src_pos is an index to
7313    that string.
7314
7315    If CODING->src_object is nil, CODING->source must already point to
7316    the non-relocatable memory area.  In this case, CODING->src_pos is
7317    an offset from CODING->source.
7318
7319    The decoded data is inserted at the current point of the buffer
7320    CODING->dst_object.
7321 */
7322
7323 static void
7324 decode_coding (struct coding_system *coding)
7325 {
7326   Lisp_Object attrs;
7327   Lisp_Object undo_list;
7328   Lisp_Object translation_table;
7329   struct ccl_spec cclspec;
7330   int carryover;
7331   int i;
7332
7333   USE_SAFE_ALLOCA;
7334
7335   if (BUFFERP (coding->src_object)
7336       && coding->src_pos > 0
7337       && coding->src_pos < GPT
7338       && coding->src_pos + coding->src_chars > GPT)
7339     move_gap_both (coding->src_pos, coding->src_pos_byte);
7340
7341   undo_list = Qt;
7342   if (BUFFERP (coding->dst_object))
7343     {
7344       set_buffer_internal (XBUFFER (coding->dst_object));
7345       if (GPT != PT)
7346         move_gap_both (PT, PT_BYTE);
7347
7348       /* We must disable undo_list in order to record the whole insert
7349          transaction via record_insert at the end.  But doing so also
7350          disables the recording of the first change to the undo_list.
7351          Therefore we check for first change here and record it via
7352          record_first_change if needed.  */
7353       if (MODIFF <= SAVE_MODIFF)
7354         record_first_change ();
7355
7356       undo_list = BVAR (current_buffer, undo_list);
7357       bset_undo_list (current_buffer, Qt);
7358     }
7359
7360   coding->consumed = coding->consumed_char = 0;
7361   coding->produced = coding->produced_char = 0;
7362   coding->chars_at_source = 0;
7363   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7364
7365   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7366
7367   attrs = CODING_ID_ATTRS (coding->id);
7368   translation_table = get_translation_table (attrs, 0, NULL);
7369
7370   carryover = 0;
7371   if (coding->decoder == decode_coding_ccl)
7372     {
7373       coding->spec.ccl = &cclspec;
7374       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7375     }
7376   do
7377     {
7378       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7379
7380       coding_set_source (coding);
7381       coding->annotated = 0;
7382       coding->charbuf_used = carryover;
7383       (*(coding->decoder)) (coding);
7384       coding_set_destination (coding);
7385       carryover = produce_chars (coding, translation_table, 0);
7386       if (coding->annotated)
7387         produce_annotation (coding, pos);
7388       for (i = 0; i < carryover; i++)
7389         coding->charbuf[i]
7390           = coding->charbuf[coding->charbuf_used - carryover + i];
7391     }
7392   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7393          || (coding->consumed < coding->src_bytes
7394              && (coding->result == CODING_RESULT_SUCCESS
7395                  || coding->result == CODING_RESULT_INVALID_SRC)));
7396
7397   if (carryover > 0)
7398     {
7399       coding_set_destination (coding);
7400       coding->charbuf_used = carryover;
7401       produce_chars (coding, translation_table, 1);
7402     }
7403
7404   coding->carryover_bytes = 0;
7405   if (coding->consumed < coding->src_bytes)
7406     {
7407       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7408       const unsigned char *src;
7409
7410       coding_set_source (coding);
7411       coding_set_destination (coding);
7412       src = coding->source + coding->consumed;
7413
7414       if (coding->mode & CODING_MODE_LAST_BLOCK)
7415         {
7416           /* Flush out unprocessed data as binary chars.  We are sure
7417              that the number of data is less than the size of
7418              coding->charbuf.  */
7419           coding->charbuf_used = 0;
7420           coding->chars_at_source = 0;
7421
7422           while (nbytes-- > 0)
7423             {
7424               int c = *src++;
7425
7426               if (c & 0x80)
7427                 c = BYTE8_TO_CHAR (c);
7428               coding->charbuf[coding->charbuf_used++] = c;
7429             }
7430           produce_chars (coding, Qnil, 1);
7431         }
7432       else
7433         {
7434           /* Record unprocessed bytes in coding->carryover.  We are
7435              sure that the number of data is less than the size of
7436              coding->carryover.  */
7437           unsigned char *p = coding->carryover;
7438
7439           if (nbytes > sizeof coding->carryover)
7440             nbytes = sizeof coding->carryover;
7441           coding->carryover_bytes = nbytes;
7442           while (nbytes-- > 0)
7443             *p++ = *src++;
7444         }
7445       coding->consumed = coding->src_bytes;
7446     }
7447
7448   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7449       && !inhibit_eol_conversion)
7450     decode_eol (coding);
7451   if (BUFFERP (coding->dst_object))
7452     {
7453       bset_undo_list (current_buffer, undo_list);
7454       record_insert (coding->dst_pos, coding->produced_char);
7455     }
7456
7457   SAFE_FREE ();
7458 }
7459
7460
7461 /* Extract an annotation datum from a composition starting at POS and
7462    ending before LIMIT of CODING->src_object (buffer or string), store
7463    the data in BUF, set *STOP to a starting position of the next
7464    composition (if any) or to LIMIT, and return the address of the
7465    next element of BUF.
7466
7467    If such an annotation is not found, set *STOP to a starting
7468    position of a composition after POS (if any) or to LIMIT, and
7469    return BUF.  */
7470
7471 static int *
7472 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7473                                struct coding_system *coding, int *buf,
7474                                ptrdiff_t *stop)
7475 {
7476   ptrdiff_t start, end;
7477   Lisp_Object prop;
7478
7479   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7480       || end > limit)
7481     *stop = limit;
7482   else if (start > pos)
7483     *stop = start;
7484   else
7485     {
7486       if (start == pos)
7487         {
7488           /* We found a composition.  Store the corresponding
7489              annotation data in BUF.  */
7490           int *head = buf;
7491           enum composition_method method = composition_method (prop);
7492           int nchars = COMPOSITION_LENGTH (prop);
7493
7494           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7495           if (method != COMPOSITION_RELATIVE)
7496             {
7497               Lisp_Object components;
7498               ptrdiff_t i, len, i_byte;
7499
7500               components = COMPOSITION_COMPONENTS (prop);
7501               if (VECTORP (components))
7502                 {
7503                   len = ASIZE (components);
7504                   for (i = 0; i < len; i++)
7505                     *buf++ = XINT (AREF (components, i));
7506                 }
7507               else if (STRINGP (components))
7508                 {
7509                   len = SCHARS (components);
7510                   i = i_byte = 0;
7511                   while (i < len)
7512                     {
7513                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7514                       buf++;
7515                     }
7516                 }
7517               else if (INTEGERP (components))
7518                 {
7519                   len = 1;
7520                   *buf++ = XINT (components);
7521                 }
7522               else if (CONSP (components))
7523                 {
7524                   for (len = 0; CONSP (components);
7525                        len++, components = XCDR (components))
7526                     *buf++ = XINT (XCAR (components));
7527                 }
7528               else
7529                 emacs_abort ();
7530               *head -= len;
7531             }
7532         }
7533
7534       if (find_composition (end, limit, &start, &end, &prop,
7535                             coding->src_object)
7536           && end <= limit)
7537         *stop = start;
7538       else
7539         *stop = limit;
7540     }
7541   return buf;
7542 }
7543
7544
7545 /* Extract an annotation datum from a text property `charset' at POS of
7546    CODING->src_object (buffer of string), store the data in BUF, set
7547    *STOP to the position where the value of `charset' property changes
7548    (limiting by LIMIT), and return the address of the next element of
7549    BUF.
7550
7551    If the property value is nil, set *STOP to the position where the
7552    property value is non-nil (limiting by LIMIT), and return BUF.  */
7553
7554 static int *
7555 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7556                            struct coding_system *coding, int *buf,
7557                            ptrdiff_t *stop)
7558 {
7559   Lisp_Object val, next;
7560   int id;
7561
7562   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7563   if (! NILP (val) && CHARSETP (val))
7564     id = XINT (CHARSET_SYMBOL_ID (val));
7565   else
7566     id = -1;
7567   ADD_CHARSET_DATA (buf, 0, id);
7568   next = Fnext_single_property_change (make_number (pos), Qcharset,
7569                                        coding->src_object,
7570                                        make_number (limit));
7571   *stop = XINT (next);
7572   return buf;
7573 }
7574
7575
7576 static void
7577 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7578                int max_lookup)
7579 {
7580   int *buf = coding->charbuf;
7581   int *buf_end = coding->charbuf + coding->charbuf_size;
7582   const unsigned char *src = coding->source + coding->consumed;
7583   const unsigned char *src_end = coding->source + coding->src_bytes;
7584   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7585   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7586   bool multibytep = coding->src_multibyte;
7587   Lisp_Object eol_type;
7588   int c;
7589   ptrdiff_t stop, stop_composition, stop_charset;
7590   int *lookup_buf = NULL;
7591
7592   if (! NILP (translation_table))
7593     lookup_buf = alloca (sizeof (int) * max_lookup);
7594
7595   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7596   if (VECTORP (eol_type))
7597     eol_type = Qunix;
7598
7599   /* Note: composition handling is not yet implemented.  */
7600   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7601
7602   if (NILP (coding->src_object))
7603     stop = stop_composition = stop_charset = end_pos;
7604   else
7605     {
7606       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7607         stop = stop_composition = pos;
7608       else
7609         stop = stop_composition = end_pos;
7610       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7611         stop = stop_charset = pos;
7612       else
7613         stop_charset = end_pos;
7614     }
7615
7616   /* Compensate for CRLF and conversion.  */
7617   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7618   while (buf < buf_end)
7619     {
7620       Lisp_Object trans;
7621
7622       if (pos == stop)
7623         {
7624           if (pos == end_pos)
7625             break;
7626           if (pos == stop_composition)
7627             buf = handle_composition_annotation (pos, end_pos, coding,
7628                                                  buf, &stop_composition);
7629           if (pos == stop_charset)
7630             buf = handle_charset_annotation (pos, end_pos, coding,
7631                                              buf, &stop_charset);
7632           stop = (stop_composition < stop_charset
7633                   ? stop_composition : stop_charset);
7634         }
7635
7636       if (! multibytep)
7637         {
7638           int bytes;
7639
7640           if (coding->encoder == encode_coding_raw_text
7641               || coding->encoder == encode_coding_ccl)
7642             c = *src++, pos++;
7643           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7644             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7645           else
7646             c = BYTE8_TO_CHAR (*src), src++, pos++;
7647         }
7648       else
7649         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7650       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7651         c = '\n';
7652       if (! EQ (eol_type, Qunix))
7653         {
7654           if (c == '\n')
7655             {
7656               if (EQ (eol_type, Qdos))
7657                 *buf++ = '\r';
7658               else
7659                 c = '\r';
7660             }
7661         }
7662
7663       trans = Qnil;
7664       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7665       if (NILP (trans))
7666         *buf++ = c;
7667       else
7668         {
7669           ptrdiff_t from_nchars = 1, to_nchars = 1;
7670           int *lookup_buf_end;
7671           const unsigned char *p = src;
7672           int i;
7673
7674           lookup_buf[0] = c;
7675           for (i = 1; i < max_lookup && p < src_end; i++)
7676             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7677           lookup_buf_end = lookup_buf + i;
7678           trans = get_translation (trans, lookup_buf, lookup_buf_end,
7679                                    &from_nchars);
7680           if (INTEGERP (trans))
7681             c = XINT (trans);
7682           else if (VECTORP (trans))
7683             {
7684               to_nchars = ASIZE (trans);
7685               if (buf_end - buf < to_nchars)
7686                 break;
7687               c = XINT (AREF (trans, 0));
7688             }
7689           else
7690             break;
7691           *buf++ = c;
7692           for (i = 1; i < to_nchars; i++)
7693             *buf++ = XINT (AREF (trans, i));
7694           for (i = 1; i < from_nchars; i++, pos++)
7695             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7696         }
7697     }
7698
7699   coding->consumed = src - coding->source;
7700   coding->consumed_char = pos - coding->src_pos;
7701   coding->charbuf_used = buf - coding->charbuf;
7702   coding->chars_at_source = 0;
7703 }
7704
7705
7706 /* Encode the text at CODING->src_object into CODING->dst_object.
7707    CODING->src_object is a buffer or a string.
7708    CODING->dst_object is a buffer or nil.
7709
7710    If CODING->src_object is a buffer, it must be the current buffer.
7711    In this case, if CODING->src_pos is positive, it is a position of
7712    the source text in the buffer, otherwise. the source text is in the
7713    gap area of the buffer, and coding->src_pos specifies the offset of
7714    the text from GPT (which must be the same as PT).  If this is the
7715    same buffer as CODING->dst_object, CODING->src_pos must be
7716    negative and CODING should not have `pre-write-conversion'.
7717
7718    If CODING->src_object is a string, CODING should not have
7719    `pre-write-conversion'.
7720
7721    If CODING->dst_object is a buffer, the encoded data is inserted at
7722    the current point of that buffer.
7723
7724    If CODING->dst_object is nil, the encoded data is placed at the
7725    memory area specified by CODING->destination.  */
7726
7727 static void
7728 encode_coding (struct coding_system *coding)
7729 {
7730   Lisp_Object attrs;
7731   Lisp_Object translation_table;
7732   int max_lookup;
7733   struct ccl_spec cclspec;
7734
7735   USE_SAFE_ALLOCA;
7736
7737   attrs = CODING_ID_ATTRS (coding->id);
7738   if (coding->encoder == encode_coding_raw_text)
7739     translation_table = Qnil, max_lookup = 0;
7740   else
7741     translation_table = get_translation_table (attrs, 1, &max_lookup);
7742
7743   if (BUFFERP (coding->dst_object))
7744     {
7745       set_buffer_internal (XBUFFER (coding->dst_object));
7746       coding->dst_multibyte
7747         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7748     }
7749
7750   coding->consumed = coding->consumed_char = 0;
7751   coding->produced = coding->produced_char = 0;
7752   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7753
7754   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7755
7756   if (coding->encoder == encode_coding_ccl)
7757     {
7758       coding->spec.ccl = &cclspec;
7759       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7760     }
7761   do {
7762     coding_set_source (coding);
7763     consume_chars (coding, translation_table, max_lookup);
7764     coding_set_destination (coding);
7765     (*(coding->encoder)) (coding);
7766   } while (coding->consumed_char < coding->src_chars);
7767
7768   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7769     insert_from_gap (coding->produced_char, coding->produced, 0);
7770
7771   SAFE_FREE ();
7772 }
7773
7774
7775 /* Name (or base name) of work buffer for code conversion.  */
7776 static Lisp_Object Vcode_conversion_workbuf_name;
7777
7778 /* A working buffer used by the top level conversion.  Once it is
7779    created, it is never destroyed.  It has the name
7780    Vcode_conversion_workbuf_name.  The other working buffers are
7781    destroyed after the use is finished, and their names are modified
7782    versions of Vcode_conversion_workbuf_name.  */
7783 static Lisp_Object Vcode_conversion_reused_workbuf;
7784
7785 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7786 static bool reused_workbuf_in_use;
7787
7788
7789 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7790    multibyteness of returning buffer.  */
7791
7792 static Lisp_Object
7793 make_conversion_work_buffer (bool multibyte)
7794 {
7795   Lisp_Object name, workbuf;
7796   struct buffer *current;
7797
7798   if (reused_workbuf_in_use)
7799     {
7800       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7801       workbuf = Fget_buffer_create (name);
7802     }
7803   else
7804     {
7805       reused_workbuf_in_use = 1;
7806       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7807         Vcode_conversion_reused_workbuf
7808           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7809       workbuf = Vcode_conversion_reused_workbuf;
7810     }
7811   current = current_buffer;
7812   set_buffer_internal (XBUFFER (workbuf));
7813   /* We can't allow modification hooks to run in the work buffer.  For
7814      instance, directory_files_internal assumes that file decoding
7815      doesn't compile new regexps.  */
7816   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7817   Ferase_buffer ();
7818   bset_undo_list (current_buffer, Qt);
7819   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7820   set_buffer_internal (current);
7821   return workbuf;
7822 }
7823
7824
7825 static void
7826 code_conversion_restore (Lisp_Object arg)
7827 {
7828   Lisp_Object current, workbuf;
7829
7830   current = XCAR (arg);
7831   workbuf = XCDR (arg);
7832   if (! NILP (workbuf))
7833     {
7834       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7835         reused_workbuf_in_use = 0;
7836       else
7837         Fkill_buffer (workbuf);
7838     }
7839   set_buffer_internal (XBUFFER (current));
7840 }
7841
7842 Lisp_Object
7843 code_conversion_save (bool with_work_buf, bool multibyte)
7844 {
7845   Lisp_Object workbuf = Qnil;
7846
7847   if (with_work_buf)
7848     workbuf = make_conversion_work_buffer (multibyte);
7849   record_unwind_protect (code_conversion_restore,
7850                          Fcons (Fcurrent_buffer (), workbuf));
7851   return workbuf;
7852 }
7853
7854 void
7855 decode_coding_gap (struct coding_system *coding,
7856                    ptrdiff_t chars, ptrdiff_t bytes)
7857 {
7858   ptrdiff_t count = SPECPDL_INDEX ();
7859   Lisp_Object attrs;
7860
7861   coding->src_object = Fcurrent_buffer ();
7862   coding->src_chars = chars;
7863   coding->src_bytes = bytes;
7864   coding->src_pos = -chars;
7865   coding->src_pos_byte = -bytes;
7866   coding->src_multibyte = chars < bytes;
7867   coding->dst_object = coding->src_object;
7868   coding->dst_pos = PT;
7869   coding->dst_pos_byte = PT_BYTE;
7870   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7871
7872   coding->head_ascii = -1;
7873   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7874   coding->eol_seen = EOL_SEEN_NONE;
7875   if (CODING_REQUIRE_DETECTION (coding))
7876     detect_coding (coding);
7877   attrs = CODING_ID_ATTRS (coding->id);
7878   if (! disable_ascii_optimization
7879       && ! coding->src_multibyte
7880       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7881       && NILP (CODING_ATTR_POST_READ (attrs))
7882       && NILP (get_translation_table (attrs, 0, NULL)))
7883     {
7884       chars = coding->head_ascii;
7885       if (chars < 0)
7886         chars = check_ascii (coding);
7887       if (chars != bytes)
7888         {
7889           /* There exists a non-ASCII byte.  */
7890           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7891               && coding->detected_utf8_bytes == coding->src_bytes)
7892             {
7893               if (coding->detected_utf8_chars >= 0)
7894                 chars = coding->detected_utf8_chars;
7895               else
7896                 chars = check_utf_8 (coding);
7897               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7898                   && coding->head_ascii == 0
7899                   && coding->source[0] == UTF_8_BOM_1
7900                   && coding->source[1] == UTF_8_BOM_2
7901                   && coding->source[2] == UTF_8_BOM_3)
7902                 {
7903                   chars--;
7904                   bytes -= 3;
7905                   coding->src_bytes -= 3;
7906                 }
7907             }
7908           else
7909             chars = -1;
7910         }
7911       if (chars >= 0)
7912         {
7913           Lisp_Object eol_type;
7914
7915           eol_type = CODING_ID_EOL_TYPE (coding->id);
7916           if (VECTORP (eol_type))
7917             {
7918               if (coding->eol_seen != EOL_SEEN_NONE)
7919                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7920             }
7921           if (EQ (eol_type, Qmac))
7922             {
7923               unsigned char *src_end = GAP_END_ADDR;
7924               unsigned char *src = src_end - coding->src_bytes;
7925
7926               while (src < src_end)
7927                 {
7928                   if (*src++ == '\r')
7929                     src[-1] = '\n';
7930                 }
7931             }
7932           else if (EQ (eol_type, Qdos))
7933             {
7934               unsigned char *src = GAP_END_ADDR;
7935               unsigned char *src_beg = src - coding->src_bytes;
7936               unsigned char *dst = src;
7937               ptrdiff_t diff;
7938
7939               while (src_beg < src)
7940                 {
7941                   *--dst = *--src;
7942                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7943                     src--;
7944                 }
7945               diff = dst - src;
7946               bytes -= diff;
7947               chars -= diff;
7948             }
7949           coding->produced = bytes;
7950           coding->produced_char = chars;
7951           insert_from_gap (chars, bytes, 1);
7952           return;
7953         }
7954     }
7955   code_conversion_save (0, 0);
7956
7957   coding->mode |= CODING_MODE_LAST_BLOCK;
7958   current_buffer->text->inhibit_shrinking = 1;
7959   decode_coding (coding);
7960   current_buffer->text->inhibit_shrinking = 0;
7961
7962   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7963     {
7964       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7965       Lisp_Object val;
7966
7967       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7968       val = call1 (CODING_ATTR_POST_READ (attrs),
7969                    make_number (coding->produced_char));
7970       CHECK_NATNUM (val);
7971       coding->produced_char += Z - prev_Z;
7972       coding->produced += Z_BYTE - prev_Z_BYTE;
7973     }
7974
7975   unbind_to (count, Qnil);
7976 }
7977
7978
7979 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7980    SRC_OBJECT into DST_OBJECT by coding context CODING.
7981
7982    SRC_OBJECT is a buffer, a string, or Qnil.
7983
7984    If it is a buffer, the text is at point of the buffer.  FROM and TO
7985    are positions in the buffer.
7986
7987    If it is a string, the text is at the beginning of the string.
7988    FROM and TO are indices to the string.
7989
7990    If it is nil, the text is at coding->source.  FROM and TO are
7991    indices to coding->source.
7992
7993    DST_OBJECT is a buffer, Qt, or Qnil.
7994
7995    If it is a buffer, the decoded text is inserted at point of the
7996    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7997    is deleted.
7998
7999    If it is Qt, a string is made from the decoded text, and
8000    set in CODING->dst_object.
8001
8002    If it is Qnil, the decoded text is stored at CODING->destination.
8003    The caller must allocate CODING->dst_bytes bytes at
8004    CODING->destination by xmalloc.  If the decoded text is longer than
8005    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8006  */
8007
8008 void
8009 decode_coding_object (struct coding_system *coding,
8010                       Lisp_Object src_object,
8011                       ptrdiff_t from, ptrdiff_t from_byte,
8012                       ptrdiff_t to, ptrdiff_t to_byte,
8013                       Lisp_Object dst_object)
8014 {
8015   ptrdiff_t count = SPECPDL_INDEX ();
8016   unsigned char *destination;
8017   ptrdiff_t dst_bytes;
8018   ptrdiff_t chars = to - from;
8019   ptrdiff_t bytes = to_byte - from_byte;
8020   Lisp_Object attrs;
8021   ptrdiff_t saved_pt = -1, saved_pt_byte;
8022   bool need_marker_adjustment = 0;
8023   Lisp_Object old_deactivate_mark;
8024
8025   old_deactivate_mark = Vdeactivate_mark;
8026
8027   if (NILP (dst_object))
8028     {
8029       destination = coding->destination;
8030       dst_bytes = coding->dst_bytes;
8031     }
8032
8033   coding->src_object = src_object;
8034   coding->src_chars = chars;
8035   coding->src_bytes = bytes;
8036   coding->src_multibyte = chars < bytes;
8037
8038   if (STRINGP (src_object))
8039     {
8040       coding->src_pos = from;
8041       coding->src_pos_byte = from_byte;
8042     }
8043   else if (BUFFERP (src_object))
8044     {
8045       set_buffer_internal (XBUFFER (src_object));
8046       if (from != GPT)
8047         move_gap_both (from, from_byte);
8048       if (EQ (src_object, dst_object))
8049         {
8050           struct Lisp_Marker *tail;
8051
8052           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8053             {
8054               tail->need_adjustment
8055                 = tail->charpos == (tail->insertion_type ? from : to);
8056               need_marker_adjustment |= tail->need_adjustment;
8057             }
8058           saved_pt = PT, saved_pt_byte = PT_BYTE;
8059           TEMP_SET_PT_BOTH (from, from_byte);
8060           current_buffer->text->inhibit_shrinking = 1;
8061           del_range_both (from, from_byte, to, to_byte, 1);
8062           coding->src_pos = -chars;
8063           coding->src_pos_byte = -bytes;
8064         }
8065       else
8066         {
8067           coding->src_pos = from;
8068           coding->src_pos_byte = from_byte;
8069         }
8070     }
8071
8072   if (CODING_REQUIRE_DETECTION (coding))
8073     detect_coding (coding);
8074   attrs = CODING_ID_ATTRS (coding->id);
8075
8076   if (EQ (dst_object, Qt)
8077       || (! NILP (CODING_ATTR_POST_READ (attrs))
8078           && NILP (dst_object)))
8079     {
8080       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8081       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8082       coding->dst_pos = BEG;
8083       coding->dst_pos_byte = BEG_BYTE;
8084     }
8085   else if (BUFFERP (dst_object))
8086     {
8087       code_conversion_save (0, 0);
8088       coding->dst_object = dst_object;
8089       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8090       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8091       coding->dst_multibyte
8092         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8093     }
8094   else
8095     {
8096       code_conversion_save (0, 0);
8097       coding->dst_object = Qnil;
8098       /* Most callers presume this will return a multibyte result, and they
8099          won't use `binary' or `raw-text' anyway, so let's not worry about
8100          CODING_FOR_UNIBYTE.  */
8101       coding->dst_multibyte = 1;
8102     }
8103
8104   decode_coding (coding);
8105
8106   if (BUFFERP (coding->dst_object))
8107     set_buffer_internal (XBUFFER (coding->dst_object));
8108
8109   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8110     {
8111       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8112       Lisp_Object val;
8113
8114       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8115       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8116                         make_number (coding->produced_char));
8117       CHECK_NATNUM (val);
8118       coding->produced_char += Z - prev_Z;
8119       coding->produced += Z_BYTE - prev_Z_BYTE;
8120     }
8121
8122   if (EQ (dst_object, Qt))
8123     {
8124       coding->dst_object = Fbuffer_string ();
8125     }
8126   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8127     {
8128       set_buffer_internal (XBUFFER (coding->dst_object));
8129       if (dst_bytes < coding->produced)
8130         {
8131           eassert (coding->produced > 0);
8132           destination = xrealloc (destination, coding->produced);
8133           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8134             move_gap_both (BEGV, BEGV_BYTE);
8135           memcpy (destination, BEGV_ADDR, coding->produced);
8136           coding->destination = destination;
8137         }
8138     }
8139
8140   if (saved_pt >= 0)
8141     {
8142       /* This is the case of:
8143          (BUFFERP (src_object) && EQ (src_object, dst_object))
8144          As we have moved PT while replacing the original buffer
8145          contents, we must recover it now.  */
8146       set_buffer_internal (XBUFFER (src_object));
8147       current_buffer->text->inhibit_shrinking = 0;
8148       if (saved_pt < from)
8149         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8150       else if (saved_pt < from + chars)
8151         TEMP_SET_PT_BOTH (from, from_byte);
8152       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8153         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8154                           saved_pt_byte + (coding->produced - bytes));
8155       else
8156         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8157                           saved_pt_byte + (coding->produced - bytes));
8158
8159       if (need_marker_adjustment)
8160         {
8161           struct Lisp_Marker *tail;
8162
8163           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8164             if (tail->need_adjustment)
8165               {
8166                 tail->need_adjustment = 0;
8167                 if (tail->insertion_type)
8168                   {
8169                     tail->bytepos = from_byte;
8170                     tail->charpos = from;
8171                   }
8172                 else
8173                   {
8174                     tail->bytepos = from_byte + coding->produced;
8175                     tail->charpos
8176                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8177                          ? tail->bytepos : from + coding->produced_char);
8178                   }
8179               }
8180         }
8181     }
8182
8183   Vdeactivate_mark = old_deactivate_mark;
8184   unbind_to (count, coding->dst_object);
8185 }
8186
8187
8188 void
8189 encode_coding_object (struct coding_system *coding,
8190                       Lisp_Object src_object,
8191                       ptrdiff_t from, ptrdiff_t from_byte,
8192                       ptrdiff_t to, ptrdiff_t to_byte,
8193                       Lisp_Object dst_object)
8194 {
8195   ptrdiff_t count = SPECPDL_INDEX ();
8196   ptrdiff_t chars = to - from;
8197   ptrdiff_t bytes = to_byte - from_byte;
8198   Lisp_Object attrs;
8199   ptrdiff_t saved_pt = -1, saved_pt_byte;
8200   bool need_marker_adjustment = 0;
8201   bool kill_src_buffer = 0;
8202   Lisp_Object old_deactivate_mark;
8203
8204   old_deactivate_mark = Vdeactivate_mark;
8205
8206   coding->src_object = src_object;
8207   coding->src_chars = chars;
8208   coding->src_bytes = bytes;
8209   coding->src_multibyte = chars < bytes;
8210
8211   attrs = CODING_ID_ATTRS (coding->id);
8212
8213   if (EQ (src_object, dst_object))
8214     {
8215       struct Lisp_Marker *tail;
8216
8217       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8218         {
8219           tail->need_adjustment
8220             = tail->charpos == (tail->insertion_type ? from : to);
8221           need_marker_adjustment |= tail->need_adjustment;
8222         }
8223     }
8224
8225   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8226     {
8227       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8228       set_buffer_internal (XBUFFER (coding->src_object));
8229       if (STRINGP (src_object))
8230         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8231       else if (BUFFERP (src_object))
8232         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8233       else
8234         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8235
8236       if (EQ (src_object, dst_object))
8237         {
8238           set_buffer_internal (XBUFFER (src_object));
8239           saved_pt = PT, saved_pt_byte = PT_BYTE;
8240           del_range_both (from, from_byte, to, to_byte, 1);
8241           set_buffer_internal (XBUFFER (coding->src_object));
8242         }
8243
8244       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8245                   make_number (BEG), make_number (Z));
8246       if (XBUFFER (coding->src_object) != current_buffer)
8247         kill_src_buffer = 1;
8248       coding->src_object = Fcurrent_buffer ();
8249       if (BEG != GPT)
8250         move_gap_both (BEG, BEG_BYTE);
8251       coding->src_chars = Z - BEG;
8252       coding->src_bytes = Z_BYTE - BEG_BYTE;
8253       coding->src_pos = BEG;
8254       coding->src_pos_byte = BEG_BYTE;
8255       coding->src_multibyte = Z < Z_BYTE;
8256     }
8257   else if (STRINGP (src_object))
8258     {
8259       code_conversion_save (0, 0);
8260       coding->src_pos = from;
8261       coding->src_pos_byte = from_byte;
8262     }
8263   else if (BUFFERP (src_object))
8264     {
8265       code_conversion_save (0, 0);
8266       set_buffer_internal (XBUFFER (src_object));
8267       if (EQ (src_object, dst_object))
8268         {
8269           saved_pt = PT, saved_pt_byte = PT_BYTE;
8270           coding->src_object = del_range_1 (from, to, 1, 1);
8271           coding->src_pos = 0;
8272           coding->src_pos_byte = 0;
8273         }
8274       else
8275         {
8276           if (from < GPT && to >= GPT)
8277             move_gap_both (from, from_byte);
8278           coding->src_pos = from;
8279           coding->src_pos_byte = from_byte;
8280         }
8281     }
8282   else
8283     {
8284       code_conversion_save (0, 0);
8285       coding->src_pos = from;
8286       coding->src_pos_byte = from_byte;
8287     }
8288
8289   if (BUFFERP (dst_object))
8290     {
8291       coding->dst_object = dst_object;
8292       if (EQ (src_object, dst_object))
8293         {
8294           coding->dst_pos = from;
8295           coding->dst_pos_byte = from_byte;
8296         }
8297       else
8298         {
8299           struct buffer *current = current_buffer;
8300
8301           set_buffer_temp (XBUFFER (dst_object));
8302           coding->dst_pos = PT;
8303           coding->dst_pos_byte = PT_BYTE;
8304           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8305           set_buffer_temp (current);
8306         }
8307       coding->dst_multibyte
8308         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8309     }
8310   else if (EQ (dst_object, Qt))
8311     {
8312       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8313       coding->dst_object = Qnil;
8314       coding->destination = xmalloc (dst_bytes);
8315       coding->dst_bytes = dst_bytes;
8316       coding->dst_multibyte = 0;
8317     }
8318   else
8319     {
8320       coding->dst_object = Qnil;
8321       coding->dst_multibyte = 0;
8322     }
8323
8324   encode_coding (coding);
8325
8326   if (EQ (dst_object, Qt))
8327     {
8328       if (BUFFERP (coding->dst_object))
8329         coding->dst_object = Fbuffer_string ();
8330       else if (coding->raw_destination)
8331         /* This is used to avoid creating huge Lisp string.
8332            NOTE: caller who sets `raw_destination' is also
8333            responsible for freeing `destination' buffer.  */
8334         coding->dst_object = Qnil;
8335       else
8336         {
8337           coding->dst_object
8338             = make_unibyte_string ((char *) coding->destination,
8339                                    coding->produced);
8340           xfree (coding->destination);
8341         }
8342     }
8343
8344   if (saved_pt >= 0)
8345     {
8346       /* This is the case of:
8347          (BUFFERP (src_object) && EQ (src_object, dst_object))
8348          As we have moved PT while replacing the original buffer
8349          contents, we must recover it now.  */
8350       set_buffer_internal (XBUFFER (src_object));
8351       if (saved_pt < from)
8352         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8353       else if (saved_pt < from + chars)
8354         TEMP_SET_PT_BOTH (from, from_byte);
8355       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8356         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8357                           saved_pt_byte + (coding->produced - bytes));
8358       else
8359         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8360                           saved_pt_byte + (coding->produced - bytes));
8361
8362       if (need_marker_adjustment)
8363         {
8364           struct Lisp_Marker *tail;
8365
8366           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8367             if (tail->need_adjustment)
8368               {
8369                 tail->need_adjustment = 0;
8370                 if (tail->insertion_type)
8371                   {
8372                     tail->bytepos = from_byte;
8373                     tail->charpos = from;
8374                   }
8375                 else
8376                   {
8377                     tail->bytepos = from_byte + coding->produced;
8378                     tail->charpos
8379                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8380                          ? tail->bytepos : from + coding->produced_char);
8381                   }
8382               }
8383         }
8384     }
8385
8386   if (kill_src_buffer)
8387     Fkill_buffer (coding->src_object);
8388
8389   Vdeactivate_mark = old_deactivate_mark;
8390   unbind_to (count, Qnil);
8391 }
8392
8393
8394 Lisp_Object
8395 preferred_coding_system (void)
8396 {
8397   int id = coding_categories[coding_priorities[0]].id;
8398
8399   return CODING_ID_NAME (id);
8400 }
8401
8402 #if defined (WINDOWSNT) || defined (CYGWIN)
8403
8404 Lisp_Object
8405 from_unicode (Lisp_Object str)
8406 {
8407   CHECK_STRING (str);
8408   if (!STRING_MULTIBYTE (str) &&
8409       SBYTES (str) & 1)
8410     {
8411       str = Fsubstring (str, make_number (0), make_number (-1));
8412     }
8413
8414   return code_convert_string_norecord (str, Qutf_16le, 0);
8415 }
8416
8417 Lisp_Object
8418 from_unicode_buffer (const wchar_t *wstr)
8419 {
8420   /* We get one of the two final null bytes for free.  */
8421   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
8422   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
8423   return from_unicode (str);
8424 }
8425
8426 wchar_t *
8427 to_unicode (Lisp_Object str, Lisp_Object *buf)
8428 {
8429   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8430   /* We need to make another copy (in addition to the one made by
8431      code_convert_string_norecord) to ensure that the final string is
8432      _doubly_ zero terminated --- that is, that the string is
8433      terminated by two zero bytes and one utf-16le null character.
8434      Because strings are already terminated with a single zero byte,
8435      we just add one additional zero. */
8436   str = make_uninit_string (SBYTES (*buf) + 1);
8437   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8438   SDATA (str) [SBYTES (*buf)] = '\0';
8439   *buf = str;
8440   return WCSDATA (*buf);
8441 }
8442
8443 #endif /* WINDOWSNT || CYGWIN */
8444
8445 \f
8446 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8448
8449 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8450        doc: /* Return t if OBJECT is nil or a coding-system.
8451 See the documentation of `define-coding-system' for information
8452 about coding-system objects.  */)
8453   (Lisp_Object object)
8454 {
8455   if (NILP (object)
8456       || CODING_SYSTEM_ID (object) >= 0)
8457     return Qt;
8458   if (! SYMBOLP (object)
8459       || NILP (Fget (object, Qcoding_system_define_form)))
8460     return Qnil;
8461   return Qt;
8462 }
8463
8464 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8465        Sread_non_nil_coding_system, 1, 1, 0,
8466        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8467   (Lisp_Object prompt)
8468 {
8469   Lisp_Object val;
8470   do
8471     {
8472       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8473                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8474     }
8475   while (SCHARS (val) == 0);
8476   return (Fintern (val, Qnil));
8477 }
8478
8479 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8480        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8481 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8482 Ignores case when completing coding systems (all Emacs coding systems
8483 are lower-case).  */)
8484   (Lisp_Object prompt, Lisp_Object default_coding_system)
8485 {
8486   Lisp_Object val;
8487   ptrdiff_t count = SPECPDL_INDEX ();
8488
8489   if (SYMBOLP (default_coding_system))
8490     default_coding_system = SYMBOL_NAME (default_coding_system);
8491   specbind (Qcompletion_ignore_case, Qt);
8492   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8493                           Qt, Qnil, Qcoding_system_history,
8494                           default_coding_system, Qnil);
8495   unbind_to (count, Qnil);
8496   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8497 }
8498
8499 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8500        1, 1, 0,
8501        doc: /* Check validity of CODING-SYSTEM.
8502 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8503 It is valid if it is nil or a symbol defined as a coding system by the
8504 function `define-coding-system'.  */)
8505   (Lisp_Object coding_system)
8506 {
8507   Lisp_Object define_form;
8508
8509   define_form = Fget (coding_system, Qcoding_system_define_form);
8510   if (! NILP (define_form))
8511     {
8512       Fput (coding_system, Qcoding_system_define_form, Qnil);
8513       safe_eval (define_form);
8514     }
8515   if (!NILP (Fcoding_system_p (coding_system)))
8516     return coding_system;
8517   xsignal1 (Qcoding_system_error, coding_system);
8518 }
8519
8520 \f
8521 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8522    HIGHEST, return the coding system of the highest
8523    priority among the detected coding systems.  Otherwise return a
8524    list of detected coding systems sorted by their priorities.  If
8525    MULTIBYTEP, it is assumed that the bytes are in correct
8526    multibyte form but contains only ASCII and eight-bit chars.
8527    Otherwise, the bytes are raw bytes.
8528
8529    CODING-SYSTEM controls the detection as below:
8530
8531    If it is nil, detect both text-format and eol-format.  If the
8532    text-format part of CODING-SYSTEM is already specified
8533    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8534    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8535    detect only text-format.  */
8536
8537 Lisp_Object
8538 detect_coding_system (const unsigned char *src,
8539                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8540                       bool highest, bool multibytep,
8541                       Lisp_Object coding_system)
8542 {
8543   const unsigned char *src_end = src + src_bytes;
8544   Lisp_Object attrs, eol_type;
8545   Lisp_Object val = Qnil;
8546   struct coding_system coding;
8547   ptrdiff_t id;
8548   struct coding_detection_info detect_info;
8549   enum coding_category base_category;
8550   bool null_byte_found = 0, eight_bit_found = 0;
8551
8552   if (NILP (coding_system))
8553     coding_system = Qundecided;
8554   setup_coding_system (coding_system, &coding);
8555   attrs = CODING_ID_ATTRS (coding.id);
8556   eol_type = CODING_ID_EOL_TYPE (coding.id);
8557   coding_system = CODING_ATTR_BASE_NAME (attrs);
8558
8559   coding.source = src;
8560   coding.src_chars = src_chars;
8561   coding.src_bytes = src_bytes;
8562   coding.src_multibyte = multibytep;
8563   coding.consumed = 0;
8564   coding.mode |= CODING_MODE_LAST_BLOCK;
8565   coding.head_ascii = 0;
8566
8567   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8568
8569   /* At first, detect text-format if necessary.  */
8570   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8571   if (base_category == coding_category_undecided)
8572     {
8573       enum coding_category category UNINIT;
8574       struct coding_system *this UNINIT;
8575       int c, i;
8576       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8577                                        inhibit_null_byte_detection);
8578       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8579                                        inhibit_iso_escape_detection);
8580       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8581
8582       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8583       for (; src < src_end; src++)
8584         {
8585           c = *src;
8586           if (c & 0x80)
8587             {
8588               eight_bit_found = 1;
8589               if (null_byte_found)
8590                 break;
8591             }
8592           else if (c < 0x20)
8593             {
8594               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8595                   && ! inhibit_ied
8596                   && ! detect_info.checked)
8597                 {
8598                   if (detect_coding_iso_2022 (&coding, &detect_info))
8599                     {
8600                       /* We have scanned the whole data.  */
8601                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8602                         {
8603                           /* We didn't find an 8-bit code.  We may
8604                              have found a null-byte, but it's very
8605                              rare that a binary file confirm to
8606                              ISO-2022.  */
8607                           src = src_end;
8608                           coding.head_ascii = src - coding.source;
8609                         }
8610                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8611                       break;
8612                     }
8613                 }
8614               else if (! c && !inhibit_nbd)
8615                 {
8616                   null_byte_found = 1;
8617                   if (eight_bit_found)
8618                     break;
8619                 }
8620               if (! eight_bit_found)
8621                 coding.head_ascii++;
8622             }
8623           else if (! eight_bit_found)
8624             coding.head_ascii++;
8625         }
8626
8627       if (null_byte_found || eight_bit_found
8628           || coding.head_ascii < coding.src_bytes
8629           || detect_info.found)
8630         {
8631           if (coding.head_ascii == coding.src_bytes)
8632             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8633             for (i = 0; i < coding_category_raw_text; i++)
8634               {
8635                 category = coding_priorities[i];
8636                 this = coding_categories + category;
8637                 if (detect_info.found & (1 << category))
8638                   break;
8639               }
8640           else
8641             {
8642               if (null_byte_found)
8643                 {
8644                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8645                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8646                 }
8647               else if (prefer_utf_8
8648                        && detect_coding_utf_8 (&coding, &detect_info))
8649                 {
8650                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8651                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8652                 }
8653               for (i = 0; i < coding_category_raw_text; i++)
8654                 {
8655                   category = coding_priorities[i];
8656                   this = coding_categories + category;
8657
8658                   if (this->id < 0)
8659                     {
8660                       /* No coding system of this category is defined.  */
8661                       detect_info.rejected |= (1 << category);
8662                     }
8663                   else if (category >= coding_category_raw_text)
8664                     continue;
8665                   else if (detect_info.checked & (1 << category))
8666                     {
8667                       if (highest
8668                           && (detect_info.found & (1 << category)))
8669                         break;
8670                     }
8671                   else if ((*(this->detector)) (&coding, &detect_info)
8672                            && highest
8673                            && (detect_info.found & (1 << category)))
8674                     {
8675                       if (category == coding_category_utf_16_auto)
8676                         {
8677                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8678                             category = coding_category_utf_16_le;
8679                           else
8680                             category = coding_category_utf_16_be;
8681                         }
8682                       break;
8683                     }
8684                 }
8685             }
8686         }
8687
8688       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8689           || null_byte_found)
8690         {
8691           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8692           id = CODING_SYSTEM_ID (Qno_conversion);
8693           val = list1 (make_number (id));
8694         }
8695       else if (! detect_info.rejected && ! detect_info.found)
8696         {
8697           detect_info.found = CATEGORY_MASK_ANY;
8698           id = coding_categories[coding_category_undecided].id;
8699           val = list1 (make_number (id));
8700         }
8701       else if (highest)
8702         {
8703           if (detect_info.found)
8704             {
8705               detect_info.found = 1 << category;
8706               val = list1 (make_number (this->id));
8707             }
8708           else
8709             for (i = 0; i < coding_category_raw_text; i++)
8710               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8711                 {
8712                   detect_info.found = 1 << coding_priorities[i];
8713                   id = coding_categories[coding_priorities[i]].id;
8714                   val = list1 (make_number (id));
8715                   break;
8716                 }
8717         }
8718       else
8719         {
8720           int mask = detect_info.rejected | detect_info.found;
8721           int found = 0;
8722
8723           for (i = coding_category_raw_text - 1; i >= 0; i--)
8724             {
8725               category = coding_priorities[i];
8726               if (! (mask & (1 << category)))
8727                 {
8728                   found |= 1 << category;
8729                   id = coding_categories[category].id;
8730                   if (id >= 0)
8731                     val = list1 (make_number (id));
8732                 }
8733             }
8734           for (i = coding_category_raw_text - 1; i >= 0; i--)
8735             {
8736               category = coding_priorities[i];
8737               if (detect_info.found & (1 << category))
8738                 {
8739                   id = coding_categories[category].id;
8740                   val = Fcons (make_number (id), val);
8741                 }
8742             }
8743           detect_info.found |= found;
8744         }
8745     }
8746   else if (base_category == coding_category_utf_8_auto)
8747     {
8748       if (detect_coding_utf_8 (&coding, &detect_info))
8749         {
8750           struct coding_system *this;
8751
8752           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8753             this = coding_categories + coding_category_utf_8_sig;
8754           else
8755             this = coding_categories + coding_category_utf_8_nosig;
8756           val = list1 (make_number (this->id));
8757         }
8758     }
8759   else if (base_category == coding_category_utf_16_auto)
8760     {
8761       if (detect_coding_utf_16 (&coding, &detect_info))
8762         {
8763           struct coding_system *this;
8764
8765           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8766             this = coding_categories + coding_category_utf_16_le;
8767           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8768             this = coding_categories + coding_category_utf_16_be;
8769           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8770             this = coding_categories + coding_category_utf_16_be_nosig;
8771           else
8772             this = coding_categories + coding_category_utf_16_le_nosig;
8773           val = list1 (make_number (this->id));
8774         }
8775     }
8776   else
8777     {
8778       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8779       val = list1 (make_number (coding.id));
8780     }
8781
8782   /* Then, detect eol-format if necessary.  */
8783   {
8784     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8785     Lisp_Object tail;
8786
8787     if (VECTORP (eol_type))
8788       {
8789         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8790           {
8791             if (null_byte_found)
8792               normal_eol = EOL_SEEN_LF;
8793             else
8794               normal_eol = detect_eol (coding.source, src_bytes,
8795                                        coding_category_raw_text);
8796           }
8797         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8798                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8799           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8800                                       coding_category_utf_16_be);
8801         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8802                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8803           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8804                                       coding_category_utf_16_le);
8805       }
8806     else
8807       {
8808         if (EQ (eol_type, Qunix))
8809           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8810         else if (EQ (eol_type, Qdos))
8811           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8812         else
8813           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8814       }
8815
8816     for (tail = val; CONSP (tail); tail = XCDR (tail))
8817       {
8818         enum coding_category category;
8819         int this_eol;
8820
8821         id = XINT (XCAR (tail));
8822         attrs = CODING_ID_ATTRS (id);
8823         category = XINT (CODING_ATTR_CATEGORY (attrs));
8824         eol_type = CODING_ID_EOL_TYPE (id);
8825         if (VECTORP (eol_type))
8826           {
8827             if (category == coding_category_utf_16_be
8828                 || category == coding_category_utf_16_be_nosig)
8829               this_eol = utf_16_be_eol;
8830             else if (category == coding_category_utf_16_le
8831                      || category == coding_category_utf_16_le_nosig)
8832               this_eol = utf_16_le_eol;
8833             else
8834               this_eol = normal_eol;
8835
8836             if (this_eol == EOL_SEEN_LF)
8837               XSETCAR (tail, AREF (eol_type, 0));
8838             else if (this_eol == EOL_SEEN_CRLF)
8839               XSETCAR (tail, AREF (eol_type, 1));
8840             else if (this_eol == EOL_SEEN_CR)
8841               XSETCAR (tail, AREF (eol_type, 2));
8842             else
8843               XSETCAR (tail, CODING_ID_NAME (id));
8844           }
8845         else
8846           XSETCAR (tail, CODING_ID_NAME (id));
8847       }
8848   }
8849
8850   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8851 }
8852
8853
8854 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8855        2, 3, 0,
8856        doc: /* Detect coding system of the text in the region between START and END.
8857 Return a list of possible coding systems ordered by priority.
8858 The coding systems to try and their priorities follows what
8859 the function `coding-system-priority-list' (which see) returns.
8860
8861 If only ASCII characters are found (except for such ISO-2022 control
8862 characters as ESC), it returns a list of single element `undecided'
8863 or its subsidiary coding system according to a detected end-of-line
8864 format.
8865
8866 If optional argument HIGHEST is non-nil, return the coding system of
8867 highest priority.  */)
8868   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8869 {
8870   ptrdiff_t from, to;
8871   ptrdiff_t from_byte, to_byte;
8872
8873   validate_region (&start, &end);
8874   from = XINT (start), to = XINT (end);
8875   from_byte = CHAR_TO_BYTE (from);
8876   to_byte = CHAR_TO_BYTE (to);
8877
8878   if (from < GPT && to >= GPT)
8879     move_gap_both (to, to_byte);
8880
8881   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8882                                to - from, to_byte - from_byte,
8883                                !NILP (highest),
8884                                !NILP (BVAR (current_buffer
8885                                       , enable_multibyte_characters)),
8886                                Qnil);
8887 }
8888
8889 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8890        1, 2, 0,
8891        doc: /* Detect coding system of the text in STRING.
8892 Return a list of possible coding systems ordered by priority.
8893 The coding systems to try and their priorities follows what
8894 the function `coding-system-priority-list' (which see) returns.
8895
8896 If only ASCII characters are found (except for such ISO-2022 control
8897 characters as ESC), it returns a list of single element `undecided'
8898 or its subsidiary coding system according to a detected end-of-line
8899 format.
8900
8901 If optional argument HIGHEST is non-nil, return the coding system of
8902 highest priority.  */)
8903   (Lisp_Object string, Lisp_Object highest)
8904 {
8905   CHECK_STRING (string);
8906
8907   return detect_coding_system (SDATA (string),
8908                                SCHARS (string), SBYTES (string),
8909                                !NILP (highest), STRING_MULTIBYTE (string),
8910                                Qnil);
8911 }
8912
8913
8914 static bool
8915 char_encodable_p (int c, Lisp_Object attrs)
8916 {
8917   Lisp_Object tail;
8918   struct charset *charset;
8919   Lisp_Object translation_table;
8920
8921   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8922   if (! NILP (translation_table))
8923     c = translate_char (translation_table, c);
8924   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8925        CONSP (tail); tail = XCDR (tail))
8926     {
8927       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8928       if (CHAR_CHARSET_P (c, charset))
8929         break;
8930     }
8931   return (! NILP (tail));
8932 }
8933
8934
/* Return a list of coding systems that safely encode the text between
   START and END.  If EXCLUDE is non-nil, it is a list of coding
   systems not to check.  The returned list doesn't contain any such
   coding systems.  In any case, if the text contains only ASCII or is
   unibyte, return t.

   START may also be a string; then the whole string is examined and
   END is not consulted.  Only coding systems whose name is their own
   base name are candidates.  A non-t result always ends with
   Qraw_text and Qno_conversion.  */

DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 3, 0,
       doc: /* Internal use only.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
{
  Lisp_Object coding_attrs_list, safe_codings;
  ptrdiff_t start_byte, end_byte;
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, work_table;

  if (STRINGP (start))
    {
      /* A unibyte string, or a multibyte one whose character and byte
         counts agree (i.e. all ASCII), needs no check.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qt;
      start_byte = 0;
      end_byte = SBYTES (start);
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
        return Qt;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* Same number of characters and bytes: the region is all ASCII.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qt;

      /* The scan below walks one contiguous byte range, so get the
         gap out of the region, moving it to whichever end is nearer.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
    }

  /* Collect the attribute vectors of all candidate coding systems:
     every member of Vcoding_system_list that is not in EXCLUDE and
     that is a base coding system.  */
  coding_attrs_list = Qnil;
  for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
    if (NILP (exclude)
        || NILP (Fmemq (XCAR (tail), exclude)))
      {
        Lisp_Object attrs;

        attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
        if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
          {
            /* Store the translation table into ATTRS itself;
               char_encodable_p reads it back from there (presumably
               the second argument 1 selects the table used for
               encoding -- confirm against get_translation_table).  */
            ASET (attrs, coding_attr_trans_tbl,
                  get_translation_table (attrs, 1, NULL));
            coding_attrs_list = Fcons (attrs, coding_attrs_list);
          }
      }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim leading and trailing ASCII; only non-ASCII characters can
     disqualify a candidate here.  */
  while (p < pend && ASCII_CHAR_P (*p)) p++;
  while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;

  /* WORK_TABLE records the characters already examined, so that each
     distinct character is checked only once.  */
  work_table = Fmake_char_table (Qnil, Qnil);
  while (p < pend)
    {
      if (ASCII_CHAR_P (*p))
        p++;
      else
        {
          c = STRING_CHAR_ADVANCE (p);
          if (!NILP (char_table_ref (work_table, c)))
            /* This character was already checked.  Ignore it.  */
            continue;

          charset_map_loaded = 0;
          /* Remove from CODING_ATTRS_LIST, destructively, every
             candidate that cannot encode C.  TAIL is advanced only
             when the current cell is kept or blanked.  */
          for (tail = coding_attrs_list; CONSP (tail);)
            {
              elt = XCAR (tail);
              if (NILP (elt))
                tail = XCDR (tail);
              else if (char_encodable_p (c, elt))
                tail = XCDR (tail);
              else if (CONSP (XCDR (tail)))
                {
                  /* Splice out this cell by copying the next cell
                     into it; the copied element is examined on the
                     next iteration.  */
                  XSETCAR (tail, XCAR (XCDR (tail)));
                  XSETCDR (tail, XCDR (XCDR (tail)));
                }
              else
                {
                  /* Last cell: it cannot be unlinked from here, so
                     blank it; nil elements are skipped later.  */
                  XSETCAR (tail, Qnil);
                  tail = XCDR (tail);
                }
            }
          if (charset_map_loaded)
            {
              /* NOTE(review): presumably loading a charset map may
                 relocate the text being scanned; rebuild P and PEND
                 from their offsets relative to a fresh PBEG.  */
              ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
          char_table_set (work_table, c, Qt);
        }
    }

  /* Prepend the base name of every surviving candidate to the two
     coding systems that are always included.  */
  safe_codings = list2 (Qraw_text, Qno_conversion);
  for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
    if (! NILP (XCAR (tail)))
      safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);

  return safe_codings;
}
9061
9062
DEFUN ("unencodable-char-position", Funencodable_char_position,
       Sunencodable_char_position, 3, 5, 0,
       doc: /* Return position of first un-encodable character in a region.
START and END specify the region and CODING-SYSTEM specifies the
encoding to check.  Return nil if CODING-SYSTEM does encode the region.

If optional 4th argument COUNT is non-nil, it specifies at most how
many un-encodable characters to search.  In this case, the value is a
list of positions.

If optional 5th argument STRING is non-nil, it is a string to search
for un-encodable characters.  In that case, START and END are indexes
to the string and treated as in `substring'.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
   Lisp_Object count, Lisp_Object string)
{
  EMACS_INT n;
  struct coding_system coding;
  Lisp_Object attrs, charset_list, translation_table;
  Lisp_Object positions;
  ptrdiff_t from, to;
  const unsigned char *p, *stop, *pend;
  bool ascii_compatible;

  setup_coding_system (Fcheck_coding_system (coding_system), &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  /* A coding system of type raw-text is never reported as failing.  */
  if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
    return Qnil;
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  translation_table = get_translation_table (attrs, 1, NULL);

  if (NILP (string))
    {
      /* Search the current buffer.  FROM doubles as the character
         position of the character P points at.  */
      validate_region (&start, &end);
      from = XINT (start);
      to = XINT (end);
      /* Nothing to report for a unibyte buffer, or for an all-ASCII
         region (as many bytes as characters) when the coding system
         is ASCII compatible.  */
      if (NILP (BVAR (current_buffer, enable_multibyte_characters))
          || (ascii_compatible
              && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
        return Qnil;
      p = CHAR_POS_ADDR (from);
      pend = CHAR_POS_ADDR (to);
      /* The gap is not moved here.  If it lies inside the region,
         scan up to the gap first; the loop below resumes after it.  */
      if (from < GPT && to >= GPT)
        stop = GPT_ADDR;
      else
        stop = pend;
    }
  else
    {
      /* Search STRING; FROM and TO become character indices into it.  */
      CHECK_STRING (string);
      validate_subarray (string, start, end, SCHARS (string), &from, &to);
      if (! STRING_MULTIBYTE (string))
        return Qnil;
      p = SDATA (string) + string_char_to_byte (string, from);
      stop = pend = SDATA (string) + string_char_to_byte (string, to);
      if (ascii_compatible && (to - from) == (pend - p))
        return Qnil;
    }

  /* N is the number of positions still wanted.
     NOTE(review): with COUNT equal to 0, N is decremented past zero
     without ever comparing equal to 0 below, so every un-encodable
     position in the range is collected.  */
  if (NILP (count))
    n = 1;
  else
    {
      CHECK_NATNUM (count);
      n = XINT (count);
    }

  positions = Qnil;
  charset_map_loaded = 0;
  while (1)
    {
      int c;

      /* Fast path: skip over a run of ASCII bytes.  */
      if (ascii_compatible)
        while (p < stop && ASCII_CHAR_P (*p))
          p++, from++;
      if (p >= stop)
        {
          if (p >= pend)
            break;
          /* Reached the gap, not the end: continue with the text
             that follows the gap.  */
          stop = pend;
          p = GAP_END_ADDR;
        }

      c = STRING_CHAR_ADVANCE (p);
      if (! (ASCII_CHAR_P (c) && ascii_compatible)
          && ! char_charset (translate_char (translation_table, c),
                             charset_list, NULL))
        {
          /* No charset of the coding system contains the (translated)
             character: record its position.  */
          positions = Fcons (make_number (from), positions);
          n--;
          if (n == 0)
            break;
        }

      from++;
      if (charset_map_loaded && NILP (string))
        {
          /* NOTE(review): presumably loading a charset map may
             relocate buffer text; recompute the pointers from the
             character positions.  */
          p = CHAR_POS_ADDR (from);
          pend = CHAR_POS_ADDR (to);
          if (from < GPT && to >= GPT)
            stop = GPT_ADDR;
          else
            stop = pend;
          charset_map_loaded = 0;
        }
    }

  /* POSITIONS was built in reverse order.  Without COUNT at most one
     position was collected, and that one (or nil) is returned.  */
  return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
}
9174
9175
9176 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9177        Scheck_coding_systems_region, 3, 3, 0,
9178        doc: /* Check if the region is encodable by coding systems.
9179
9180 START and END are buffer positions specifying the region.
9181 CODING-SYSTEM-LIST is a list of coding systems to check.
9182
9183 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9184 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9185 whole region, POS0, POS1, ... are buffer positions where non-encodable
9186 characters are found.
9187
9188 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9189 value is nil.
9190
9191 START may be a string.  In that case, check if the string is
9192 encodable, and the value contains indices to the string instead of
9193 buffer positions.  END is ignored.
9194
9195 If the current buffer (or START if it is a string) is unibyte, the value
9196 is nil.  */)
9197   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9198 {
9199   Lisp_Object list;
9200   ptrdiff_t start_byte, end_byte;
9201   ptrdiff_t pos;
9202   const unsigned char *p, *pbeg, *pend;
9203   int c;
9204   Lisp_Object tail, elt, attrs;
9205
9206   if (STRINGP (start))
9207     {
9208       if (!STRING_MULTIBYTE (start)
9209           || SCHARS (start) == SBYTES (start))
9210         return Qnil;
9211       start_byte = 0;
9212       end_byte = SBYTES (start);
9213       pos = 0;
9214     }
9215   else
9216     {
9217       CHECK_NUMBER_COERCE_MARKER (start);
9218       CHECK_NUMBER_COERCE_MARKER (end);
9219       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9220         args_out_of_range (start, end);
9221       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9222         return Qnil;
9223       start_byte = CHAR_TO_BYTE (XINT (start));
9224       end_byte = CHAR_TO_BYTE (XINT (end));
9225       if (XINT (end) - XINT (start) == end_byte - start_byte)
9226         return Qnil;
9227
9228       if (XINT (start) < GPT && XINT (end) > GPT)
9229         {
9230           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9231             move_gap_both (XINT (start), start_byte);
9232           else
9233             move_gap_both (XINT (end), end_byte);
9234         }
9235       pos = XINT (start);
9236     }
9237
9238   list = Qnil;
9239   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9240     {
9241       elt = XCAR (tail);
9242       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9243       ASET (attrs, coding_attr_trans_tbl,
9244             get_translation_table (attrs, 1, NULL));
9245       list = Fcons (list2 (elt, attrs), list);
9246     }
9247
9248   if (STRINGP (start))
9249     p = pbeg = SDATA (start);
9250   else
9251     p = pbeg = BYTE_POS_ADDR (start_byte);
9252   pend = p + (end_byte - start_byte);
9253
9254   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9255   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9256
9257   while (p < pend)
9258     {
9259       if (ASCII_CHAR_P (*p))
9260         p++;
9261       else
9262         {
9263           c = STRING_CHAR_ADVANCE (p);
9264
9265           charset_map_loaded = 0;
9266           for (tail = list; CONSP (tail); tail = XCDR (tail))
9267             {
9268               elt = XCDR (XCAR (tail));
9269               if (! char_encodable_p (c, XCAR (elt)))
9270                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9271             }
9272           if (charset_map_loaded)
9273             {
9274               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9275
9276               if (STRINGP (start))
9277                 pbeg = SDATA (start);
9278               else
9279                 pbeg = BYTE_POS_ADDR (start_byte);
9280               p = pbeg + p_offset;
9281               pend = pbeg + pend_offset;
9282             }
9283         }
9284       pos++;
9285     }
9286
9287   tail = list;
9288   list = Qnil;
9289   for (; CONSP (tail); tail = XCDR (tail))
9290     {
9291       elt = XCAR (tail);
9292       if (CONSP (XCDR (XCDR (elt))))
9293         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9294                       list);
9295     }
9296
9297   return list;
9298 }
9299
9300
9301 static Lisp_Object
9302 code_convert_region (Lisp_Object start, Lisp_Object end,
9303                      Lisp_Object coding_system, Lisp_Object dst_object,
9304                      bool encodep, bool norecord)
9305 {
9306   struct coding_system coding;
9307   ptrdiff_t from, from_byte, to, to_byte;
9308   Lisp_Object src_object;
9309
9310   if (NILP (coding_system))
9311     coding_system = Qno_conversion;
9312   else
9313     CHECK_CODING_SYSTEM (coding_system);
9314   src_object = Fcurrent_buffer ();
9315   if (NILP (dst_object))
9316     dst_object = src_object;
9317   else if (! EQ (dst_object, Qt))
9318     CHECK_BUFFER (dst_object);
9319
9320   validate_region (&start, &end);
9321   from = XFASTINT (start);
9322   from_byte = CHAR_TO_BYTE (from);
9323   to = XFASTINT (end);
9324   to_byte = CHAR_TO_BYTE (to);
9325
9326   setup_coding_system (coding_system, &coding);
9327   coding.mode |= CODING_MODE_LAST_BLOCK;
9328
9329   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9330     {
9331       struct buffer *buf = XBUFFER (dst_object);
9332       ptrdiff_t buf_pt = BUF_PT (buf);
9333
9334       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9335     }
9336
9337   if (encodep)
9338     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9339                           dst_object);
9340   else
9341     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9342                           dst_object);
9343   if (! norecord)
9344     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9345
9346   return (BUFFERP (dst_object)
9347           ? make_number (coding.produced_char)
9348           : coding.dst_object);
9349 }
9350
9351
9352 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9353        3, 4, "r\nzCoding system: ",
9354        doc: /* Decode the current region from the specified coding system.
9355 When called from a program, takes four arguments:
9356         START, END, CODING-SYSTEM, and DESTINATION.
9357 START and END are buffer positions.
9358
9359 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9360 If nil, the region between START and END is replaced by the decoded text.
9361 If buffer, the decoded text is inserted in that buffer after point (point
9362 does not move).
9363 In those cases, the length of the decoded text is returned.
9364 If DESTINATION is t, the decoded text is returned.
9365
9366 This function sets `last-coding-system-used' to the precise coding system
9367 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9368 not fully specified.)  */)
9369   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9370 {
9371   return code_convert_region (start, end, coding_system, destination, 0, 0);
9372 }
9373
9374 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9375        3, 4, "r\nzCoding system: ",
9376        doc: /* Encode the current region by specified coding system.
9377 When called from a program, takes four arguments:
9378         START, END, CODING-SYSTEM and DESTINATION.
9379 START and END are buffer positions.
9380
9381 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9382 If nil, the region between START and END is replace by the encoded text.
9383 If buffer, the encoded text is inserted in that buffer after point (point
9384 does not move).
9385 In those cases, the length of the encoded text is returned.
9386 If DESTINATION is t, the encoded text is returned.
9387
9388 This function sets `last-coding-system-used' to the precise coding system
9389 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9390 not fully specified.)  */)
9391   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9392 {
9393   return code_convert_region (start, end, coding_system, destination, 1, 0);
9394 }
9395
9396 Lisp_Object
9397 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9398                      Lisp_Object dst_object, bool encodep, bool nocopy,
9399                      bool norecord)
9400 {
9401   struct coding_system coding;
9402   ptrdiff_t chars, bytes;
9403
9404   CHECK_STRING (string);
9405   if (NILP (coding_system))
9406     {
9407       if (! norecord)
9408         Vlast_coding_system_used = Qno_conversion;
9409       if (NILP (dst_object))
9410         return (nocopy ? Fcopy_sequence (string) : string);
9411     }
9412
9413   if (NILP (coding_system))
9414     coding_system = Qno_conversion;
9415   else
9416     CHECK_CODING_SYSTEM (coding_system);
9417   if (NILP (dst_object))
9418     dst_object = Qt;
9419   else if (! EQ (dst_object, Qt))
9420     CHECK_BUFFER (dst_object);
9421
9422   setup_coding_system (coding_system, &coding);
9423   coding.mode |= CODING_MODE_LAST_BLOCK;
9424   chars = SCHARS (string);
9425   bytes = SBYTES (string);
9426
9427   if (BUFFERP (dst_object))
9428     {
9429       struct buffer *buf = XBUFFER (dst_object);
9430       ptrdiff_t buf_pt = BUF_PT (buf);
9431
9432       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9433     }
9434
9435   if (encodep)
9436     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9437   else
9438     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9439   if (! norecord)
9440     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9441
9442   return (BUFFERP (dst_object)
9443           ? make_number (coding.produced_char)
9444           : coding.dst_object);
9445 }
9446
9447
9448 /* Encode or decode STRING according to CODING_SYSTEM.
9449    Do not set Vlast_coding_system_used.
9450
9451    This function is called only from macros DECODE_FILE and
9452    ENCODE_FILE, thus we ignore character composition.  */
9453
9454 Lisp_Object
9455 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9456                               bool encodep)
9457 {
9458   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9459 }
9460
9461 /* Encode or decode a file name, to or from a unibyte string suitable
9462    for passing to C library functions.  */
9463 Lisp_Object
9464 decode_file_name (Lisp_Object fname)
9465 {
9466 #ifdef WINDOWSNT
9467   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9468      converts the file names either to UTF-16LE or to the system ANSI
9469      codepage internally, depending on the underlying OS; see w32.c.  */
9470   if (! NILP (Fcoding_system_p (Qutf_8)))
9471     return code_convert_string_norecord (fname, Qutf_8, 0);
9472   return fname;
9473 #else  /* !WINDOWSNT */
9474   if (! NILP (Vfile_name_coding_system))
9475     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9476   else if (! NILP (Vdefault_file_name_coding_system))
9477     return code_convert_string_norecord (fname,
9478                                          Vdefault_file_name_coding_system, 0);
9479   else
9480     return fname;
9481 #endif
9482 }
9483
9484 Lisp_Object
9485 encode_file_name (Lisp_Object fname)
9486 {
9487   /* This is especially important during bootstrap and dumping, when
9488      file-name encoding is not yet known, and therefore any non-ASCII
9489      file names are unibyte strings, and could only be thrashed if we
9490      try to encode them.  */
9491   if (!STRING_MULTIBYTE (fname))
9492     return fname;
9493 #ifdef WINDOWSNT
9494   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9495      converts the file names either to UTF-16LE or to the system ANSI
9496      codepage internally, depending on the underlying OS; see w32.c.  */
9497   if (! NILP (Fcoding_system_p (Qutf_8)))
9498     return code_convert_string_norecord (fname, Qutf_8, 1);
9499   return fname;
9500 #else  /* !WINDOWSNT */
9501   if (! NILP (Vfile_name_coding_system))
9502     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9503   else if (! NILP (Vdefault_file_name_coding_system))
9504     return code_convert_string_norecord (fname,
9505                                          Vdefault_file_name_coding_system, 1);
9506   else
9507     return fname;
9508 #endif
9509 }
9510
9511 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9512        2, 4, 0,
9513        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9514
9515 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9516 if the decoding operation is trivial.
9517
9518 Optional fourth arg BUFFER non-nil means that the decoded text is
9519 inserted in that buffer after point (point does not move).  In this
9520 case, the return value is the length of the decoded text.
9521
9522 This function sets `last-coding-system-used' to the precise coding system
9523 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9524 not fully specified.)  */)
9525   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9526 {
9527   return code_convert_string (string, coding_system, buffer,
9528                               0, ! NILP (nocopy), 0);
9529 }
9530
9531 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9532        2, 4, 0,
9533        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9534
9535 Optional third arg NOCOPY non-nil means it is OK to return STRING
9536 itself if the encoding operation is trivial.
9537
9538 Optional fourth arg BUFFER non-nil means that the encoded text is
9539 inserted in that buffer after point (point does not move).  In this
9540 case, the return value is the length of the encoded text.
9541
9542 This function sets `last-coding-system-used' to the precise coding system
9543 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9544 not fully specified.)  */)
9545   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9546 {
9547   return code_convert_string (string, coding_system, buffer,
9548                               1, ! NILP (nocopy), 0);
9549 }
9550
9551 \f
9552 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9553        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9554 Return the corresponding character.  */)
9555   (Lisp_Object code)
9556 {
9557   Lisp_Object spec, attrs, val;
9558   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9559   EMACS_INT ch;
9560   int c;
9561
9562   CHECK_NATNUM (code);
9563   ch = XFASTINT (code);
9564   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9565   attrs = AREF (spec, 0);
9566
9567   if (ASCII_CHAR_P (ch)
9568       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9569     return code;
9570
9571   val = CODING_ATTR_CHARSET_LIST (attrs);
9572   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9573   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9574   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9575
9576   if (ch <= 0x7F)
9577     {
9578       c = ch;
9579       charset = charset_roman;
9580     }
9581   else if (ch >= 0xA0 && ch < 0xDF)
9582     {
9583       c = ch - 0x80;
9584       charset = charset_kana;
9585     }
9586   else
9587     {
9588       EMACS_INT c1 = ch >> 8;
9589       int c2 = ch & 0xFF;
9590
9591       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9592           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9593         error ("Invalid code: %"pI"d", ch);
9594       c = ch;
9595       SJIS_TO_JIS (c);
9596       charset = charset_kanji;
9597     }
9598   c = DECODE_CHAR (charset, c);
9599   if (c < 0)
9600     error ("Invalid code: %"pI"d", ch);
9601   return make_number (c);
9602 }
9603
9604
9605 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9606        doc: /* Encode a Japanese character CH to shift_jis encoding.
9607 Return the corresponding code in SJIS.  */)
9608   (Lisp_Object ch)
9609 {
9610   Lisp_Object spec, attrs, charset_list;
9611   int c;
9612   struct charset *charset;
9613   unsigned code;
9614
9615   CHECK_CHARACTER (ch);
9616   c = XFASTINT (ch);
9617   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9618   attrs = AREF (spec, 0);
9619
9620   if (ASCII_CHAR_P (c)
9621       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9622     return ch;
9623
9624   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9625   charset = char_charset (c, charset_list, &code);
9626   if (code == CHARSET_INVALID_CODE (charset))
9627     error ("Can't encode by shift_jis encoding: %c", c);
9628   JIS_TO_SJIS (code);
9629
9630   return make_number (code);
9631 }
9632
9633 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9634        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9635 Return the corresponding character.  */)
9636   (Lisp_Object code)
9637 {
9638   Lisp_Object spec, attrs, val;
9639   struct charset *charset_roman, *charset_big5, *charset;
9640   EMACS_INT ch;
9641   int c;
9642
9643   CHECK_NATNUM (code);
9644   ch = XFASTINT (code);
9645   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9646   attrs = AREF (spec, 0);
9647
9648   if (ASCII_CHAR_P (ch)
9649       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9650     return code;
9651
9652   val = CODING_ATTR_CHARSET_LIST (attrs);
9653   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9654   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9655
9656   if (ch <= 0x7F)
9657     {
9658       c = ch;
9659       charset = charset_roman;
9660     }
9661   else
9662     {
9663       EMACS_INT b1 = ch >> 8;
9664       int b2 = ch & 0x7F;
9665       if (b1 < 0xA1 || b1 > 0xFE
9666           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9667         error ("Invalid code: %"pI"d", ch);
9668       c = ch;
9669       charset = charset_big5;
9670     }
9671   c = DECODE_CHAR (charset, c);
9672   if (c < 0)
9673     error ("Invalid code: %"pI"d", ch);
9674   return make_number (c);
9675 }
9676
9677 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9678        doc: /* Encode the Big5 character CH to BIG5 coding system.
9679 Return the corresponding character code in Big5.  */)
9680   (Lisp_Object ch)
9681 {
9682   Lisp_Object spec, attrs, charset_list;
9683   struct charset *charset;
9684   int c;
9685   unsigned code;
9686
9687   CHECK_CHARACTER (ch);
9688   c = XFASTINT (ch);
9689   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9690   attrs = AREF (spec, 0);
9691   if (ASCII_CHAR_P (c)
9692       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9693     return ch;
9694
9695   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9696   charset = char_charset (c, charset_list, &code);
9697   if (code == CHARSET_INVALID_CODE (charset))
9698     error ("Can't encode by Big5 encoding: %c", c);
9699
9700   return make_number (code);
9701 }
9702
9703 \f
9704 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9705        Sset_terminal_coding_system_internal, 1, 2, 0,
9706        doc: /* Internal use only.  */)
9707   (Lisp_Object coding_system, Lisp_Object terminal)
9708 {
9709   struct terminal *term = decode_live_terminal (terminal);
9710   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9711   CHECK_SYMBOL (coding_system);
9712   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9713   /* We had better not send unsafe characters to terminal.  */
9714   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9715   /* Character composition should be disabled.  */
9716   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9717   terminal_coding->src_multibyte = 1;
9718   terminal_coding->dst_multibyte = 0;
9719   tset_charset_list
9720     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9721             ? coding_charset_list (terminal_coding)
9722             : list1 (make_number (charset_ascii))));
9723   return Qnil;
9724 }
9725
9726 DEFUN ("set-safe-terminal-coding-system-internal",
9727        Fset_safe_terminal_coding_system_internal,
9728        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9729        doc: /* Internal use only.  */)
9730   (Lisp_Object coding_system)
9731 {
9732   CHECK_SYMBOL (coding_system);
9733   setup_coding_system (Fcheck_coding_system (coding_system),
9734                        &safe_terminal_coding);
9735   /* Character composition should be disabled.  */
9736   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9737   safe_terminal_coding.src_multibyte = 1;
9738   safe_terminal_coding.dst_multibyte = 0;
9739   return Qnil;
9740 }
9741
9742 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9743        Sterminal_coding_system, 0, 1, 0,
9744        doc: /* Return coding system specified for terminal output on the given terminal.
9745 TERMINAL may be a terminal object, a frame, or nil for the selected
9746 frame's terminal device.  */)
9747   (Lisp_Object terminal)
9748 {
9749   struct coding_system *terminal_coding
9750     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9751   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9752
9753   /* For backward compatibility, return nil if it is `undecided'.  */
9754   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9755 }
9756
9757 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9758        Sset_keyboard_coding_system_internal, 1, 2, 0,
9759        doc: /* Internal use only.  */)
9760   (Lisp_Object coding_system, Lisp_Object terminal)
9761 {
9762   struct terminal *t = decode_live_terminal (terminal);
9763   CHECK_SYMBOL (coding_system);
9764   if (NILP (coding_system))
9765     coding_system = Qno_conversion;
9766   else
9767     Fcheck_coding_system (coding_system);
9768   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9769   /* Character composition should be disabled.  */
9770   TERMINAL_KEYBOARD_CODING (t)->common_flags
9771     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9772   return Qnil;
9773 }
9774
9775 DEFUN ("keyboard-coding-system",
9776        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9777        doc: /* Return coding system specified for decoding keyboard input.  */)
9778   (Lisp_Object terminal)
9779 {
9780   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9781                          (decode_live_terminal (terminal))->id);
9782 }
9783
9784 \f
9785 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9786        Sfind_operation_coding_system,  1, MANY, 0,
9787        doc: /* Choose a coding system for an operation based on the target name.
9788 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9789 DECODING-SYSTEM is the coding system to use for decoding
9790 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9791 for encoding (in case OPERATION does encoding).
9792
9793 The first argument OPERATION specifies an I/O primitive:
9794   For file I/O, `insert-file-contents' or `write-region'.
9795   For process I/O, `call-process', `call-process-region', or `start-process'.
9796   For network I/O, `open-network-stream'.
9797
9798 The remaining arguments should be the same arguments that were passed
9799 to the primitive.  Depending on which primitive, one of those arguments
9800 is selected as the TARGET.  For example, if OPERATION does file I/O,
9801 whichever argument specifies the file name is TARGET.
9802
9803 TARGET has a meaning which depends on OPERATION:
9804   For file I/O, TARGET is a file name (except for the special case below).
9805   For process I/O, TARGET is a process name.
9806   For network I/O, TARGET is a service name or a port number.
9807
9808 This function looks up what is specified for TARGET in
9809 `file-coding-system-alist', `process-coding-system-alist',
9810 or `network-coding-system-alist' depending on OPERATION.
9811 They may specify a coding system, a cons of coding systems,
9812 or a function symbol to call.
9813 In the last case, we call the function with one argument,
9814 which is a list of all the arguments given to this function.
9815 If the function can't decide a coding system, it can return
9816 `undecided' so that the normal code-detection is performed.
9817
9818 If OPERATION is `insert-file-contents', the argument corresponding to
9819 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9820 file name to look up, and BUFFER is a buffer that contains the file's
9821 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9822 function to call for FILENAME, that function should examine the
9823 contents of BUFFER instead of reading the file.
9824
9825 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9826   (ptrdiff_t nargs, Lisp_Object *args)
9827 {
9828   Lisp_Object operation, target_idx, target, val;
9829   register Lisp_Object chain;
9830
9831   if (nargs < 2)
9832     error ("Too few arguments");
9833   operation = args[0];
9834   if (!SYMBOLP (operation)
9835       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9836     error ("Invalid first argument");
9837   if (nargs <= 1 + XFASTINT (target_idx))
9838     error ("Too few arguments for operation `%s'",
9839            SDATA (SYMBOL_NAME (operation)));
9840   target = args[XFASTINT (target_idx) + 1];
9841   if (!(STRINGP (target)
9842         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9843             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9844         || (EQ (operation, Qopen_network_stream)
9845             && (INTEGERP (target) || EQ (target, Qt)))))
9846     error ("Invalid argument %"pI"d of operation `%s'",
9847            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9848   if (CONSP (target))
9849     target = XCAR (target);
9850
9851   chain = ((EQ (operation, Qinsert_file_contents)
9852             || EQ (operation, Qwrite_region))
9853            ? Vfile_coding_system_alist
9854            : (EQ (operation, Qopen_network_stream)
9855               ? Vnetwork_coding_system_alist
9856               : Vprocess_coding_system_alist));
9857   if (NILP (chain))
9858     return Qnil;
9859
9860   for (; CONSP (chain); chain = XCDR (chain))
9861     {
9862       Lisp_Object elt;
9863
9864       elt = XCAR (chain);
9865       if (CONSP (elt)
9866           && ((STRINGP (target)
9867                && STRINGP (XCAR (elt))
9868                && fast_string_match (XCAR (elt), target) >= 0)
9869               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9870         {
9871           val = XCDR (elt);
9872           /* Here, if VAL is both a valid coding system and a valid
9873              function symbol, we return VAL as a coding system.  */
9874           if (CONSP (val))
9875             return val;
9876           if (! SYMBOLP (val))
9877             return Qnil;
9878           if (! NILP (Fcoding_system_p (val)))
9879             return Fcons (val, val);
9880           if (! NILP (Ffboundp (val)))
9881             {
9882               /* We use call1 rather than safe_call1
9883                  so as to get bug reports about functions called here
9884                  which don't handle the current interface.  */
9885               val = call1 (val, Flist (nargs, args));
9886               if (CONSP (val))
9887                 return val;
9888               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9889                 return Fcons (val, val);
9890             }
9891           return Qnil;
9892         }
9893     }
9894   return Qnil;
9895 }
9896
9897 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9898        Sset_coding_system_priority, 0, MANY, 0,
9899        doc: /* Assign higher priority to the coding systems given as arguments.
9900 If multiple coding systems belong to the same category,
9901 all but the first one are ignored.
9902
9903 usage: (set-coding-system-priority &rest coding-systems)  */)
9904   (ptrdiff_t nargs, Lisp_Object *args)
9905 {
9906   ptrdiff_t i, j;
9907   bool changed[coding_category_max];
9908   enum coding_category priorities[coding_category_max];
9909
9910   memset (changed, 0, sizeof changed);
9911
9912   for (i = j = 0; i < nargs; i++)
9913     {
9914       enum coding_category category;
9915       Lisp_Object spec, attrs;
9916
9917       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9918       attrs = AREF (spec, 0);
9919       category = XINT (CODING_ATTR_CATEGORY (attrs));
9920       if (changed[category])
9921         /* Ignore this coding system because a coding system of the
9922            same category already had a higher priority.  */
9923         continue;
9924       changed[category] = 1;
9925       priorities[j++] = category;
9926       if (coding_categories[category].id >= 0
9927           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9928         setup_coding_system (args[i], &coding_categories[category]);
9929       Fset (AREF (Vcoding_category_table, category), args[i]);
9930     }
9931
9932   /* Now we have decided top J priorities.  Reflect the order of the
9933      original priorities to the remaining priorities.  */
9934
9935   for (i = j, j = 0; i < coding_category_max; i++, j++)
9936     {
9937       while (j < coding_category_max
9938              && changed[coding_priorities[j]])
9939         j++;
9940       if (j == coding_category_max)
9941         emacs_abort ();
9942       priorities[i] = coding_priorities[j];
9943     }
9944
9945   memcpy (coding_priorities, priorities, sizeof priorities);
9946
9947   /* Update `coding-category-list'.  */
9948   Vcoding_category_list = Qnil;
9949   for (i = coding_category_max; i-- > 0; )
9950     Vcoding_category_list
9951       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9952                Vcoding_category_list);
9953
9954   return Qnil;
9955 }
9956
9957 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9958        Scoding_system_priority_list, 0, 1, 0,
9959        doc: /* Return a list of coding systems ordered by their priorities.
9960 The list contains a subset of coding systems; i.e. coding systems
9961 assigned to each coding category (see `coding-category-list').
9962
9963 HIGHESTP non-nil means just return the highest priority one.  */)
9964   (Lisp_Object highestp)
9965 {
9966   int i;
9967   Lisp_Object val;
9968
9969   for (i = 0, val = Qnil; i < coding_category_max; i++)
9970     {
9971       enum coding_category category = coding_priorities[i];
9972       int id = coding_categories[category].id;
9973       Lisp_Object attrs;
9974
9975       if (id < 0)
9976         continue;
9977       attrs = CODING_ID_ATTRS (id);
9978       if (! NILP (highestp))
9979         return CODING_ATTR_BASE_NAME (attrs);
9980       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9981     }
9982   return Fnreverse (val);
9983 }
9984
9985 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9986
9987 static Lisp_Object
9988 make_subsidiaries (Lisp_Object base)
9989 {
9990   Lisp_Object subsidiaries;
9991   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9992   USE_SAFE_ALLOCA;
9993   char *buf = SAFE_ALLOCA (base_name_len + 6);
9994   int i;
9995
9996   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9997   subsidiaries = make_uninit_vector (3);
9998   for (i = 0; i < 3; i++)
9999     {
10000       strcpy (buf + base_name_len, suffixes[i]);
10001       ASET (subsidiaries, i, intern (buf));
10002     }
10003   SAFE_FREE ();
10004   return subsidiaries;
10005 }
10006
10007
10008 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10009        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10010        doc: /* For internal use only.
10011 usage: (define-coding-system-internal ...)  */)
10012   (ptrdiff_t nargs, Lisp_Object *args)
10013 {
10014   Lisp_Object name;
10015   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10016   Lisp_Object attrs;            /* Vector of attributes.  */
10017   Lisp_Object eol_type;
10018   Lisp_Object aliases;
10019   Lisp_Object coding_type, charset_list, safe_charsets;
10020   enum coding_category category;
10021   Lisp_Object tail, val;
10022   int max_charset_id = 0;
10023   int i;
10024
10025   if (nargs < coding_arg_max)
10026     goto short_args;
10027
10028   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10029
10030   name = args[coding_arg_name];
10031   CHECK_SYMBOL (name);
10032   ASET (attrs, coding_attr_base_name, name);
10033
10034   val = args[coding_arg_mnemonic];
10035   if (! STRINGP (val))
10036     CHECK_CHARACTER (val);
10037   ASET (attrs, coding_attr_mnemonic, val);
10038
10039   coding_type = args[coding_arg_coding_type];
10040   CHECK_SYMBOL (coding_type);
10041   ASET (attrs, coding_attr_type, coding_type);
10042
10043   charset_list = args[coding_arg_charset_list];
10044   if (SYMBOLP (charset_list))
10045     {
10046       if (EQ (charset_list, Qiso_2022))
10047         {
10048           if (! EQ (coding_type, Qiso_2022))
10049             error ("Invalid charset-list");
10050           charset_list = Viso_2022_charset_list;
10051         }
10052       else if (EQ (charset_list, Qemacs_mule))
10053         {
10054           if (! EQ (coding_type, Qemacs_mule))
10055             error ("Invalid charset-list");
10056           charset_list = Vemacs_mule_charset_list;
10057         }
10058       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10059         {
10060           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10061             error ("Invalid charset-list");
10062           if (max_charset_id < XFASTINT (XCAR (tail)))
10063             max_charset_id = XFASTINT (XCAR (tail));
10064         }
10065     }
10066   else
10067     {
10068       charset_list = Fcopy_sequence (charset_list);
10069       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10070         {
10071           struct charset *charset;
10072
10073           val = XCAR (tail);
10074           CHECK_CHARSET_GET_CHARSET (val, charset);
10075           if (EQ (coding_type, Qiso_2022)
10076               ? CHARSET_ISO_FINAL (charset) < 0
10077               : EQ (coding_type, Qemacs_mule)
10078               ? CHARSET_EMACS_MULE_ID (charset) < 0
10079               : 0)
10080             error ("Can't handle charset `%s'",
10081                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10082
10083           XSETCAR (tail, make_number (charset->id));
10084           if (max_charset_id < charset->id)
10085             max_charset_id = charset->id;
10086         }
10087     }
10088   ASET (attrs, coding_attr_charset_list, charset_list);
10089
10090   safe_charsets = make_uninit_string (max_charset_id + 1);
10091   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10092   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10093     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10094   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10095
10096   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10097
10098   val = args[coding_arg_decode_translation_table];
10099   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10100     CHECK_SYMBOL (val);
10101   ASET (attrs, coding_attr_decode_tbl, val);
10102
10103   val = args[coding_arg_encode_translation_table];
10104   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10105     CHECK_SYMBOL (val);
10106   ASET (attrs, coding_attr_encode_tbl, val);
10107
10108   val = args[coding_arg_post_read_conversion];
10109   CHECK_SYMBOL (val);
10110   ASET (attrs, coding_attr_post_read, val);
10111
10112   val = args[coding_arg_pre_write_conversion];
10113   CHECK_SYMBOL (val);
10114   ASET (attrs, coding_attr_pre_write, val);
10115
10116   val = args[coding_arg_default_char];
10117   if (NILP (val))
10118     ASET (attrs, coding_attr_default_char, make_number (' '));
10119   else
10120     {
10121       CHECK_CHARACTER (val);
10122       ASET (attrs, coding_attr_default_char, val);
10123     }
10124
10125   val = args[coding_arg_for_unibyte];
10126   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10127
10128   val = args[coding_arg_plist];
10129   CHECK_LIST (val);
10130   ASET (attrs, coding_attr_plist, val);
10131
10132   if (EQ (coding_type, Qcharset))
10133     {
10134       /* Generate a lisp vector of 256 elements.  Each element is nil,
10135          integer, or a list of charset IDs.
10136
10137          If Nth element is nil, the byte code N is invalid in this
10138          coding system.
10139
10140          If Nth element is a number NUM, N is the first byte of a
10141          charset whose ID is NUM.
10142
10143          If Nth element is a list of charset IDs, N is the first byte
10144          of one of them.  The list is sorted by dimensions of the
10145          charsets.  A charset of smaller dimension comes first. */
10146       val = Fmake_vector (make_number (256), Qnil);
10147
10148       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10149         {
10150           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10151           int dim = CHARSET_DIMENSION (charset);
10152           int idx = (dim - 1) * 4;
10153
10154           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10155             ASET (attrs, coding_attr_ascii_compat, Qt);
10156
10157           for (i = charset->code_space[idx];
10158                i <= charset->code_space[idx + 1]; i++)
10159             {
10160               Lisp_Object tmp, tmp2;
10161               int dim2;
10162
10163               tmp = AREF (val, i);
10164               if (NILP (tmp))
10165                 tmp = XCAR (tail);
10166               else if (NUMBERP (tmp))
10167                 {
10168                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10169                   if (dim < dim2)
10170                     tmp = list2 (XCAR (tail), tmp);
10171                   else
10172                     tmp = list2 (tmp, XCAR (tail));
10173                 }
10174               else
10175                 {
10176                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10177                     {
10178                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10179                       if (dim < dim2)
10180                         break;
10181                     }
10182                   if (NILP (tmp2))
10183                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10184                   else
10185                     {
10186                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10187                       XSETCAR (tmp2, XCAR (tail));
10188                     }
10189                 }
10190               ASET (val, i, tmp);
10191             }
10192         }
10193       ASET (attrs, coding_attr_charset_valids, val);
10194       category = coding_category_charset;
10195     }
10196   else if (EQ (coding_type, Qccl))
10197     {
10198       Lisp_Object valids;
10199
10200       if (nargs < coding_arg_ccl_max)
10201         goto short_args;
10202
10203       val = args[coding_arg_ccl_decoder];
10204       CHECK_CCL_PROGRAM (val);
10205       if (VECTORP (val))
10206         val = Fcopy_sequence (val);
10207       ASET (attrs, coding_attr_ccl_decoder, val);
10208
10209       val = args[coding_arg_ccl_encoder];
10210       CHECK_CCL_PROGRAM (val);
10211       if (VECTORP (val))
10212         val = Fcopy_sequence (val);
10213       ASET (attrs, coding_attr_ccl_encoder, val);
10214
10215       val = args[coding_arg_ccl_valids];
10216       valids = Fmake_string (make_number (256), make_number (0));
10217       for (tail = val; CONSP (tail); tail = XCDR (tail))
10218         {
10219           int from, to;
10220
10221           val = XCAR (tail);
10222           if (INTEGERP (val))
10223             {
10224               if (! (0 <= XINT (val) && XINT (val) <= 255))
10225                 args_out_of_range_3 (val, make_number (0), make_number (255));
10226               from = to = XINT (val);
10227             }
10228           else
10229             {
10230               CHECK_CONS (val);
10231               CHECK_NATNUM_CAR (val);
10232               CHECK_NUMBER_CDR (val);
10233               if (XINT (XCAR (val)) > 255)
10234                 args_out_of_range_3 (XCAR (val),
10235                                      make_number (0), make_number (255));
10236               from = XINT (XCAR (val));
10237               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10238                 args_out_of_range_3 (XCDR (val),
10239                                      XCAR (val), make_number (255));
10240               to = XINT (XCDR (val));
10241             }
10242           for (i = from; i <= to; i++)
10243             SSET (valids, i, 1);
10244         }
10245       ASET (attrs, coding_attr_ccl_valids, valids);
10246
10247       category = coding_category_ccl;
10248     }
10249   else if (EQ (coding_type, Qutf_16))
10250     {
10251       Lisp_Object bom, endian;
10252
10253       ASET (attrs, coding_attr_ascii_compat, Qnil);
10254
10255       if (nargs < coding_arg_utf16_max)
10256         goto short_args;
10257
10258       bom = args[coding_arg_utf16_bom];
10259       if (! NILP (bom) && ! EQ (bom, Qt))
10260         {
10261           CHECK_CONS (bom);
10262           val = XCAR (bom);
10263           CHECK_CODING_SYSTEM (val);
10264           val = XCDR (bom);
10265           CHECK_CODING_SYSTEM (val);
10266         }
10267       ASET (attrs, coding_attr_utf_bom, bom);
10268
10269       endian = args[coding_arg_utf16_endian];
10270       CHECK_SYMBOL (endian);
10271       if (NILP (endian))
10272         endian = Qbig;
10273       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10274         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10275       ASET (attrs, coding_attr_utf_16_endian, endian);
10276
10277       category = (CONSP (bom)
10278                   ? coding_category_utf_16_auto
10279                   : NILP (bom)
10280                   ? (EQ (endian, Qbig)
10281                      ? coding_category_utf_16_be_nosig
10282                      : coding_category_utf_16_le_nosig)
10283                   : (EQ (endian, Qbig)
10284                      ? coding_category_utf_16_be
10285                      : coding_category_utf_16_le));
10286     }
10287   else if (EQ (coding_type, Qiso_2022))
10288     {
10289       Lisp_Object initial, reg_usage, request, flags;
10290
10291       if (nargs < coding_arg_iso2022_max)
10292         goto short_args;
10293
10294       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10295       CHECK_VECTOR (initial);
10296       for (i = 0; i < 4; i++)
10297         {
10298           val = AREF (initial, i);
10299           if (! NILP (val))
10300             {
10301               struct charset *charset;
10302
10303               CHECK_CHARSET_GET_CHARSET (val, charset);
10304               ASET (initial, i, make_number (CHARSET_ID (charset)));
10305               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10306                 ASET (attrs, coding_attr_ascii_compat, Qt);
10307             }
10308           else
10309             ASET (initial, i, make_number (-1));
10310         }
10311
10312       reg_usage = args[coding_arg_iso2022_reg_usage];
10313       CHECK_CONS (reg_usage);
10314       CHECK_NUMBER_CAR (reg_usage);
10315       CHECK_NUMBER_CDR (reg_usage);
10316
10317       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10318       for (tail = request; CONSP (tail); tail = XCDR (tail))
10319         {
10320           int id;
10321           Lisp_Object tmp1;
10322
10323           val = XCAR (tail);
10324           CHECK_CONS (val);
10325           tmp1 = XCAR (val);
10326           CHECK_CHARSET_GET_ID (tmp1, id);
10327           CHECK_NATNUM_CDR (val);
10328           if (XINT (XCDR (val)) >= 4)
10329             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10330           XSETCAR (val, make_number (id));
10331         }
10332
10333       flags = args[coding_arg_iso2022_flags];
10334       CHECK_NATNUM (flags);
10335       i = XINT (flags) & INT_MAX;
10336       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10337         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10338       flags = make_number (i);
10339
10340       ASET (attrs, coding_attr_iso_initial, initial);
10341       ASET (attrs, coding_attr_iso_usage, reg_usage);
10342       ASET (attrs, coding_attr_iso_request, request);
10343       ASET (attrs, coding_attr_iso_flags, flags);
10344       setup_iso_safe_charsets (attrs);
10345
10346       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10347         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10348                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10349                     ? coding_category_iso_7_else
10350                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10351                     ? coding_category_iso_7
10352                     : coding_category_iso_7_tight);
10353       else
10354         {
10355           int id = XINT (AREF (initial, 1));
10356
10357           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10358                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10359                        || id < 0)
10360                       ? coding_category_iso_8_else
10361                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10362                       ? coding_category_iso_8_1
10363                       : coding_category_iso_8_2);
10364         }
10365       if (category != coding_category_iso_8_1
10366           && category != coding_category_iso_8_2)
10367         ASET (attrs, coding_attr_ascii_compat, Qnil);
10368     }
10369   else if (EQ (coding_type, Qemacs_mule))
10370     {
10371       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10372         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10373       ASET (attrs, coding_attr_ascii_compat, Qt);
10374       category = coding_category_emacs_mule;
10375     }
10376   else if (EQ (coding_type, Qshift_jis))
10377     {
10378
10379       struct charset *charset;
10380
10381       if (XINT (Flength (charset_list)) != 3
10382           && XINT (Flength (charset_list)) != 4)
10383         error ("There should be three or four charsets");
10384
10385       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10386       if (CHARSET_DIMENSION (charset) != 1)
10387         error ("Dimension of charset %s is not one",
10388                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10389       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10390         ASET (attrs, coding_attr_ascii_compat, Qt);
10391
10392       charset_list = XCDR (charset_list);
10393       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10394       if (CHARSET_DIMENSION (charset) != 1)
10395         error ("Dimension of charset %s is not one",
10396                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10397
10398       charset_list = XCDR (charset_list);
10399       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10400       if (CHARSET_DIMENSION (charset) != 2)
10401         error ("Dimension of charset %s is not two",
10402                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10403
10404       charset_list = XCDR (charset_list);
10405       if (! NILP (charset_list))
10406         {
10407           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10408           if (CHARSET_DIMENSION (charset) != 2)
10409             error ("Dimension of charset %s is not two",
10410                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10411         }
10412
10413       category = coding_category_sjis;
10414       Vsjis_coding_system = name;
10415     }
10416   else if (EQ (coding_type, Qbig5))
10417     {
10418       struct charset *charset;
10419
10420       if (XINT (Flength (charset_list)) != 2)
10421         error ("There should be just two charsets");
10422
10423       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10424       if (CHARSET_DIMENSION (charset) != 1)
10425         error ("Dimension of charset %s is not one",
10426                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10427       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10428         ASET (attrs, coding_attr_ascii_compat, Qt);
10429
10430       charset_list = XCDR (charset_list);
10431       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10432       if (CHARSET_DIMENSION (charset) != 2)
10433         error ("Dimension of charset %s is not two",
10434                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10435
10436       category = coding_category_big5;
10437       Vbig5_coding_system = name;
10438     }
10439   else if (EQ (coding_type, Qraw_text))
10440     {
10441       category = coding_category_raw_text;
10442       ASET (attrs, coding_attr_ascii_compat, Qt);
10443     }
10444   else if (EQ (coding_type, Qutf_8))
10445     {
10446       Lisp_Object bom;
10447
10448       if (nargs < coding_arg_utf8_max)
10449         goto short_args;
10450
10451       bom = args[coding_arg_utf8_bom];
10452       if (! NILP (bom) && ! EQ (bom, Qt))
10453         {
10454           CHECK_CONS (bom);
10455           val = XCAR (bom);
10456           CHECK_CODING_SYSTEM (val);
10457           val = XCDR (bom);
10458           CHECK_CODING_SYSTEM (val);
10459         }
10460       ASET (attrs, coding_attr_utf_bom, bom);
10461       if (NILP (bom))
10462         ASET (attrs, coding_attr_ascii_compat, Qt);
10463
10464       category = (CONSP (bom) ? coding_category_utf_8_auto
10465                   : NILP (bom) ? coding_category_utf_8_nosig
10466                   : coding_category_utf_8_sig);
10467     }
10468   else if (EQ (coding_type, Qundecided))
10469     {
10470       if (nargs < coding_arg_undecided_max)
10471         goto short_args;
10472       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10473             args[coding_arg_undecided_inhibit_null_byte_detection]);
10474       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10475             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10476       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10477             args[coding_arg_undecided_prefer_utf_8]);
10478       category = coding_category_undecided;
10479     }
10480   else
10481     error ("Invalid coding system type: %s",
10482            SDATA (SYMBOL_NAME (coding_type)));
10483
10484   ASET (attrs, coding_attr_category, make_number (category));
10485   ASET (attrs, coding_attr_plist,
10486         Fcons (QCcategory,
10487                Fcons (AREF (Vcoding_category_table, category),
10488                       CODING_ATTR_PLIST (attrs))));
10489   ASET (attrs, coding_attr_plist,
10490         Fcons (QCascii_compatible_p,
10491                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10492                       CODING_ATTR_PLIST (attrs))));
10493
10494   eol_type = args[coding_arg_eol_type];
10495   if (! NILP (eol_type)
10496       && ! EQ (eol_type, Qunix)
10497       && ! EQ (eol_type, Qdos)
10498       && ! EQ (eol_type, Qmac))
10499     error ("Invalid eol-type");
10500
10501   aliases = list1 (name);
10502
10503   if (NILP (eol_type))
10504     {
10505       eol_type = make_subsidiaries (name);
10506       for (i = 0; i < 3; i++)
10507         {
10508           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10509
10510           this_name = AREF (eol_type, i);
10511           this_aliases = list1 (this_name);
10512           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10513           this_spec = make_uninit_vector (3);
10514           ASET (this_spec, 0, attrs);
10515           ASET (this_spec, 1, this_aliases);
10516           ASET (this_spec, 2, this_eol_type);
10517           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10518           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10519           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10520           if (NILP (val))
10521             Vcoding_system_alist
10522               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10523                        Vcoding_system_alist);
10524         }
10525     }
10526
10527   spec_vec = make_uninit_vector (3);
10528   ASET (spec_vec, 0, attrs);
10529   ASET (spec_vec, 1, aliases);
10530   ASET (spec_vec, 2, eol_type);
10531
10532   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10533   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10534   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10535   if (NILP (val))
10536     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10537                                   Vcoding_system_alist);
10538
10539   {
10540     int id = coding_categories[category].id;
10541
10542     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10543       setup_coding_system (name, &coding_categories[category]);
10544   }
10545
10546   return Qnil;
10547
10548  short_args:
10549   return Fsignal (Qwrong_number_of_arguments,
10550                   Fcons (intern ("define-coding-system-internal"),
10551                          make_number (nargs)));
10552 }
10553
10554
10555 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10556        3, 3, 0,
10557        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10558   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10559 {
10560   Lisp_Object spec, attrs;
10561
10562   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10563   attrs = AREF (spec, 0);
10564   if (EQ (prop, QCmnemonic))
10565     {
10566       if (! STRINGP (val))
10567         CHECK_CHARACTER (val);
10568       ASET (attrs, coding_attr_mnemonic, val);
10569     }
10570   else if (EQ (prop, QCdefault_char))
10571     {
10572       if (NILP (val))
10573         val = make_number (' ');
10574       else
10575         CHECK_CHARACTER (val);
10576       ASET (attrs, coding_attr_default_char, val);
10577     }
10578   else if (EQ (prop, QCdecode_translation_table))
10579     {
10580       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10581         CHECK_SYMBOL (val);
10582       ASET (attrs, coding_attr_decode_tbl, val);
10583     }
10584   else if (EQ (prop, QCencode_translation_table))
10585     {
10586       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10587         CHECK_SYMBOL (val);
10588       ASET (attrs, coding_attr_encode_tbl, val);
10589     }
10590   else if (EQ (prop, QCpost_read_conversion))
10591     {
10592       CHECK_SYMBOL (val);
10593       ASET (attrs, coding_attr_post_read, val);
10594     }
10595   else if (EQ (prop, QCpre_write_conversion))
10596     {
10597       CHECK_SYMBOL (val);
10598       ASET (attrs, coding_attr_pre_write, val);
10599     }
10600   else if (EQ (prop, QCascii_compatible_p))
10601     {
10602       ASET (attrs, coding_attr_ascii_compat, val);
10603     }
10604
10605   ASET (attrs, coding_attr_plist,
10606         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10607   return val;
10608 }
10609
10610
10611 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10612        Sdefine_coding_system_alias, 2, 2, 0,
10613        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10614   (Lisp_Object alias, Lisp_Object coding_system)
10615 {
10616   Lisp_Object spec, aliases, eol_type, val;
10617
10618   CHECK_SYMBOL (alias);
10619   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10620   aliases = AREF (spec, 1);
10621   /* ALIASES should be a list of length more than zero, and the first
10622      element is a base coding system.  Append ALIAS at the tail of the
10623      list.  */
10624   while (!NILP (XCDR (aliases)))
10625     aliases = XCDR (aliases);
10626   XSETCDR (aliases, list1 (alias));
10627
10628   eol_type = AREF (spec, 2);
10629   if (VECTORP (eol_type))
10630     {
10631       Lisp_Object subsidiaries;
10632       int i;
10633
10634       subsidiaries = make_subsidiaries (alias);
10635       for (i = 0; i < 3; i++)
10636         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10637                                      AREF (eol_type, i));
10638     }
10639
10640   Fputhash (alias, spec, Vcoding_system_hash_table);
10641   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10642   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10643   if (NILP (val))
10644     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10645                                   Vcoding_system_alist);
10646
10647   return Qnil;
10648 }
10649
10650 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10651        1, 1, 0,
10652        doc: /* Return the base of CODING-SYSTEM.
10653 Any alias or subsidiary coding system is not a base coding system.  */)
10654   (Lisp_Object coding_system)
10655 {
10656   Lisp_Object spec, attrs;
10657
10658   if (NILP (coding_system))
10659     return (Qno_conversion);
10660   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10661   attrs = AREF (spec, 0);
10662   return CODING_ATTR_BASE_NAME (attrs);
10663 }
10664
10665 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10666        1, 1, 0,
10667        doc: /* Return the property list of CODING-SYSTEM.  */)
10668   (Lisp_Object coding_system)
10669 {
10670   Lisp_Object spec, attrs;
10671
10672   if (NILP (coding_system))
10673     coding_system = Qno_conversion;
10674   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10675   attrs = AREF (spec, 0);
10676   return CODING_ATTR_PLIST (attrs);
10677 }
10678
10679
10680 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10681        1, 1, 0,
10682        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10683   (Lisp_Object coding_system)
10684 {
10685   Lisp_Object spec;
10686
10687   if (NILP (coding_system))
10688     coding_system = Qno_conversion;
10689   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10690   return AREF (spec, 1);
10691 }
10692
10693 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10694        Scoding_system_eol_type, 1, 1, 0,
10695        doc: /* Return eol-type of CODING-SYSTEM.
10696 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10697
10698 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10699 and CR respectively.
10700
10701 A vector value indicates that a format of end-of-line should be
10702 detected automatically.  Nth element of the vector is the subsidiary
10703 coding system whose eol-type is N.  */)
10704   (Lisp_Object coding_system)
10705 {
10706   Lisp_Object spec, eol_type;
10707   int n;
10708
10709   if (NILP (coding_system))
10710     coding_system = Qno_conversion;
10711   if (! CODING_SYSTEM_P (coding_system))
10712     return Qnil;
10713   spec = CODING_SYSTEM_SPEC (coding_system);
10714   eol_type = AREF (spec, 2);
10715   if (VECTORP (eol_type))
10716     return Fcopy_sequence (eol_type);
10717   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10718   return make_number (n);
10719 }
10720
10721 #endif /* emacs */
10722
10723 \f
10724 /*** 12. Postamble ***/
10725
10726 void
10727 init_coding_once (void)
10728 {
10729   int i;
10730
10731   for (i = 0; i < coding_category_max; i++)
10732     {
10733       coding_categories[i].id = -1;
10734       coding_priorities[i] = i;
10735     }
10736
10737   /* ISO2022 specific initialize routine.  */
10738   for (i = 0; i < 0x20; i++)
10739     iso_code_class[i] = ISO_control_0;
10740   for (i = 0x21; i < 0x7F; i++)
10741     iso_code_class[i] = ISO_graphic_plane_0;
10742   for (i = 0x80; i < 0xA0; i++)
10743     iso_code_class[i] = ISO_control_1;
10744   for (i = 0xA1; i < 0xFF; i++)
10745     iso_code_class[i] = ISO_graphic_plane_1;
10746   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10747   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10748   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10749   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10750   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10751   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10752   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10753   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10754   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10755
10756   for (i = 0; i < 256; i++)
10757     {
10758       emacs_mule_bytes[i] = 1;
10759     }
10760   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10761   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10762   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10763   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10764 }
10765
10766 #ifdef emacs
10767
10768 void
10769 syms_of_coding (void)
10770 {
10771   staticpro (&Vcoding_system_hash_table);
10772   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10773
10774   staticpro (&Vsjis_coding_system);
10775   Vsjis_coding_system = Qnil;
10776
10777   staticpro (&Vbig5_coding_system);
10778   Vbig5_coding_system = Qnil;
10779
10780   staticpro (&Vcode_conversion_reused_workbuf);
10781   Vcode_conversion_reused_workbuf = Qnil;
10782
10783   staticpro (&Vcode_conversion_workbuf_name);
10784   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10785
10786   reused_workbuf_in_use = 0;
10787
10788   DEFSYM (Qcharset, "charset");
10789   DEFSYM (Qtarget_idx, "target-idx");
10790   DEFSYM (Qcoding_system_history, "coding-system-history");
10791   Fset (Qcoding_system_history, Qnil);
10792
10793   /* Target FILENAME is the first argument.  */
10794   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10795   /* Target FILENAME is the third argument.  */
10796   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10797
10798   DEFSYM (Qcall_process, "call-process");
10799   /* Target PROGRAM is the first argument.  */
10800   Fput (Qcall_process, Qtarget_idx, make_number (0));
10801
10802   DEFSYM (Qcall_process_region, "call-process-region");
10803   /* Target PROGRAM is the third argument.  */
10804   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10805
10806   DEFSYM (Qstart_process, "start-process");
10807   /* Target PROGRAM is the third argument.  */
10808   Fput (Qstart_process, Qtarget_idx, make_number (2));
10809
10810   DEFSYM (Qopen_network_stream, "open-network-stream");
10811   /* Target SERVICE is the fourth argument.  */
10812   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10813
10814   DEFSYM (Qunix, "unix");
10815   DEFSYM (Qdos, "dos");
10816   DEFSYM (Qmac, "mac");
10817
10818   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10819   DEFSYM (Qundecided, "undecided");
10820   DEFSYM (Qno_conversion, "no-conversion");
10821   DEFSYM (Qraw_text, "raw-text");
10822
10823   DEFSYM (Qiso_2022, "iso-2022");
10824
10825   DEFSYM (Qutf_8, "utf-8");
10826   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10827
10828 #if defined (WINDOWSNT) || defined (CYGWIN)
10829   /* No, not utf-16-le: that one has a BOM.  */
10830   DEFSYM (Qutf_16le, "utf-16le");
10831 #endif
10832
10833   DEFSYM (Qutf_16, "utf-16");
10834   DEFSYM (Qbig, "big");
10835   DEFSYM (Qlittle, "little");
10836
10837   DEFSYM (Qshift_jis, "shift-jis");
10838   DEFSYM (Qbig5, "big5");
10839
10840   DEFSYM (Qcoding_system_p, "coding-system-p");
10841
10842   /* Error signaled when there's a problem with detecting a coding system.  */
10843   DEFSYM (Qcoding_system_error, "coding-system-error");
10844   Fput (Qcoding_system_error, Qerror_conditions,
10845         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10846   Fput (Qcoding_system_error, Qerror_message,
10847         build_pure_c_string ("Invalid coding system"));
10848
10849   DEFSYM (Qtranslation_table, "translation-table");
10850   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10851   DEFSYM (Qtranslation_table_id, "translation-table-id");
10852
10853   /* Coding system emacs-mule and raw-text are for converting only
10854      end-of-line format.  */
10855   DEFSYM (Qemacs_mule, "emacs-mule");
10856
10857   DEFSYM (QCcategory, ":category");
10858   DEFSYM (QCmnemonic, ":mnemonic");
10859   DEFSYM (QCdefault_char, ":default-char");
10860   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10861   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10862   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10863   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10864   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10865
10866   Vcoding_category_table
10867     = Fmake_vector (make_number (coding_category_max), Qnil);
10868   staticpro (&Vcoding_category_table);
10869   /* Followings are target of code detection.  */
10870   ASET (Vcoding_category_table, coding_category_iso_7,
10871         intern_c_string ("coding-category-iso-7"));
10872   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10873         intern_c_string ("coding-category-iso-7-tight"));
10874   ASET (Vcoding_category_table, coding_category_iso_8_1,
10875         intern_c_string ("coding-category-iso-8-1"));
10876   ASET (Vcoding_category_table, coding_category_iso_8_2,
10877         intern_c_string ("coding-category-iso-8-2"));
10878   ASET (Vcoding_category_table, coding_category_iso_7_else,
10879         intern_c_string ("coding-category-iso-7-else"));
10880   ASET (Vcoding_category_table, coding_category_iso_8_else,
10881         intern_c_string ("coding-category-iso-8-else"));
10882   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10883         intern_c_string ("coding-category-utf-8-auto"));
10884   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10885         intern_c_string ("coding-category-utf-8"));
10886   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10887         intern_c_string ("coding-category-utf-8-sig"));
10888   ASET (Vcoding_category_table, coding_category_utf_16_be,
10889         intern_c_string ("coding-category-utf-16-be"));
10890   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10891         intern_c_string ("coding-category-utf-16-auto"));
10892   ASET (Vcoding_category_table, coding_category_utf_16_le,
10893         intern_c_string ("coding-category-utf-16-le"));
10894   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10895         intern_c_string ("coding-category-utf-16-be-nosig"));
10896   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10897         intern_c_string ("coding-category-utf-16-le-nosig"));
10898   ASET (Vcoding_category_table, coding_category_charset,
10899         intern_c_string ("coding-category-charset"));
10900   ASET (Vcoding_category_table, coding_category_sjis,
10901         intern_c_string ("coding-category-sjis"));
10902   ASET (Vcoding_category_table, coding_category_big5,
10903         intern_c_string ("coding-category-big5"));
10904   ASET (Vcoding_category_table, coding_category_ccl,
10905         intern_c_string ("coding-category-ccl"));
10906   ASET (Vcoding_category_table, coding_category_emacs_mule,
10907         intern_c_string ("coding-category-emacs-mule"));
10908   /* Followings are NOT target of code detection.  */
10909   ASET (Vcoding_category_table, coding_category_raw_text,
10910         intern_c_string ("coding-category-raw-text"));
10911   ASET (Vcoding_category_table, coding_category_undecided,
10912         intern_c_string ("coding-category-undecided"));
10913
10914   DEFSYM (Qinsufficient_source, "insufficient-source");
10915   DEFSYM (Qinvalid_source, "invalid-source");
10916   DEFSYM (Qinterrupted, "interrupted");
10917
10918   /* If a symbol has this property, evaluate the value to define the
10919      symbol as a coding system.  */
10920   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10921
10922   defsubr (&Scoding_system_p);
10923   defsubr (&Sread_coding_system);
10924   defsubr (&Sread_non_nil_coding_system);
10925   defsubr (&Scheck_coding_system);
10926   defsubr (&Sdetect_coding_region);
10927   defsubr (&Sdetect_coding_string);
10928   defsubr (&Sfind_coding_systems_region_internal);
10929   defsubr (&Sunencodable_char_position);
10930   defsubr (&Scheck_coding_systems_region);
10931   defsubr (&Sdecode_coding_region);
10932   defsubr (&Sencode_coding_region);
10933   defsubr (&Sdecode_coding_string);
10934   defsubr (&Sencode_coding_string);
10935   defsubr (&Sdecode_sjis_char);
10936   defsubr (&Sencode_sjis_char);
10937   defsubr (&Sdecode_big5_char);
10938   defsubr (&Sencode_big5_char);
10939   defsubr (&Sset_terminal_coding_system_internal);
10940   defsubr (&Sset_safe_terminal_coding_system_internal);
10941   defsubr (&Sterminal_coding_system);
10942   defsubr (&Sset_keyboard_coding_system_internal);
10943   defsubr (&Skeyboard_coding_system);
10944   defsubr (&Sfind_operation_coding_system);
10945   defsubr (&Sset_coding_system_priority);
10946   defsubr (&Sdefine_coding_system_internal);
10947   defsubr (&Sdefine_coding_system_alias);
10948   defsubr (&Scoding_system_put);
10949   defsubr (&Scoding_system_base);
10950   defsubr (&Scoding_system_plist);
10951   defsubr (&Scoding_system_aliases);
10952   defsubr (&Scoding_system_eol_type);
10953   defsubr (&Scoding_system_priority_list);
10954
10955   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10956                doc: /* List of coding systems.
10957
10958 Do not alter the value of this variable manually.  This variable should be
10959 updated by the functions `define-coding-system' and
10960 `define-coding-system-alias'.  */);
10961   Vcoding_system_list = Qnil;
10962
10963   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10964                doc: /* Alist of coding system names.
10965 Each element is one element list of coding system name.
10966 This variable is given to `completing-read' as COLLECTION argument.
10967
10968 Do not alter the value of this variable manually.  This variable should be
10969 updated by the functions `make-coding-system' and
10970 `define-coding-system-alias'.  */);
10971   Vcoding_system_alist = Qnil;
10972
10973   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10974                doc: /* List of coding-categories (symbols) ordered by priority.
10975
10976 On detecting a coding system, Emacs tries code detection algorithms
10977 associated with each coding-category one by one in this order.  When
10978 one algorithm agrees with a byte sequence of source text, the coding
10979 system bound to the corresponding coding-category is selected.
10980
10981 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10982   {
10983     int i;
10984
10985     Vcoding_category_list = Qnil;
10986     for (i = coding_category_max - 1; i >= 0; i--)
10987       Vcoding_category_list
10988         = Fcons (AREF (Vcoding_category_table, i),
10989                  Vcoding_category_list);
10990   }
10991
10992   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10993                doc: /* Specify the coding system for read operations.
10994 It is useful to bind this variable with `let', but do not set it globally.
10995 If the value is a coding system, it is used for decoding on read operation.
10996 If not, an appropriate element is used from one of the coding system alists.
10997 There are three such tables: `file-coding-system-alist',
10998 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10999   Vcoding_system_for_read = Qnil;
11000
11001   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11002                doc: /* Specify the coding system for write operations.
11003 Programs bind this variable with `let', but you should not set it globally.
11004 If the value is a coding system, it is used for encoding of output,
11005 when writing it to a file and when sending it to a file or subprocess.
11006
11007 If this does not specify a coding system, an appropriate element
11008 is used from one of the coding system alists.
11009 There are three such tables: `file-coding-system-alist',
11010 `process-coding-system-alist', and `network-coding-system-alist'.
11011 For output to files, if the above procedure does not specify a coding system,
11012 the value of `buffer-file-coding-system' is used.  */);
11013   Vcoding_system_for_write = Qnil;
11014
11015   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11016                doc: /*
11017 Coding system used in the latest file or process I/O.  */);
11018   Vlast_coding_system_used = Qnil;
11019
11020   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11021                doc: /*
11022 Error status of the last code conversion.
11023
11024 When an error was detected in the last code conversion, this variable
11025 is set to one of the following symbols.
11026   `insufficient-source'
11027   `inconsistent-eol'
11028   `invalid-source'
11029   `interrupted'
11030   `insufficient-memory'
11031 When no error was detected, the value doesn't change.  So, to check
11032 the error status of a code conversion by this variable, you must
11033 explicitly set this variable to nil before performing code
11034 conversion.  */);
11035   Vlast_code_conversion_error = Qnil;
11036
11037   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11038                doc: /*
11039 Non-nil means always inhibit code conversion of end-of-line format.
11040 See info node `Coding Systems' and info node `Text and Binary' concerning
11041 such conversion.  */);
11042   inhibit_eol_conversion = 0;
11043
11044   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11045                doc: /*
11046 Non-nil means process buffer inherits coding system of process output.
11047 Bind it to t if the process output is to be treated as if it were a file
11048 read from some filesystem.  */);
11049   inherit_process_coding_system = 0;
11050
11051   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11052                doc: /*
11053 Alist to decide a coding system to use for a file I/O operation.
11054 The format is ((PATTERN . VAL) ...),
11055 where PATTERN is a regular expression matching a file name,
11056 VAL is a coding system, a cons of coding systems, or a function symbol.
11057 If VAL is a coding system, it is used for both decoding and encoding
11058 the file contents.
11059 If VAL is a cons of coding systems, the car part is used for decoding,
11060 and the cdr part is used for encoding.
11061 If VAL is a function symbol, the function must return a coding system
11062 or a cons of coding systems which are used as above.  The function is
11063 called with an argument that is a list of the arguments with which
11064 `find-operation-coding-system' was called.  If the function can't decide
11065 a coding system, it can return `undecided' so that the normal
11066 code-detection is performed.
11067
11068 See also the function `find-operation-coding-system'
11069 and the variable `auto-coding-alist'.  */);
11070   Vfile_coding_system_alist = Qnil;
11071
11072   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11073                doc: /*
11074 Alist to decide a coding system to use for a process I/O operation.
11075 The format is ((PATTERN . VAL) ...),
11076 where PATTERN is a regular expression matching a program name,
11077 VAL is a coding system, a cons of coding systems, or a function symbol.
11078 If VAL is a coding system, it is used for both decoding what received
11079 from the program and encoding what sent to the program.
11080 If VAL is a cons of coding systems, the car part is used for decoding,
11081 and the cdr part is used for encoding.
11082 If VAL is a function symbol, the function must return a coding system
11083 or a cons of coding systems which are used as above.
11084
11085 See also the function `find-operation-coding-system'.  */);
11086   Vprocess_coding_system_alist = Qnil;
11087
11088   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11089                doc: /*
11090 Alist to decide a coding system to use for a network I/O operation.
11091 The format is ((PATTERN . VAL) ...),
11092 where PATTERN is a regular expression matching a network service name
11093 or is a port number to connect to,
11094 VAL is a coding system, a cons of coding systems, or a function symbol.
11095 If VAL is a coding system, it is used for both decoding what received
11096 from the network stream and encoding what sent to the network stream.
11097 If VAL is a cons of coding systems, the car part is used for decoding,
11098 and the cdr part is used for encoding.
11099 If VAL is a function symbol, the function must return a coding system
11100 or a cons of coding systems which are used as above.
11101
11102 See also the function `find-operation-coding-system'.  */);
11103   Vnetwork_coding_system_alist = Qnil;
11104
11105   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11106                doc: /* Coding system to use with system messages.
11107 Also used for decoding keyboard input on X Window system, and for
11108 encoding standard output and error streams.  */);
11109   Vlocale_coding_system = Qnil;
11110
11111   /* The eol mnemonics are reset in startup.el system-dependently.  */
11112   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11113                doc: /*
11114 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11115   eol_mnemonic_unix = build_pure_c_string (":");
11116
11117   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11118                doc: /*
11119 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11120   eol_mnemonic_dos = build_pure_c_string ("\\");
11121
11122   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11123                doc: /*
11124 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11125   eol_mnemonic_mac = build_pure_c_string ("/");
11126
11127   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11128                doc: /*
11129 String displayed in mode line when end-of-line format is not yet determined.  */);
11130   eol_mnemonic_undecided = build_pure_c_string (":");
11131
11132   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11133                doc: /*
11134 Non-nil enables character translation while encoding and decoding.  */);
11135   Venable_character_translation = Qt;
11136
11137   DEFVAR_LISP ("standard-translation-table-for-decode",
11138                Vstandard_translation_table_for_decode,
11139                doc: /* Table for translating characters while decoding.  */);
11140   Vstandard_translation_table_for_decode = Qnil;
11141
11142   DEFVAR_LISP ("standard-translation-table-for-encode",
11143                Vstandard_translation_table_for_encode,
11144                doc: /* Table for translating characters while encoding.  */);
11145   Vstandard_translation_table_for_encode = Qnil;
11146
11147   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11148                doc: /* Alist of charsets vs revision numbers.
11149 While encoding, if a charset (car part of an element) is found,
11150 designate it with the escape sequence identifying revision (cdr part
11151 of the element).  */);
11152   Vcharset_revision_table = Qnil;
11153
11154   DEFVAR_LISP ("default-process-coding-system",
11155                Vdefault_process_coding_system,
11156                doc: /* Cons of coding systems used for process I/O by default.
11157 The car part is used for decoding a process output,
11158 the cdr part is used for encoding a text to be sent to a process.  */);
11159   Vdefault_process_coding_system = Qnil;
11160
11161   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11162                doc: /*
11163 Table of extra Latin codes in the range 128..159 (inclusive).
11164 This is a vector of length 256.
11165 If Nth element is non-nil, the existence of code N in a file
11166 \(or output of subprocess) doesn't prevent it to be detected as
11167 a coding system of ISO 2022 variant which has a flag
11168 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11169 or reading output of a subprocess.
11170 Only 128th through 159th elements have a meaning.  */);
11171   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11172
11173   DEFVAR_LISP ("select-safe-coding-system-function",
11174                Vselect_safe_coding_system_function,
11175                doc: /*
11176 Function to call to select safe coding system for encoding a text.
11177
11178 If set, this function is called to force a user to select a proper
11179 coding system which can encode the text in the case that a default
11180 coding system used in each operation can't encode the text.  The
11181 function should take care that the buffer is not modified while
11182 the coding system is being selected.
11183
11184 The default value is `select-safe-coding-system' (which see).  */);
11185   Vselect_safe_coding_system_function = Qnil;
11186
11187   DEFVAR_BOOL ("coding-system-require-warning",
11188                coding_system_require_warning,
11189                doc: /* Internal use only.
11190 If non-nil, on writing a file, `select-safe-coding-system-function' is
11191 called even if `coding-system-for-write' is non-nil.  The command
11192 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11193   coding_system_require_warning = 0;
11194
11195
11196   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11197                inhibit_iso_escape_detection,
11198                doc: /*
11199 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11200
11201 When Emacs reads text, it tries to detect how the text is encoded.
11202 This code detection is sensitive to escape sequences.  If Emacs sees
11203 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11204 of the ISO2022 encodings, and decodes text by the corresponding coding
11205 system (e.g. `iso-2022-7bit').
11206
11207 However, there may be cases where you want to read escape sequences in
11208 a file as is.  In such a case, you can set this variable to non-nil.
11209 Then the code detection will ignore any escape sequences, and no text is
11210 detected as encoded in some ISO-2022 encoding.  The result is that all
11211 escape sequences become visible in a buffer.
11212
11213 The default value is nil, and it is strongly recommended not to change
11214 it.  That is because many Emacs Lisp source files that contain
11215 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11216 in Emacs's distribution, and they won't be decoded correctly on
11217 reading if you suppress escape sequence detection.
11218
11219 The other way to read escape sequences in a file without decoding is
11220 to explicitly specify some coding system that doesn't use ISO-2022
11221 escape sequences (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11222   inhibit_iso_escape_detection = 0;
11223
11224   DEFVAR_BOOL ("inhibit-null-byte-detection",
11225                inhibit_null_byte_detection,
11226                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11227 By default, Emacs treats text containing null bytes as binary data,
11228 and does not attempt to decode it.  The effect is as if you specified
11229 `no-conversion' for reading that text.
11230
11231 Set this to non-nil when regular text happens to include null bytes.
11232 Examples are Index nodes of Info files and null-byte delimited output
11233 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11234 decode text as usual.  */);
11235   inhibit_null_byte_detection = 0;
11236
11237   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11238                doc: /* If non-nil, Emacs does not optimize the code decoder for ASCII files.
11239 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11240   disable_ascii_optimization = 0;
11241
11242   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11243                doc: /* Char table for translating self-inserting characters.
11244 This is applied to the result of input methods, not their input.
11245 See also `keyboard-translate-table'.
11246
11247 Use of this variable for character code unification was rendered
11248 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11249 internal character representation.  */);
11250   Vtranslation_table_for_input = Qnil;
11251
11252   Lisp_Object args[coding_arg_undecided_max];
11253   memclear (args, sizeof args);
11254
11255   Lisp_Object plist[] =
11256     {
11257       QCname,
11258       args[coding_arg_name] = Qno_conversion,
11259       QCmnemonic,
11260       args[coding_arg_mnemonic] = make_number ('='),
11261       intern_c_string (":coding-type"),
11262       args[coding_arg_coding_type] = Qraw_text,
11263       QCascii_compatible_p,
11264       args[coding_arg_ascii_compatible_p] = Qt,
11265       QCdefault_char,
11266       args[coding_arg_default_char] = make_number (0),
11267       intern_c_string (":for-unibyte"),
11268       args[coding_arg_for_unibyte] = Qt,
11269       intern_c_string (":docstring"),
11270       (build_pure_c_string
11271        ("Do no conversion.\n"
11272         "\n"
11273         "When you visit a file with this coding, the file is read into a\n"
11274         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11275         "character.")),
11276       intern_c_string (":eol-type"),
11277       args[coding_arg_eol_type] = Qunix,
11278     };
11279   args[coding_arg_plist] = CALLMANY (Flist, plist);
11280   Fdefine_coding_system_internal (coding_arg_max, args);
11281
11282   plist[1] = args[coding_arg_name] = Qundecided;
11283   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11284   plist[5] = args[coding_arg_coding_type] = Qundecided;
11285   /* This is already set.
11286      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11287   plist[8] = intern_c_string (":charset-list");
11288   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11289   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11290   plist[13] = build_pure_c_string ("No conversion on encoding, "
11291                                    "automatic conversion on decoding.");
11292   plist[15] = args[coding_arg_eol_type] = Qnil;
11293   args[coding_arg_plist] = CALLMANY (Flist, plist);
11294   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11295   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11296   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11297
11298   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11299
11300   for (int i = 0; i < coding_category_max; i++)
11301     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11302
11303 #if defined (DOS_NT)
11304   system_eol_type = Qdos;
11305 #else
11306   system_eol_type = Qunix;
11307 #endif
11308   staticpro (&system_eol_type);
11309 }
11310 #endif /* emacs */