src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2017 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
  105   not listed above, they can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
  193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce an encoded text until it
  256   reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
  302 Lisp_Object Vcoding_system_hash_table;
  303
  304 /* Format of end-of-line decided by system.  This is Qunix on
  305    Unix and Mac, Qdos on DOS/Windows.
  306    This has an effect only for external encoding (i.e. for output to
  307    file and process), not for in-buffer or Lisp string encoding.  */
  308 static Lisp_Object system_eol_type;
  309
  310 #ifdef emacs
  311
  312 /* Coding system handed between Emacs Lisp programs and C internal
  313    routines; only this one variable is declared in this block.  */
  314 /* Coding system to be used to encode text for terminal display when
  315    terminal coding system is nil.  */
  316 struct coding_system safe_terminal_coding;
  317
  318 #endif /* emacs */
  319
  320 /* Two special coding systems.  */
  321 static Lisp_Object Vsjis_coding_system;
  322 static Lisp_Object Vbig5_coding_system;
 323
 324 /* ISO2022 section */
 325
 326 #define CODING_ISO_INITIAL(coding, reg)                 \
 327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 328                      coding_attr_iso_initial),          \
 329                reg)))
 330
 331
 332 #define CODING_ISO_REQUEST(coding, charset_id)          \
 333   (((charset_id) <= (coding)->max_charset_id            \
 334     ? ((coding)->safe_charsets[charset_id] != 255       \
 335        ? (coding)->safe_charsets[charset_id]            \
 336        : -1)                                            \
 337     : -1))
 338
 339
 340 #define CODING_ISO_FLAGS(coding)        \
 341   ((coding)->spec.iso_2022.flags)
 342 #define CODING_ISO_DESIGNATION(coding, reg)     \
 343   ((coding)->spec.iso_2022.current_designation[reg])
 344 #define CODING_ISO_INVOCATION(coding, plane)    \
 345   ((coding)->spec.iso_2022.current_invocation[plane])
 346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 347   ((coding)->spec.iso_2022.single_shifting)
 348 #define CODING_ISO_BOL(coding)  \
 349   ((coding)->spec.iso_2022.bol)
 350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
 353 #define CODING_ISO_CMP_STATUS(coding)   \
 354   (&(coding)->spec.iso_2022.cmp_status)
 355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
  360 /* Control characters of ISO2022.  */
  361                         /* code */      /* function */
  362 #define ISO_CODE_SO     0x0E            /* shift-out */
  363 #define ISO_CODE_SI     0x0F            /* shift-in */
  364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  365 #define ISO_CODE_ESC    0x1B            /* escape */
  366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  369
  370 /* Every 1-byte code of ISO2022 is classified into one of the
  371    following classes.  */
  372 enum iso_code_class_type
  373   {
  374     ISO_control_0,              /* Control codes in the range
  375                                    0x00..0x1F and 0x7F, except for the
  376                                    following 4 codes.  */
  377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  381     ISO_control_1,              /* Control codes in the range
  382                                    0x80..0x9F, except for the
  383                                    following 3 codes.  */
  384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.
                                       NOTE(review): 0x7F is also named
                                       under ISO_control_0 -- confirm
                                       which class it really gets.  */
  388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  391   };
 392
  393 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
  394     `iso-flags' attribute of an iso2022 coding system.  */
  395
  396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  397    instead of the correct short-form sequence (e.g. ESC $ A).  */
  398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  399
  400 /* If set, reset graphic planes and registers at end-of-line to the
  401    initial state.  */
  402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  403
  404 /* If set, reset graphic planes and registers before any control
  405    characters to the initial state.  */
  406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  407
  408 /* If set, encode by 7-bit environment.  */
  409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  410
  411 /* If set, use locking-shift function.  */
  412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  413
  414 /* If set, use single-shift function.  Overwrite
  415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  417
  418 /* If set, use designation escape sequence.  */
  419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  420
  421 /* If set, produce revision number sequence.  */
  422 #define CODING_ISO_FLAG_REVISION        0x0080
  423
  424 /* If set, produce ISO6429's direction specifying sequence.  */
  425 #define CODING_ISO_FLAG_DIRECTION       0x0100
  426
  427 /* If set, assume designation states are reset at beginning of line on
  428    output.  */
  429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  430
  431 /* If set, designation sequence should be placed at beginning of line
  432    on output.  */
  433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  434
  435 /* If set, do not encode unsafe characters on output.  */
  436 #define CODING_ISO_FLAG_SAFE            0x0800
  437
  438 /* If set, extra latin codes (128..159) are accepted as a valid code
  439    on input.  */
  440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  441
  442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  443
  444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
  445
  446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  447
  448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  449
  450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
  451
  452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  453
  454 /* The character to be produced on output if encoding of the original
  455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
 458 /* UTF-8 section */
 459 #define CODING_UTF_8_BOM(coding)        \
 460   ((coding)->spec.utf_8_bom)
 461
 462 /* UTF-16 section */
 463 #define CODING_UTF_16_BOM(coding)       \
 464   ((coding)->spec.utf_16.bom)
 465
 466 #define CODING_UTF_16_ENDIAN(coding)    \
 467   ((coding)->spec.utf_16.endian)
 468
 469 #define CODING_UTF_16_SURROGATE(coding) \
 470   ((coding)->spec.utf_16.surrogate)
 471
 472
 473 /* CCL section */
 474 #define CODING_CCL_DECODER(coding)      \
 475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 476 #define CODING_CCL_ENCODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 478 #define CODING_CCL_VALIDS(coding)                                          \
 479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
  481 /* Index for each coding category in `coding_categories'.  Each value
         is also the bit position used by the matching CATEGORY_MASK_XXX
         macro, and coding_category_max counts the categories.  */
  482
  483 enum coding_category
  484   {
  485     coding_category_iso_7,
  486     coding_category_iso_7_tight,
  487     coding_category_iso_8_1,
  488     coding_category_iso_8_2,
  489     coding_category_iso_7_else,
  490     coding_category_iso_8_else,
  491     coding_category_utf_8_auto,
  492     coding_category_utf_8_nosig,
  493     coding_category_utf_8_sig,
  494     coding_category_utf_16_auto,
  495     coding_category_utf_16_be,
  496     coding_category_utf_16_le,
  497     coding_category_utf_16_be_nosig,
  498     coding_category_utf_16_le_nosig,
  499     coding_category_charset,
  500     coding_category_sjis,
  501     coding_category_big5,
  502     coding_category_ccl,
  503     coding_category_emacs_mule,
  504     /* All above are targets of code detection.  */
  505     coding_category_raw_text,
  506     coding_category_undecided,
  507     coding_category_max
  508   };
 509
  510 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask
         is the single bit numbered by its enum coding_category value.  */
  511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  531
  532 /* This value is returned if detect_coding_mask () finds nothing but
  533    ASCII characters.  It is every mask above but CATEGORY_MASK_RAW_TEXT.  */
  534 #define CATEGORY_MASK_ANY               \
  535   (CATEGORY_MASK_ISO_7                  \
  536    | CATEGORY_MASK_ISO_7_TIGHT          \
  537    | CATEGORY_MASK_ISO_8_1              \
  538    | CATEGORY_MASK_ISO_8_2              \
  539    | CATEGORY_MASK_ISO_7_ELSE           \
  540    | CATEGORY_MASK_ISO_8_ELSE           \
  541    | CATEGORY_MASK_UTF_8_AUTO           \
  542    | CATEGORY_MASK_UTF_8_NOSIG          \
  543    | CATEGORY_MASK_UTF_8_SIG            \
  544    | CATEGORY_MASK_UTF_16_AUTO          \
  545    | CATEGORY_MASK_UTF_16_BE            \
  546    | CATEGORY_MASK_UTF_16_LE            \
  547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  549    | CATEGORY_MASK_CHARSET              \
  550    | CATEGORY_MASK_SJIS                 \
  551    | CATEGORY_MASK_BIG5                 \
  552    | CATEGORY_MASK_CCL                  \
  553    | CATEGORY_MASK_EMACS_MULE)
  554
  555
  556 #define CATEGORY_MASK_ISO_7BIT \
  557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  558
  559 #define CATEGORY_MASK_ISO_8BIT \
  560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  561
  562 #define CATEGORY_MASK_ISO_ELSE \
  563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  564
  565 #define CATEGORY_MASK_ISO_ESCAPE        \
  566   (CATEGORY_MASK_ISO_7                  \
  567    | CATEGORY_MASK_ISO_7_TIGHT          \
  568    | CATEGORY_MASK_ISO_7_ELSE           \
  569    | CATEGORY_MASK_ISO_8_ELSE)
  570
  571 #define CATEGORY_MASK_ISO       \
  572   (  CATEGORY_MASK_ISO_7BIT     \
  573      | CATEGORY_MASK_ISO_8BIT   \
  574      | CATEGORY_MASK_ISO_ELSE)
  575
  576 #define CATEGORY_MASK_UTF_16            \
  577   (CATEGORY_MASK_UTF_16_AUTO            \
  578    | CATEGORY_MASK_UTF_16_BE            \
  579    | CATEGORY_MASK_UTF_16_LE            \
  580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  582
  583 #define CATEGORY_MASK_UTF_8     \
  584   (CATEGORY_MASK_UTF_8_AUTO     \
  585    | CATEGORY_MASK_UTF_8_NOSIG  \
  586    | CATEGORY_MASK_UTF_8_SIG)
 587
  588 /* Table of coding categories (Lisp symbols).  This variable is for
  589    internal use only.  */
  590 static Lisp_Object Vcoding_category_table;
  591
  592 /* Coding categories ordered by priority (coding_category_max slots).  */
  593 static enum coding_category coding_priorities[coding_category_max];
  594
  595 /* Nth element is a coding context for the coding system bound to the
  596    Nth coding category (coding_category_max slots as well).  */
  597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return 0 < encoded_flag + var;
 614 }
 615
 616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 617   do {                                                  \
 618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 620   } while (0)
 621
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
 645
 646
  647 /* Safely get one byte from the source text pointed to by SRC which
  648    ends at SRC_END, and set C to that byte.  If the source is used up,
  649    jump to 'no_more_source' (recording CODING_RESULT_INSUFFICIENT_SRC
  650    if SRC_BASE < SRC).  If MULTIBYTEP and a multibyte character other
  651    than a 0xC0/0xC1 two-byte form is found at SRC, set C to the negated
  652    character code and record CODING_RESULT_INVALID_SRC.  Caller must set:
  653         src, src_end, src_base, multibytep, coding, consumed_chars */
  654
  655 #define ONE_MORE_BYTE(c)                                \
  656   do {                                                  \
  657     if (src == src_end)                                 \
  658       {                                                 \
  659         if (src_base < src)                             \
  660           record_conversion_result                      \
  661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  662         goto no_more_source;                            \
  663       }                                                 \
  664     c = *src++;                                         \
  665     if (multibytep && (c & 0x80))                       \
  666       {                                                 \
  667         if ((c & 0xFE) == 0xC0)                         \
  668           c = ((c & 1) << 6) | *src++;                  \
  669         else                                            \
  670           {                                             \
  671             src--;                                      \
  672             c = - string_char (src, &src, NULL);        \
  673             record_conversion_result                    \
  674               (coding, CODING_RESULT_INVALID_SRC);      \
  675           }                                             \
  676       }                                                 \
  677     consumed_chars++;                                   \
  678   } while (0)
 679
  680 /* Safely get two bytes from the source text pointed to by SRC which
  681    ends at SRC_END, and set C1 and C2 to those bytes while skipping the
  682    heading multibyte characters.  If there are not enough bytes in the
  683    source, it jumps to 'no_more_source'.  If MULTIBYTEP and a multibyte
  684    character other than a 0xC0/0xC1 two-byte form is found for C2, set
  685    C2 to -1.  The caller should declare and set these variables
  686    appropriately in advance:
  687         src, src_end, multibytep
  688    It is intended that this macro is used in detect_coding_utf_16.  */
  689
  690 #define TWO_MORE_BYTES(c1, c2)                          \
  691   do {                                                  \
  692     do {                                                \
  693       if (src == src_end)                               \
  694         goto no_more_source;                            \
  695       c1 = *src++;                                      \
  696       if (multibytep && (c1 & 0x80))                    \
  697         {                                               \
  698           if ((c1 & 0xFE) == 0xC0)                      \
  699             c1 = ((c1 & 1) << 6) | *src++;              \
  700           else                                          \
  701             {                                           \
  702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  703               c1 = -1;                                  \
  704             }                                           \
  705         }                                               \
  706     } while (c1 < 0);                                   \
  707     if (src == src_end)                                 \
  708       goto no_more_source;                              \
  709     c2 = *src++;                                        \
  710     if (multibytep && (c2 & 0x80))                      \
  711       {                                                 \
  712         if ((c2 & 0xFE) == 0xC0)                        \
  713           c2 = ((c2 & 1) << 6) | *src++;                \
  714         else                                            \
  715           c2 = -1;                                      \
  716       }                                                 \
  717   } while (0)
 718
 719
 720 /* Store a byte C in the place pointed by DST and increment DST to the
 721    next free point, and increment PRODUCED_CHARS.  The caller should
 722    assure that C is 0..127, and declare and set the variable `dst'
 723    appropriately in advance.
 724 */
 725
 726
 727 #define EMIT_ONE_ASCII_BYTE(c)  \
 728   do {                          \
 729     produced_chars++;           \
 730     *dst++ = (c);               \
 731   } while (0)
 732
 733
 734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 735
 736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 737   do {                                  \
 738     produced_chars += 2;                \
 739     *dst++ = (c1), *dst++ = (c2);       \
 740   } while (0)
 741
 742
 743 /* Store a byte C in the place pointed by DST and increment DST to the
 744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 745    store in an appropriate multibyte form.  The caller should
 746    declare and set the variables `dst' and `multibytep' appropriately
 747    in advance.  */
 748
 749 #define EMIT_ONE_BYTE(c)                \
 750   do {                                  \
 751     produced_chars++;                   \
 752     if (multibytep)                     \
 753       {                                 \
 754         unsigned ch = (c);              \
 755         if (ch >= 0x80)                 \
 756           ch = BYTE8_TO_CHAR (ch);      \
 757         CHAR_STRING_ADVANCE (ch, dst);  \
 758       }                                 \
 759     else                                \
 760       *dst++ = (c);                     \
 761   } while (0)
 762
 763
 /* Two-byte variant of EMIT_ONE_BYTE: append C1 and then C2 at DST,
    each stored raw when MULTIBYTEP is false and in multibyte form
    otherwise, and count both in PRODUCED_CHARS.  */

 #define EMIT_TWO_BYTES(c1, c2)                    \
   do {                                            \
     produced_chars += 2;                          \
     if (! multibytep)                             \
       {                                           \
         *dst++ = (c1);                            \
         *dst++ = (c2);                            \
       }                                           \
     else                                          \
       {                                           \
         unsigned emit_ch_ = (c1);                 \
                                                   \
         if (emit_ch_ >= 0x80)                     \
           emit_ch_ = BYTE8_TO_CHAR (emit_ch_);    \
         CHAR_STRING_ADVANCE (emit_ch_, dst);      \
                                                   \
         emit_ch_ = (c2);                          \
         if (emit_ch_ >= 0x80)                     \
           emit_ch_ = BYTE8_TO_CHAR (emit_ch_);    \
         CHAR_STRING_ADVANCE (emit_ch_, dst);      \
       }                                           \
   } while (0)
 788
 789
 /* Three-byte variant of EMIT_ONE_BYTE: append C1, C2 and C3 in that
    order.  */

 #define EMIT_THREE_BYTES(c1, c2, c3)    \
   do {                                  \
     EMIT_TWO_BYTES (c1, c2);            \
     EMIT_ONE_BYTE (c3);                 \
   } while (0)
 795
 796
 /* Four-byte variant of EMIT_ONE_BYTE: append C1, C2, C3 and C4 in
    that order.  */

 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
   do {                                          \
     EMIT_ONE_BYTE (c1);                         \
     EMIT_ONE_BYTE (c2);                         \
     EMIT_ONE_BYTE (c3);                         \
     EMIT_ONE_BYTE (c4);                         \
   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 /* Wrappers that keep pointers into buffer text valid across calls to
    decode_char, encode_char and the like.  Such a call may load a
    charset map; loading one allocates large structures, which can
    relocate buffers.  Each wrapper clears charset_map_loaded, makes
    the call and, when a map was loaded, re-derives the source or
    destination address and shifts the caller's pointers by the
    distance it moved.  */

 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
   do {                                                                       \
     charset_map_loaded = 0;                                                  \
     c = DECODE_CHAR (charset, code);                                         \
     if (charset_map_loaded)                                                  \
       {                                                                      \
         ptrdiff_t offset = coding_change_source (coding);                    \
                                                                              \
         if (offset != 0)                                                     \
           {                                                                  \
             src += offset;                                                   \
             src_base += offset;                                              \
             src_end += offset;                                               \
           }                                                                  \
       }                                                                      \
   } while (0)
 851
 /* Set CODE to ENCODE_CHAR (CHARSET, C).  If that loaded a charset
    map, re-derive the destination of CODING and shift DST and DST_END
    by the distance it moved.  */

 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
   do {                                                                  \
     charset_map_loaded = 0;                                             \
     code = ENCODE_CHAR (charset, c);                                    \
     if (charset_map_loaded)                                             \
       {                                                                 \
         ptrdiff_t offset = coding_change_destination (coding);          \
                                                                         \
         if (offset != 0)                                                \
           {                                                             \
             dst += offset;                                              \
             dst_end += offset;                                          \
           }                                                             \
       }                                                                 \
   } while (0)
 865
 /* Set CHARSET to char_charset (C, CHARSET_LIST, CODE_RETURN).  If
    that loaded a charset map, re-derive the destination of CODING and
    shift DST and DST_END by the distance it moved.  */

 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
   do {                                                                  \
     charset_map_loaded = 0;                                             \
     charset = char_charset (c, charset_list, code_return);              \
     if (charset_map_loaded)                                             \
       {                                                                 \
         ptrdiff_t offset = coding_change_destination (coding);          \
                                                                         \
         if (offset != 0)                                                \
           {                                                             \
             dst += offset;                                              \
             dst_end += offset;                                          \
           }                                                             \
       }                                                                 \
   } while (0)
 879
 /* Set RESULT to CHAR_CHARSET_P (C, CHARSET).  If that loaded a
    charset map, re-derive the destination of CODING and shift DST and
    DST_END by the distance it moved.  */

 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
   do {                                                                  \
     charset_map_loaded = 0;                                             \
     result = CHAR_CHARSET_P (c, charset);                               \
     if (charset_map_loaded)                                             \
       {                                                                 \
         ptrdiff_t offset = coding_change_destination (coding);          \
                                                                         \
         if (offset != 0)                                                \
           {                                                             \
             dst += offset;                                              \
             dst_end += offset;                                          \
           }                                                             \
       }                                                                 \
   } while (0)
 893
 894
 /* Make sure that more than BYTES bytes of room remain at DST.  If
    they do not, enlarge coding->destination with alloc_destination,
    asking for BYTES plus one byte per character still pending in the
    character buffer, and update DST and DST_END.  We don't have to
    take care of coding->source which will be relocated.  It is
    handled by calling coding_set_source in encode_coding.

    The caller must have the variables `coding', `charbuf',
    `charbuf_end', `dst' and `dst_end' declared and set up.  BYTES is
    evaluated more than once.  */

 #define ASSURE_DESTINATION(bytes)                               \
   do {                                                          \
     /* Compare the remaining room rather than forming           \
        DST + BYTES, which could point beyond the end of the     \
        destination.  */                                         \
     if (dst_end - dst <= (bytes))                               \
       {                                                         \
         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
                                                                 \
         dst = alloc_destination (coding, more_bytes, dst);      \
         dst_end = coding->destination + coding->dst_bytes;      \
       }                                                         \
   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE, so this macro is now a
        plain alias for CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR, so this
        macro is now a plain alias for STRING_CHAR_ADVANCE.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
     /* Enlarge the gap of the destination buffer of CODING by calling
        make_gap or make_gap_1 with BYTES.  GAP_HEAD_USED is the number
        of bytes at the head of the gap that already hold produced data;
        it is only consulted when the source and the destination are the
        same object.  */
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
           /* Undo the temporary adjustments made above, in reverse.  */
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
         /* The destination is another object: enlarge the gap of that
            buffer directly.  */
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* Annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
1070    The format of the following elements depends on ANNOTATION_MASK.
1071
1072    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1073    follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF, advancing
   BUF past them: the negated length LEN, then MASK, then NCHARS.
   Also set coding->annotated.  The caller must have the variable
   `coding' in scope.  BUF is evaluated more than once.

   The definition deliberately does not end in a semicolon, so that a
   use followed by `;' forms exactly one statement and can be the body
   of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1102
/* Store a composition annotation at BUF, advancing BUF past it: the
   annotation header for NCHARS characters written by
   ADD_ANNOTATION_DATA with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.
   BUF is evaluated more than once; it is parenthesized here as it is
   in ADD_ANNOTATION_DATA, so that an argument such as `*pp' advances
   the same object throughout.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1109
1110
/* Store a charset annotation at BUF, advancing BUF past it: the
   annotation header for NCHARS characters written by
   ADD_ANNOTATION_DATA with length 4 and CODING_ANNOTATE_CHARSET_MASK,
   followed by the charset ID.  BUF is evaluated more than once; it is
   parenthesized here as it is in ADD_ANNOTATION_DATA, so that an
   argument such as `*pp' advances the same object throughout.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Bit flags recorded in coding->eol_seen, one per end-of-line
   convention; EOL_SEEN_NONE is the empty set.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
/* Classify the octet C by its high-order bits: a one-octet (ASCII)
   character, a trailing octet, or the leading octet of a sequence of
   two to five octets.  */
#define UTF_8_1_OCTET_P(c)         (0x80 > (c))
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == ((c) & 0xC0))
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))

/* The three octets of the UTF-8 BOM, in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
1147 /* Unlike the other detect_coding_XXX, this function counts the number
1148    of characters and checks the EOL format.

        The scan starts after the first coding->head_ascii bytes of the
        source, which are counted as that many characters.  A malformed
        sequence adds CATEGORY_MASK_UTF_8 to DETECT_INFO->rejected and
        makes the function return false.  When the source runs out, the
        found and rejected masks are updated, the numbers of bytes and
        characters scanned are stored in coding->detected_utf8_bytes and
        coding->detected_utf8_chars, and true is returned, unless a
        sequence was cut short at the end of the last block, which is
        also rejected.  */
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   ptrdiff_t nchars = coding->head_ascii;
1160   int eol_seen = coding->eol_seen;
1161
1162   detect_info->checked |= CATEGORY_MASK_UTF_8;
1163   /* A coding system of this category is always ASCII compatible.  */
1164   src += nchars;
1165
1166   if (src == coding->source     /* BOM should be at the head.  */
1167       && src + 3 < src_end      /* BOM is 3-byte long.  */
1168       && src[0] == UTF_8_BOM_1
1169       && src[1] == UTF_8_BOM_2
1170       && src[2] == UTF_8_BOM_3)
1171     {
1172       bom_found = 1;
1173       src += 3;
1174       nchars++;
1175     }
1176
       /* Each iteration scans one character.  `continue' means that it
          was well formed; `break' means that the sequence is invalid.  */
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4;
1180
1181       src_base = src;
1182       ONE_MORE_BYTE (c);
1183       if (c < 0 || UTF_8_1_OCTET_P (c))
1184         {
1185           nchars++;
1186           if (c == '\r')
1187             {
1188               if (src < src_end && *src == '\n')
1189                 {
1190                   eol_seen |= EOL_SEEN_CRLF;
1191                   src++;
1192                   nchars++;
1193                 }
1194               else
1195                 eol_seen |= EOL_SEEN_CR;
1196             }
1197           else if (c == '\n')
1198             eol_seen |= EOL_SEEN_LF;
1199           continue;
1200         }
1201       ONE_MORE_BYTE (c1);
1202       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1203         break;
1204       if (UTF_8_2_OCTET_LEADING_P (c))
1205         {
1206           nchars++;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c2);
1210       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1211         break;
1212       if (UTF_8_3_OCTET_LEADING_P (c))
1213         {
1214           nchars++;
1215           continue;
1216         }
1217       ONE_MORE_BYTE (c3);
1218       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1219         break;
1220       if (UTF_8_4_OCTET_LEADING_P (c))
1221         {
1222           nchars++;
1223           continue;
1224         }
1225       ONE_MORE_BYTE (c4);
1226       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227         break;
1228       if (UTF_8_5_OCTET_LEADING_P (c))
1229         {
1230           nchars++;
1231           continue;
1232         }
1233       break;
1234     }
       /* Only reached by `break': an invalid sequence was seen.  */
1235   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1236   return 0;
1237
       /* NOTE(review): no goto to this label is visible in this function;
          presumably ONE_MORE_BYTE jumps here when SRC reaches SRC_END --
          confirm against its definition.  */
1238  no_more_source:
       /* A sequence cut short at the end of the last block is invalid.  */
1239   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1240     {
1241       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1242       return 0;
1243     }
1244   if (bom_found)
1245     {
1246       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1247       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1248     }
1249   else
1250     {
1251       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1252       if (nchars < src_end - coding->source)
1253         /* The found characters are less than source bytes, which
1254            means that we found a valid non-ASCII characters.  */
1255         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1256     }
       /* NOTE(review): the local eol_seen accumulated above is not
          written back to CODING anywhere in the code visible here.  */
1257   coding->detected_utf8_bytes = src_base - coding->source;
1258   coding->detected_utf8_chars = nchars;
1259   return 1;
1260 }
1261
1262
     /* Decode UTF-8 bytes of coding->source, starting at offset
        coding->consumed, into character codes appended to
        coding->charbuf.  Decoding stops when the character buffer is
        full or at the label no_more_source.  On return
        coding->consumed_char, coding->consumed and coding->charbuf_used
        are updated.  A byte that does not start a valid sequence is
        stored as a raw byte character.

        NOTE(review): no goto to no_more_source is visible in this
        function; presumably ONE_MORE_BYTE jumps there when the source is
        exhausted -- confirm against its definition.  */
1263 static void
1264 decode_coding_utf_8 (struct coding_system *coding)
1265 {
1266   const unsigned char *src = coding->source + coding->consumed;
1267   const unsigned char *src_end = coding->source + coding->src_bytes;
1268   const unsigned char *src_base;
1269   int *charbuf = coding->charbuf + coding->charbuf_used;
1270   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1271   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1272   bool multibytep = coding->src_multibyte;
1273   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1274   bool eol_dos
1275     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1276   int byte_after_cr = -1;
1277
       /* Skip a leading BOM if there is one.  Anything else is left in
          place, to be decoded as ordinary text by the loop below.  */
1278   if (bom != utf_without_bom)
1279     {
1280       int c1, c2, c3;
1281
1282       src_base = src;
1283       ONE_MORE_BYTE (c1);
1284       if (! UTF_8_3_OCTET_LEADING_P (c1))
1285         src = src_base;
1286       else
1287         {
1288           ONE_MORE_BYTE (c2);
1289           if (! UTF_8_EXTRA_OCTET_P (c2))
1290             src = src_base;
1291           else
1292             {
1293               ONE_MORE_BYTE (c3);
1294               if (! UTF_8_EXTRA_OCTET_P (c3))
1295                 src = src_base;
1296               else
1297                 {
1298                   if ((c1 != UTF_8_BOM_1)
1299                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1300                     src = src_base;
1301                   else
                         /* NOTE(review): the same assignment is made
                            unconditionally right after this block, so
                            this one looks redundant.  */
1302                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1303                 }
1304             }
1305         }
1306     }
       /* Whatever was found, do not look for a BOM again.  */
1307   CODING_UTF_8_BOM (coding) = utf_without_bom;
1308
1309   while (1)
1310     {
1311       int c, c1, c2, c3, c4, c5;
1312
           /* Remember where this character starts, so that the invalid
              code path and the final bookkeeping can go back to it.  */
1313       src_base = src;
1314       consumed_chars_base = consumed_chars;
1315
1316       if (charbuf >= charbuf_end)
1317         {
               /* The byte prefetched after a CR has not been stored yet;
                  step back so that it is not counted as consumed.  */
1318           if (byte_after_cr >= 0)
1319             src_base--;
1320           break;
1321         }
1322
1323       /* In the simple case, rapidly handle ordinary characters */
1324       if (multibytep && ! eol_dos
1325           && charbuf < charbuf_end - 6 && src < src_end - 6)
1326         {
1327           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1328             {
1329               c1 = *src;
1330               if (c1 & 0x80)
1331                 break;
1332               src++;
1333               consumed_chars++;
1334               *charbuf++ = c1;
1335
1336               c1 = *src;
1337               if (c1 & 0x80)
1338                 break;
1339               src++;
1340               consumed_chars++;
1341               *charbuf++ = c1;
1342
1343               c1 = *src;
1344               if (c1 & 0x80)
1345                 break;
1346               src++;
1347               consumed_chars++;
1348               *charbuf++ = c1;
1349
1350               c1 = *src;
1351               if (c1 & 0x80)
1352                 break;
1353               src++;
1354               consumed_chars++;
1355               *charbuf++ = c1;
1356             }
1357           /* If we handled at least one character, restart the main loop.  */
1358           if (src != src_base)
1359             continue;
1360         }
1361
           /* Use the byte prefetched after a CR, if any, before reading
              a new one.  */
1362       if (byte_after_cr >= 0)
1363         c1 = byte_after_cr, byte_after_cr = -1;
1364       else
1365         ONE_MORE_BYTE (c1);
1366       if (c1 < 0)
1367         {
               /* NOTE(review): ONE_MORE_BYTE apparently can yield a
                  negative value; it is stored negated -- see the
                  definition of that macro for what it denotes.  */
1368           c = - c1;
1369         }
1370       else if (UTF_8_1_OCTET_P (c1))
1371         {
               /* With DOS end-of-line, prefetch the byte that follows a
                  CR; it becomes C1 of the next iteration.  */
1372           if (eol_dos && c1 == '\r')
1373             ONE_MORE_BYTE (byte_after_cr);
1374           c = c1;
1375         }
1376       else
1377         {
1378           ONE_MORE_BYTE (c2);
1379           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1380             goto invalid_code;
1381           if (UTF_8_2_OCTET_LEADING_P (c1))
1382             {
1383               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1384               /* Reject overlong sequences here and below.  Encoders
1385                  producing them are incorrect, they can be misleading,
1386                  and they mess up read/write invariance.  */
1387               if (c < 128)
1388                 goto invalid_code;
1389             }
1390           else
1391             {
1392               ONE_MORE_BYTE (c3);
1393               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1394                 goto invalid_code;
1395               if (UTF_8_3_OCTET_LEADING_P (c1))
1396                 {
1397                   c = (((c1 & 0xF) << 12)
1398                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1399                   if (c < 0x800
1400                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1401                     goto invalid_code;
1402                 }
1403               else
1404                 {
1405                   ONE_MORE_BYTE (c4);
1406                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1407                     goto invalid_code;
1408                   if (UTF_8_4_OCTET_LEADING_P (c1))
1409                     {
1410                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1411                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1412                     if (c < 0x10000)
1413                       goto invalid_code;
1414                     }
1415                   else
1416                     {
1417                       ONE_MORE_BYTE (c5);
1418                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1419                         goto invalid_code;
1420                       if (UTF_8_5_OCTET_LEADING_P (c1))
1421                         {
1422                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1423                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1424                                | (c5 & 0x3F));
1425                           if ((c > MAX_CHAR) || (c < 0x200000))
1426                             goto invalid_code;
1427                         }
1428                       else
1429                         goto invalid_code;
1430                     }
1431                 }
1432             }
1433         }
1434
1435       *charbuf++ = c;
1436       continue;
1437
1438     invalid_code:
           /* Go back to the start of the offending sequence, read only
              its first byte again and store it as is when it is ASCII,
              or as a raw byte character otherwise.  Decoding resumes at
              the byte that follows it.  */
1439       src = src_base;
1440       consumed_chars = consumed_chars_base;
1441       ONE_MORE_BYTE (c);
1442       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1443     }
1444
1445  no_more_source:
       /* Account only for input up to the start of the character that
          was being read when decoding stopped.  */
1446   coding->consumed_char += consumed_chars_base;
1447   coding->consumed = src_base - coding->source;
1448   coding->charbuf_used = charbuf - coding->charbuf;
1449 }
1450
1451
1452 bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
1515 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1516    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1517
/* True if the bits of VAL selected by the mask 0xFC00 are those of a
   UTF-16 high surrogate (0xD800..0xDBFF).  */
#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == ((val) & 0xFC00))

/* Likewise for a UTF-16 low surrogate (0xDC00..0xDFFF).  */
#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == ((val) & 0xFC00))
1523
1524
     /* Examine coding->source for UTF-16 and update DETECT_INFO.

        An odd coding->src_chars in the last block rejects every UTF-16
        category and returns false.  If the first two bytes are a BOM,
        the matching BOM category and CATEGORY_MASK_UTF_16_AUTO are added
        to DETECT_INFO->found, the other BOM category and both NOSIG
        categories are added to DETECT_INFO->rejected, and true is
        returned.  Without a BOM, the BOM-dependent categories are
        rejected and the remaining byte pairs are only used to reject
        the NOSIG categories when the bytes look like binary data.

        NOTE(review): no goto to no_more_source is visible in this
        function; presumably TWO_MORE_BYTES jumps there when the source
        is exhausted, in which case true is returned -- confirm against
        its definition.  */
1525 static bool
1526 detect_coding_utf_16 (struct coding_system *coding,
1527                       struct coding_detection_info *detect_info)
1528 {
1529   const unsigned char *src = coding->source;
1530   const unsigned char *src_end = coding->source + coding->src_bytes;
1531   bool multibytep = coding->src_multibyte;
1532   int c1, c2;
1533
1534   detect_info->checked |= CATEGORY_MASK_UTF_16;
1535   if (coding->mode & CODING_MODE_LAST_BLOCK
1536       && (coding->src_chars & 1))
1537     {
1538       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1539       return 0;
1540     }
1541
1542   TWO_MORE_BYTES (c1, c2);
       /* The bytes FF FE are a little-endian BOM.  */
1543   if ((c1 == 0xFF) && (c2 == 0xFE))
1544     {
1545       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1546                              | CATEGORY_MASK_UTF_16_AUTO);
1547       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1548                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1549                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1550     }
       /* The bytes FE FF are a big-endian BOM.  */
1551   else if ((c1 == 0xFE) && (c2 == 0xFF))
1552     {
1553       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1554                              | CATEGORY_MASK_UTF_16_AUTO);
1555       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1556                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1557                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1558     }
1559   else if (c2 < 0)
1560     {
1561       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1562       return 0;
1563     }
1564   else
1565     {
1566       /* We check the dispersion of Eth and Oth bytes where E is even and
1567          O is odd.  If both are high, we assume binary data.*/
           /* E and O flag the byte values seen so far at even and odd
              positions; E_NUM and O_NUM count the distinct ones.  */
1568       unsigned char e[256], o[256];
1569       unsigned e_num = 1, o_num = 1;
1570
1571       memset (e, 0, 256);
1572       memset (o, 0, 256);
1573       e[c1] = 1;
1574       o[c2] = 1;
1575
1576       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1577                                 |CATEGORY_MASK_UTF_16_BE
1578                                 | CATEGORY_MASK_UTF_16_LE);
1579
           /* Keep scanning until every UTF-16 category is rejected.  */
1580       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1581              != CATEGORY_MASK_UTF_16)
1582         {
1583           TWO_MORE_BYTES (c1, c2);
1584           if (c2 < 0)
1585             break;
1586           if (! e[c1])
1587             {
1588               e[c1] = 1;
1589               e_num++;
1590               if (e_num >= 128)
1591                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1592             }
1593           if (! o[c2])
1594             {
1595               o[c2] = 1;
1596               o_num++;
1597               if (o_num >= 128)
1598                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1599             }
1600         }
1601       return 0;
1602     }
1603
       /* Also reached by falling through after a BOM was found.  */
1604  no_more_source:
1605   return 1;
1606 }
1607
1608 static void
1609 decode_coding_utf_16 (struct coding_system *coding)
1610 {
1611   const unsigned char *src = coding->source + coding->consumed;
1612   const unsigned char *src_end = coding->source + coding->src_bytes;
1613   const unsigned char *src_base;
1614   int *charbuf = coding->charbuf + coding->charbuf_used;
1615   /* We may produces at most 3 chars in one loop.  */
1616   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1617   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1618   bool multibytep = coding->src_multibyte;
1619   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1620   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1621   int surrogate = CODING_UTF_16_SURROGATE (coding);
1622   bool eol_dos
1623     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1624   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1625
1626   if (bom == utf_with_bom)
1627     {
1628       int c, c1, c2;
1629
1630       src_base = src;
1631       ONE_MORE_BYTE (c1);
1632       ONE_MORE_BYTE (c2);
1633       c = (c1 << 8) | c2;
1634
1635       if (endian == utf_16_big_endian
1636           ? c != 0xFEFF : c != 0xFFFE)
1637         {
1638           /* The first two bytes are not BOM.  Treat them as bytes
1639              for a normal character.  */
1640           src = src_base;
1641         }
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644   else if (bom == utf_detect_bom)
1645     {
1646       /* We have already tried to detect BOM and failed in
1647          detect_coding.  */
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650
1651   while (1)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;
1656       consumed_chars_base = consumed_chars;
1657
1658       if (charbuf >= charbuf_end)
1659         {
1660           if (byte_after_cr1 >= 0)
1661             src_base -= 2;
1662           break;
1663         }
1664
1665       if (byte_after_cr1 >= 0)
1666         c1 = byte_after_cr1, byte_after_cr1 = -1;
1667       else
1668         ONE_MORE_BYTE (c1);
1669       if (c1 < 0)
1670         {
1671           *charbuf++ = -c1;
1672           continue;
1673         }
1674       if (byte_after_cr2 >= 0)
1675         c2 = byte_after_cr2, byte_after_cr2 = -1;
1676       else
1677         ONE_MORE_BYTE (c2);
1678       if (c2 < 0)
1679         {
1680           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1681           *charbuf++ = -c2;
1682           continue;
1683         }
1684       c = (endian == utf_16_big_endian
1685            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1686
1687       if (surrogate)
1688         {
1689           if (! UTF_16_LOW_SURROGATE_P (c))
1690             {
1691               if (endian == utf_16_big_endian)
1692                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1693               else
1694                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1695               *charbuf++ = c1;
1696               *charbuf++ = c2;
1697               if (UTF_16_HIGH_SURROGATE_P (c))
1698                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1699               else
1700                 *charbuf++ = c;
1701             }
1702           else
1703             {
1704               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1705               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1706               *charbuf++ = 0x10000 + c;
1707             }
1708         }
1709       else
1710         {
1711           if (UTF_16_HIGH_SURROGATE_P (c))
1712             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1713           else
1714             {
1715               if (eol_dos && c == '\r')
1716                 {
1717                   ONE_MORE_BYTE (byte_after_cr1);
1718                   ONE_MORE_BYTE (byte_after_cr2);
1719                 }
1720               *charbuf++ = c;
1721             }
1722         }
1723     }
1724
1725  no_more_source:
1726   coding->consumed_char += consumed_chars_base;
1727   coding->consumed = src_base - coding->source;
1728   coding->charbuf_used = charbuf - coding->charbuf;
1729 }
1730
1731 static bool
1732 encode_coding_utf_16 (struct coding_system *coding)
1733 {
1734   bool multibytep = coding->dst_multibyte;
1735   int *charbuf = coding->charbuf;
1736   int *charbuf_end = charbuf + coding->charbuf_used;
1737   unsigned char *dst = coding->destination + coding->produced;
1738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1739   int safe_room = 8;
1740   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1741   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1742   ptrdiff_t produced_chars = 0;
1743   int c;
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xBF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in a data sent to a
1825    process).  But, it is possible to have a text externally in this
1826    format (i.e. by encoding by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of composition method (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded by two-byte of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form, but in one of these special formats:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
/* Table indexed by a byte value.  The code in this file reads entry C
   as the total byte length (1 through 4) of the emacs-mule sequence
   whose leading byte is C.  */
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Return true if a text is encoded in 'emacs-mule'.

        The scan starts after the first CODING->head_ascii bytes of
        CODING->source.  The category is always recorded in
        DETECT_INFO->checked.

        Return 1 when the source is exhausted without meeting a malformed
        sequence; DETECT_INFO->found then gets the category mask if at
        least one multibyte or composition sequence was seen.

        Return 0, with the category added to DETECT_INFO->rejected, when
        a malformed sequence or one of ESC, SI, SO is met, or when the
        source ends after part of a sequence was read and CODING is
        processing its last block.  */
1867
1868 static bool
1869 detect_coding_emacs_mule (struct coding_system *coding,
1870                           struct coding_detection_info *detect_info)
1871 {
1872   const unsigned char *src = coding->source, *src_base;
1873   const unsigned char *src_end = coding->source + coding->src_bytes;
1874   bool multibytep = coding->src_multibyte;
1875   ptrdiff_t consumed_chars = 0;
1876   int c;
       /* Category mask to report on success; stays 0 until a non-ASCII
          sequence has been validated.  */
1877   int found = 0;
1878
1879   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1880   /* A coding system of this category is always ASCII compatible.  */
1881   src += coding->head_ascii;
1882
1883   while (1)
1884     {
           /* Remember where this sequence starts, so that the
              end-of-source code can tell whether it was cut short.  */
1885       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
              presumably jumps to `no_more_source' when the source is
              exhausted -- confirm against its definition.  */
1886       ONE_MORE_BYTE (c);
           /* NOTE(review): a negative C presumably comes from multibyte
              source; such a value is simply skipped.  */
1887       if (c < 0)
1888         continue;
1889       if (c == 0x80)
1890         {
1891           /* Perhaps the start of composite character.  We simply skip
1892              it because analyzing it is too heavy for detecting.  But,
1893              at least, we check that the composite character
1894              constitutes of more than 4 bytes.  */
1895           const unsigned char *src_start;
1896
1897         repeat:
1898           src_start = src;
               /* Skip the run of bytes >= 0xA0; the byte that ends the
                  run is left in C and counted in SRC.  */
1899           do
1900             {
1901               ONE_MORE_BYTE (c);
1902             }
1903           while (c >= 0xA0);
1904
               /* Too short to be composition data: reject.  */
1905           if (src - src_start <= 4)
1906             break;
1907           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition follows immediately.  */
1908           if (c == 0x80)
1909             goto repeat;
1910         }
1911
           /* C is now either a freshly read byte or the byte that ended
              a composition run.  */
1912       if (c < 0x80)
1913         {
               /* NOTE(review): presumably these controls indicate
                  ISO-2022 style text; in any case they cause
                  rejection here.  */
1914           if (c < 0x20
1915               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1916             break;
1917         }
1918       else
1919         {
               /* Number of bytes still expected after the leading byte.  */
1920           int more_bytes = emacs_mule_bytes[c] - 1;
1921
1922           while (more_bytes > 0)
1923             {
1924               ONE_MORE_BYTE (c);
1925               if (c < 0xA0)
1926                 {
1927                   src--;        /* Unread the last byte.  */
1928                   break;
1929                 }
1930               more_bytes--;
1931             }
               /* The sequence was cut short by a byte below 0xA0:
                  reject.  */
1932           if (more_bytes != 0)
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935         }
1936     }
       /* Reached only through a `break' above, i.e. on invalid data.  */
1937   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938   return 0;
1939
1940  no_more_source:
       /* SRC_BASE < SRC means some bytes of the current sequence were
          read before the source ran out; that is an error only when no
          further block will follow.  */
1941   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1942     {
1943       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1944       return 0;
1945     }
1946   detect_info->found |= found;
1947   return 1;
1948 }
1949
1950
1951 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1952    character.  If CMP_STATUS indicates that we must expect MSEQ or
1953    RULE described above, decode it and return the negative value of
1954    the decoded character or rule.  If an invalid byte is found, return
1955    -1.  If SRC is too short, return -2.

        On a successful return *NBYTES is set to the number of bytes read
        from SRC and *NCHARS to the local consumed_chars counter
        (presumably maintained by ONE_MORE_BYTE -- confirm).  If ID is
        non-null, *ID receives the charset ID of the character; it is not
        set when a composition rule byte is returned.  *NBYTES, *NCHARS
        and *ID are left untouched on the -1 and -2 returns.  */
1956
1957 static int
1958 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1959                  int *nbytes, int *nchars, int *id,
1960                  struct composition_status *cmp_status)
1961 {
1962   const unsigned char *src_end = coding->source + coding->src_bytes;
1963   const unsigned char *src_base = src;
1964   bool multibytep = coding->src_multibyte;
1965   int charset_ID;
1966   unsigned code;
1967   int c;
1968   ptrdiff_t consumed_chars = 0;
       /* True if the sequence was an old-form composition component
          (MSEQ); the result is then returned negated.  */
1969   bool mseq_found = 0;
1970
1971   ONE_MORE_BYTE (c);
1972   if (c < 0)
1973     {
           /* NOTE(review): a negative C presumably stands for a raw byte
              read from multibyte source; it is returned as the positive
              value -C with the charset emacs_mule_charset[0].  */
1974       c = -c;
1975       charset_ID = emacs_mule_charset[0];
1976     }
1977   else
1978     {
           /* A first byte in 0xA0..0xFF is valid only as old-form
              composition data.  */
1979       if (c >= 0xA0)
1980         {
1981           if (cmp_status->state != COMPOSING_NO
1982               && cmp_status->old_form)
1983             {
1984               if (cmp_status->state == COMPOSING_CHAR)
1985                 {
                       /* MSEQ: undo the special encoding to recover the
                          ordinary leading byte.  */
1986                   if (c == 0xA0)
1987                     {
                           /* ASCII component, stored as ASCII_CODE+0x80.  */
1988                       ONE_MORE_BYTE (c);
1989                       c -= 0x80;
1990                       if (c < 0)
1991                         goto invalid_code;
1992                     }
1993                   else
                         /* Leading code stored as LEADING_CODE+0x20.  */
1994                     c -= 0x20;
1995                   mseq_found = 1;
1996                 }
1997               else
1998                 {
                       /* Not expecting a character, so this byte is a
                          rule: hand it back negated without decoding.  */
1999                   *nbytes = src - src_base;
2000                   *nchars = consumed_chars;
2001                   return -c;
2002                 }
2003             }
2004           else
2005             goto invalid_code;
2006         }
2007
           /* Dispatch on the total length of the sequence introduced by
              the leading byte C, collecting the charset ID and the code
              point within that charset.  */
2008       switch (emacs_mule_bytes[c])
2009         {
2010         case 2:
               /* Leading code selects the charset; one position code.  */
2011           if ((charset_ID = emacs_mule_charset[c]) < 0)
2012             goto invalid_code;
2013           ONE_MORE_BYTE (c);
2014           if (c < 0xA0)
2015             goto invalid_code;
2016           code = c & 0x7F;
2017           break;
2018
2019         case 3:
2020           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2021               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2022             {
                   /* Private leading code: the next byte selects the
                      charset and is followed by one position code.  */
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2025                 goto invalid_code;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code = c & 0x7F;
2030             }
2031           else
2032             {
                   /* Leading code selects the charset; two position
                      codes, high byte first.  */
2033               if ((charset_ID = emacs_mule_charset[c]) < 0)
2034                 goto invalid_code;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code = (c & 0x7F) << 8;
2039               ONE_MORE_BYTE (c);
2040               if (c < 0xA0)
2041                 goto invalid_code;
2042               code |= c & 0x7F;
2043             }
2044           break;
2045
2046         case 4:
               /* The byte after the leading code selects the charset;
                  two position codes follow, high byte first.  */
2047           ONE_MORE_BYTE (c);
2048           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2049             goto invalid_code;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code = (c & 0x7F) << 8;
2054           ONE_MORE_BYTE (c);
2055           if (c < 0xA0)
2056             goto invalid_code;
2057           code |= c & 0x7F;
2058           break;
2059
2060         case 1:
               /* A single byte stands for itself.  */
2061           code = c;
2062           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2063           break;
2064
2065         default:
               /* Any other table entry is treated as a fatal internal
                  error.  */
2066           emacs_abort ();
2067         }
           /* Store the decoding result in C; a negative result is
              treated as an invalid code below.  */
2068       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2069                           CHARSET_FROM_ID (charset_ID), code, c);
2070       if (c < 0)
2071         goto invalid_code;
2072     }
2073   *nbytes = src - src_base;
2074   *nchars = consumed_chars;
2075   if (id)
2076     *id = charset_ID;
2077   return (mseq_found ? -c : c);
2078
       /* NOTE(review): presumably reached from ONE_MORE_BYTE when SRC
          runs out.  */
2079  no_more_source:
2080   return -2;
2081
2082  invalid_code:
2083   return -1;
2084 }
2085
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2088
2089 /* Handle these composition sequences ('|': the end of header elements,
2090    BYTES and CHARS >= 0xA0):
2091
2092    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2093    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2094    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2095
2096    and these old form:
2097
2098    (4) relative composition: 0x80 | MSEQ ... MSEQ
2099    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2100
2101    When the starter 0x80 and the following header elements are found,
2102    this annotation header is produced.
2103
2104         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2105
2106    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108
2109    Then, upon reading the following elements, these codes are produced
2110    until the composition end is found:
2111
2112    (1) CHAR ... CHAR
2113    (2) ALT ... ALT CHAR ... CHAR
2114    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2115    (4) CHAR ... CHAR
2116    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2117
2118    When the composition end is found, LENGTH and NCHARS in the
2119    annotation header are updated as below:
2120
2121    (1) LENGTH: unchanged, NCHARS: unchanged
2122    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2125    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2126
2127    If an error is found while composing, the annotation header is
2128    changed to the original composition header (plus filler -1s) as
2129    below:
2130
2131    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2132    (5)          [ 0x80 0xFF -1 -1- -1 ]
2133
2134    and the sequence [ -2 DECODED-RULE ] is changed to the original
2135    byte sequence as below:
2136         o the original byte sequence is B: [ B -1 ]
2137         o the original byte sequence is B1 B2: [ B1 B2 ]
2138
2139    Most of the routines are implemented by macros because many
2140    variables and labels in the caller decode_coding_emacs_mule must be
2141    accessible, and they are usually called just once (thus they do
2142    not increase the size of the compiled object).  */
2143
2144 /* Decode a composition rule represented by C as a component of
2145    composition sequence of Emacs 20 style.  Set RULE to the decoded
2146    rule.

        C is modified in place: 0xA0 is subtracted, and a result outside
        0..80 jumps to the caller's `invalid_code' label.  The quotient
        and the remainder of that value by 9 become the two reference
        points passed to COMPOSITION_ENCODE_RULE, each with the value 4
        replaced by 10.  */
2147
2148 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2149   do {                                                  \
2150     int gref, nref;                                     \
2151                                                         \
2152     c -= 0xA0;                                          \
2153     if (c < 0 || c >= 81)                               \
2154       goto invalid_code;                                \
2155     gref = c / 9, nref = c % 9;                         \
2156     if (gref == 4) gref = 10;                           \
2157     if (nref == 4) nref = 10;                           \
2158     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2159   } while (0)
2160
2161
2162 /* Decode a composition rule represented by C and the following byte
2163    at SRC as a component of composition sequence of Emacs 21 style.
2164    Set RULE to the decoded rule.

        Each of the two bytes carries one reference point offset by
        0x20.  The second byte is read with ONE_MORE_BYTE, which
        overwrites C.  If either value, after subtracting 0x20, falls
        outside 0..80, control jumps to the caller's `invalid_code'
        label.  */
2165
2166 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2167   do {                                                  \
2168     int gref, nref;                                     \
2169                                                         \
2170     gref = c - 0x20;                                    \
2171     if (gref < 0 || gref >= 81)                         \
2172       goto invalid_code;                                \
2173     ONE_MORE_BYTE (c);                                  \
2174     nref = c - 0x20;                                    \
2175     if (nref < 0 || nref >= 81)                         \
2176       goto invalid_code;                                \
2177     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2178   } while (0)
2179
2180
2181 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2182    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2183    byte length of this composition information, CHARS is the number of
2184    characters composed by this composition.

        More precisely, as the code below reads them: on entry the
        caller's C already holds the method byte, from which 0xF2 is
        subtracted to get the composition method; the next two bytes are
        then read from the source, and 0xA0 is subtracted from each to
        get the byte length and the character count.

        Control jumps to the caller's `invalid_code' label if the byte
        length is below 3, if the method is relative and the byte length
        is not 4, or if the character count is not within
        1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise CMP_STATUS is set up
        for a new-form composition and a composition annotation header is
        appended to CHARBUF.  */
2185
2186 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2187   do {                                                                  \
2188     enum composition_method method = c - 0xF2;                          \
2189     int nbytes, nchars;                                                 \
2190                                                                         \
2191     ONE_MORE_BYTE (c);                                                  \
2192     if (c < 0)                                                          \
2193       goto invalid_code;                                                \
2194     nbytes = c - 0xA0;                                                  \
2195     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2196       goto invalid_code;                                                \
2197     ONE_MORE_BYTE (c);                                                  \
2198     nchars = c - 0xA0;                                                  \
2199     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2200       goto invalid_code;                                                \
2201     cmp_status->old_form = 0;                                           \
2202     cmp_status->method = method;                                        \
2203     if (method == COMPOSITION_RELATIVE)                                 \
2204       cmp_status->state = COMPOSING_CHAR;                               \
2205     else                                                                \
2206       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2207     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2208     cmp_status->nchars = nchars;                                        \
2209     cmp_status->ncomps = nbytes - 4;                                    \
2210     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2211   } while (0)
2212
2213
2214 /* Start of Emacs 20 style format for relative composition.

        Marks CMP_STATUS as an old-form relative composition that expects
        a character next, with zero characters and components counted so
        far, and appends to CHARBUF a composition annotation header whose
        character and byte counts are both 0.  */
2215
2216 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2217   do {                                                          \
2218     cmp_status->old_form = 1;                                   \
2219     cmp_status->method = COMPOSITION_RELATIVE;                  \
2220     cmp_status->state = COMPOSING_CHAR;                         \
2221     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2222     cmp_status->nchars = cmp_status->ncomps = 0;                \
2223     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2224   } while (0)
2225
2226
2227 /* Start of Emacs 20 style format for rule-base composition.

        Marks CMP_STATUS as an old-form composition with rules
        (COMPOSITION_WITH_RULE) that expects a character next, with zero
        characters and components counted so far, and appends to CHARBUF
        a composition annotation header whose character and byte counts
        are both 0.  */
2228
2229 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2230   do {                                                          \
2231     cmp_status->old_form = 1;                                   \
2232     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2233     cmp_status->state = COMPOSING_CHAR;                         \
2234     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2235     cmp_status->nchars = cmp_status->ncomps = 0;                \
2236     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2237   } while (0)
2238
2239
/* Read the byte that follows a composition starter and begin the
   matching kind of composition, using the caller's C, SRC, CHARBUF and
   CMP_STATUS:

     - 0xF2 plus a composition method: Emacs 21 style header;
     - 0xA0..0xBF: Emacs 20 style relative composition; the byte is
       itself the first component, so SRC is moved back to re-read it;
     - 0xFF: Emacs 20 style rule-base composition.

   Any other byte (including a negative value) jumps to the caller's
   `invalid_code' label.  */
2240 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2241   do {                                                  \
2242     const unsigned char *current_src = src;             \
2243                                                         \
2244     ONE_MORE_BYTE (c);                                  \
2245     if (c < 0)                                          \
2246       goto invalid_code;                                \
2247     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2248         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2249       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2250     else if (c < 0xA0)                                  \
2251       goto invalid_code;                                \
2252     else if (c < 0xC0)                                  \
2253       {                                                 \
2254         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2255         /* Re-read C as a composition component.  */    \
2256         src = current_src;                              \
2257       }                                                 \
2258     else if (c == 0xFF)                                 \
2259       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2260     else                                                \
2261       goto invalid_code;                                \
2262   } while (0)
2263
/* Finish the composition being decoded and patch its annotation
   header, which starts CMP_STATUS->length slots before the caller's
   CHARBUF.  For an old-form composition, the slot two past the start
   of the header receives CMP_STATUS->nchars.  For a new-form
   composition whose method is beyond COMPOSITION_RELATIVE, the first
   header slot is set to that same slot's value minus
   CMP_STATUS->length.  In all cases the state becomes COMPOSING_NO.  */
2264 #define EMACS_MULE_COMPOSITION_END()                            \
2265   do {                                                          \
2266     int idx = - cmp_status->length;                             \
2267                                                                 \
2268     if (cmp_status->old_form)                                   \
2269       charbuf[idx + 2] = cmp_status->nchars;                    \
2270     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2271       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2272     cmp_status->state = COMPOSING_NO;                           \
2273   } while (0)
2274
2275
2276 static int
2277 emacs_mule_finish_composition (int *charbuf,
2278                                struct composition_status *cmp_status)
2279 {
2280   int idx = - cmp_status->length;
2281   int new_chars;
2282
2283   if (cmp_status->old_form && cmp_status->nchars > 0)
2284     {
2285       charbuf[idx + 2] = cmp_status->nchars;
2286       new_chars = 0;
2287       if (cmp_status->method == COMPOSITION_WITH_RULE
2288           && cmp_status->state == COMPOSING_CHAR)
2289         {
2290           /* The last rule was invalid.  */
2291           int rule = charbuf[-1] + 0xA0;
2292
2293           charbuf[-2] = BYTE8_TO_CHAR (rule);
2294           charbuf[-1] = -1;
2295           new_chars = 1;
2296         }
2297     }
2298   else
2299     {
2300       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2301
2302       if (cmp_status->method == COMPOSITION_WITH_RULE)
2303         {
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2305           charbuf[idx++] = -3;
2306           charbuf[idx++] = 0;
2307           new_chars = 1;
2308         }
2309       else
2310         {
2311           int nchars = charbuf[idx + 1] + 0xA0;
2312           int nbytes = charbuf[idx + 2] + 0xA0;
2313
2314           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2317           charbuf[idx++] = -1;
2318           new_chars = 4;
2319         }
2320     }
2321   cmp_status->state = COMPOSING_NO;
2322   return new_chars;
2323 }
2324
/* If a composition is in progress (the state in CMP_STATUS is not
   COMPOSING_NO), terminate it with emacs_mule_finish_composition and
   add the value that function returns to the caller's CHAR_OFFSET.  */
2325 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2326   do {                                                                    \
2327     if (cmp_status->state != COMPOSING_NO)                                \
2328       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2329   } while (0)
2330
2331
2332 static void
2333 decode_coding_emacs_mule (struct coding_system *coding)
2334 {
2335   const unsigned char *src = coding->source + coding->consumed;
2336   const unsigned char *src_end = coding->source + coding->src_bytes;
2337   const unsigned char *src_base;
2338   int *charbuf = coding->charbuf + coding->charbuf_used;
2339   /* We may produce two annotations (charset and composition) in one
2340      loop and one more charset annotation at the end.  */
2341   int *charbuf_end
2342     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2343       /* We can produce up to 2 characters in a loop.  */
2344       - 1;
2345   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2346   bool multibytep = coding->src_multibyte;
2347   ptrdiff_t char_offset = coding->produced_char;
2348   ptrdiff_t last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   bool eol_dos
2351     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   if (cmp_status->state != COMPOSING_NO)
2356     {
2357       int i;
2358
2359       if (charbuf_end - charbuf < cmp_status->length)
2360         emacs_abort ();
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c;
2369       int id UNINIT;
2370
2371       src_base = src;
2372       consumed_chars_base = consumed_chars;
2373
2374       if (charbuf >= charbuf_end)
2375         {
2376           if (byte_after_cr >= 0)
2377             src_base--;
2378           break;
2379         }
2380
2381       if (byte_after_cr >= 0)
2382         c = byte_after_cr, byte_after_cr = -1;
2383       else
2384         ONE_MORE_BYTE (c);
2385
2386       if (c < 0 || c == 0x80)
2387         {
2388           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2389           if (c < 0)
2390             {
2391               *charbuf++ = -c;
2392               char_offset++;
2393             }
2394           else
2395             DECODE_EMACS_MULE_COMPOSITION_START ();
2396           continue;
2397         }
2398
2399       if (c < 0x80)
2400         {
2401           if (eol_dos && c == '\r')
2402             ONE_MORE_BYTE (byte_after_cr);
2403           id = charset_ascii;
2404           if (cmp_status->state != COMPOSING_NO)
2405             {
2406               if (cmp_status->old_form)
2407                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2409                 cmp_status->ncomps--;
2410             }
2411         }
2412       else
2413         {
2414           int nchars UNINIT, nbytes UNINIT;
2415           /* emacs_mule_char can load a charset map from a file, which
2416              allocates a large structure and might cause buffer text
2417              to be relocated as result.  Thus, we need to remember the
2418              original pointer to buffer text, and fix up all related
2419              pointers after the call.  */
2420           const unsigned char *orig = coding->source;
2421           ptrdiff_t offset;
2422
2423           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2424                                cmp_status);
2425           offset = coding->source - orig;
2426           if (offset)
2427             {
2428               src += offset;
2429               src_base += offset;
2430               src_end += offset;
2431             }
2432           if (c < 0)
2433             {
2434               if (c == -1)
2435                 goto invalid_code;
2436               if (c == -2)
2437                 break;
2438             }
2439           src = src_base + nbytes;
2440           consumed_chars = consumed_chars_base + nchars;
2441           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2442             cmp_status->ncomps -= nchars;
2443         }
2444
2445       /* Now if C >= 0, we found a normally encoded character, if C <
2446          0, we found an old-style composition component character or
2447          rule.  */
2448
2449       if (cmp_status->state == COMPOSING_NO)
2450         {
2451           if (last_id != id)
2452             {
2453               if (last_id != charset_ascii)
2454                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2455                                   last_id);
2456               last_id = id;
2457               last_offset = char_offset;
2458             }
2459           *charbuf++ = c;
2460           char_offset++;
2461         }
2462       else if (cmp_status->state == COMPOSING_CHAR)
2463         {
2464           if (cmp_status->old_form)
2465             {
2466               if (c >= 0)
2467                 {
2468                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2469                   *charbuf++ = c;
2470                   char_offset++;
2471                 }
2472               else
2473                 {
2474                   *charbuf++ = -c;
2475                   cmp_status->nchars++;
2476                   cmp_status->length++;
2477                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2478                     EMACS_MULE_COMPOSITION_END ();
2479                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2480                     cmp_status->state = COMPOSING_RULE;
2481                 }
2482             }
2483           else
2484             {
2485               *charbuf++ = c;
2486               cmp_status->length++;
2487               cmp_status->nchars--;
2488               if (cmp_status->nchars == 0)
2489                 EMACS_MULE_COMPOSITION_END ();
2490             }
2491         }
2492       else if (cmp_status->state == COMPOSING_RULE)
2493         {
2494           int rule;
2495
2496           if (c >= 0)
2497             {
2498               EMACS_MULE_COMPOSITION_END ();
2499               *charbuf++ = c;
2500               char_offset++;
2501             }
2502           else
2503             {
2504               c = -c;
2505               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2506               if (rule < 0)
2507                 goto invalid_code;
2508               *charbuf++ = -2;
2509               *charbuf++ = rule;
2510               cmp_status->length += 2;
2511               cmp_status->state = COMPOSING_CHAR;
2512             }
2513         }
2514       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2515         {
2516           *charbuf++ = c;
2517           cmp_status->length++;
2518           if (cmp_status->ncomps == 0)
2519             cmp_status->state = COMPOSING_CHAR;
2520           else if (cmp_status->ncomps > 0)
2521             {
2522               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2523                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2524             }
2525           else
2526             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2527         }
2528       else                      /* COMPOSING_COMPONENT_RULE */
2529         {
2530           int rule;
2531
2532           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2533           if (rule < 0)
2534             goto invalid_code;
2535           *charbuf++ = -2;
2536           *charbuf++ = rule;
2537           cmp_status->length += 2;
2538           cmp_status->ncomps--;
2539           if (cmp_status->ncomps > 0)
2540             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2541           else
2542             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543         }
2544       continue;
2545
2546     invalid_code:
2547       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2548       src = src_base;
2549       consumed_chars = consumed_chars_base;
2550       ONE_MORE_BYTE (c);
2551       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2552       char_offset++;
2553     }
2554
2555  no_more_source:
2556   if (cmp_status->state != COMPOSING_NO)
2557     {
2558       if (coding->mode & CODING_MODE_LAST_BLOCK)
2559         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560       else
2561         {
2562           int i;
2563
2564           charbuf -= cmp_status->length;
2565           for (i = 0; i < cmp_status->length; i++)
2566             cmp_status->carryover[i] = charbuf[i];
2567         }
2568     }
2569   if (last_id != charset_ascii)
2570     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2571   coding->consumed_char += consumed_chars_base;
2572   coding->consumed = src_base - coding->source;
2573   coding->charbuf_used = charbuf - coding->charbuf;
2574 }
2575
2576
/* Store in CODES[0] and CODES[1] the leading code(s) that introduce a
   character of the charset whose emacs-mule id is ID.

   If ID is below 0xA0, ID itself is the only leading code and
   CODES[1] is set to 0.  Otherwise CODES[1] is ID and CODES[0] is
   0x9A (ID < 0xE0), 0x9B (ID < 0xF0), 0x9C (ID < 0xF5) or 0x9D
   (anything else).

   The expansion is a single statement with no trailing semicolon; the
   caller supplies it, so the macro can be the body of an unbraced
   `if' ... `else'.  Both arguments are parenthesized in the
   expansion, but they are evaluated more than once, so they must not
   have side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2590
2591
/* Encode the characters and annotations held in CODING->charbuf
   (CODING->charbuf_used elements) into emacs-mule byte sequences,
   writing them starting at CODING->destination + CODING->produced.

   An ASCII character is emitted as one byte, and a character
   satisfying CHAR_BYTE8_P is emitted as the byte CHAR_TO_BYTE8 gives.
   Any other character is emitted as the leading code(s) chosen by
   EMACS_MULE_LEADING_CODES from its charset's emacs-mule id, followed
   by its one- or two-byte code with the 0x80 bit set in each byte.
   When no charset is found for a character, CODING->default_char is
   encoded in its place.

   Before returning, CODING_RESULT_SUCCESS is recorded and
   CODING->produced_char and CODING->produced are updated.  The
   function always returns 0.  */
2592 static bool
2593 encode_coding_emacs_mule (struct coding_system *coding)
2594 {
2595   bool multibytep = coding->dst_multibyte;
2596   int *charbuf = coding->charbuf;
2597   int *charbuf_end = charbuf + coding->charbuf_used;
2598   unsigned char *dst = coding->destination + coding->produced;
2599   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Byte count handed to ASSURE_DESTINATION before each element of
          CHARBUF is processed.  */
2600   int safe_room = 8;
2601   ptrdiff_t produced_chars = 0;
2602   Lisp_Object attrs, charset_list;
2603   int c;
       /* Charset id taken from the most recent charset annotation; -1
          when there is none or when that id is not in CHARSET_LIST.  */
2604   int preferred_charset_id = -1;
2605
2606   CODING_GET_INFO (coding, attrs, charset_list);
       /* Make the coding system's charset list be
          Vemacs_mule_charset_list, updating ATTRS if it was not.  */
2607   if (! EQ (charset_list, Vemacs_mule_charset_list))
2608     {
2609       charset_list = Vemacs_mule_charset_list;
2610       ASET (attrs, coding_attr_charset_list, charset_list);
2611     }
2612
2613   while (charbuf < charbuf_end)
2614     {
2615       ASSURE_DESTINATION (safe_room);
2616       c = *charbuf++;
2617
2618       if (c < 0)
2619         {
2620           /* Handle an annotation.  */
2621           switch (*charbuf)
2622             {
2623             case CODING_ANNOTATE_COMPOSITION_MASK:
2624               /* Not yet implemented.  */
2625               break;
2626             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): the id is read three elements past the
                      annotation's mask word; confirm this offset against
                      the layout that ADD_CHARSET_DATA produces.  */
2627               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that this coding system
                      does not list.  */
2628               if (preferred_charset_id >= 0
2629                   && NILP (Fmemq (make_number (preferred_charset_id),
2630                                   charset_list)))
2631                 preferred_charset_id = -1;
2632               break;
2633             default:
2634               emacs_abort ();
2635             }
               /* One element of the annotation has been consumed already;
                  step over the remaining -C - 1 elements.  */
2636           charbuf += -c - 1;
2637           continue;
2638         }
2639
2640       if (ASCII_CHAR_P (c))
2641         EMIT_ONE_ASCII_BYTE (c);
2642       else if (CHAR_BYTE8_P (c))
2643         {
2644           c = CHAR_TO_BYTE8 (c);
2645           EMIT_ONE_BYTE (c);
2646         }
2647       else
2648         {
2649           struct charset *charset;
2650           unsigned code;
2651           int dimension;
2652           int emacs_mule_id;
2653           unsigned char leading_codes[2];
2654
               /* Try the preferred charset first; otherwise, or if it
                  cannot encode C, search CHARSET_LIST.  */
2655           if (preferred_charset_id >= 0)
2656             {
2657               bool result;
2658
2659               charset = CHARSET_FROM_ID (preferred_charset_id);
2660               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2661               if (result)
2662                 code = ENCODE_CHAR (charset, c);
2663               else
2664                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2665                                      &code, charset);
2666             }
2667           else
2668             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669                                  &code, charset);
2670           if (! charset)
2671             {
                   /* No charset was found for C: encode the coding
                      system's default character instead.
                      NOTE(review): CHARSET is not re-checked after the
                      second lookup below; confirm it cannot be null.  */
2672               c = coding->default_char;
2673               if (ASCII_CHAR_P (c))
2674                 {
2675                   EMIT_ONE_ASCII_BYTE (c);
2676                   continue;
2677                 }
2678               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2679                                    &code, charset);
2680             }
2681           dimension = CHARSET_DIMENSION (charset);
2682           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2683           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2684           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is only one.  */
2685           if (leading_codes[1])
2686             EMIT_ONE_BYTE (leading_codes[1]);
               /* Emit the code with the 0x80 bit set in every byte: one
                  byte for a dimension-1 charset, two bytes (high byte
                  first) otherwise.  */
2687           if (dimension == 1)
2688             EMIT_ONE_BYTE (code | 0x80);
2689           else
2690             {
2691               code |= 0x8080;
2692               EMIT_ONE_BYTE (code >> 8);
2693               EMIT_ONE_BYTE (code & 0xFF);
2694             }
2695         }
2696     }
2697   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2698   coding->produced_char += produced_chars;
2699   coding->produced = dst - coding->destination;
2700   return 0;
2701 }
2702
2703 \f
2704 /*** 7. ISO2022 handlers ***/
2705
2706 /* The following note describes the coding system ISO2022 briefly.
2707    Since the intention of this note is to help understand the
2708    functions in this file, some parts are NOT ACCURATE or are OVERLY
2709    SIMPLIFIED.  For thorough understanding, please refer to the
2710    original document of ISO2022.  This is equivalent to the standard
2711    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2712
2713    ISO2022 provides many mechanisms to encode several character sets
2714    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2715    is encoded using bytes less than 128.  This may make the encoded
2716    text a little bit longer, but the text passes more easily through
2717    several types of gateway, some of which strip off the MSB (Most
2718    Significant Bit).
2719
2720    There are two kinds of character sets: control character sets and
2721    graphic character sets.  The former contain control characters such
2722    as `newline' and `escape' to provide control functions (control
2723    functions are also provided by escape sequences).  The latter
2724    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2725    two control character sets and many graphic character sets.
2726
2727    Graphic character sets are classified into one of the following
2728    four classes, according to the number of bytes (DIMENSION) and
2729    number of characters in one dimension (CHARS) of the set:
2730    - DIMENSION1_CHARS94
2731    - DIMENSION1_CHARS96
2732    - DIMENSION2_CHARS94
2733    - DIMENSION2_CHARS96
2734
2735    In addition, each character set is assigned an identification tag,
2736    unique for each set, called the "final character" (denoted as <F>
2737    hereafter).  The <F> of each character set is decided by ECMA(*)
2738    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2739    (0x30..0x3F are for private use only).
2740
2741    Note (*): ECMA = European Computer Manufacturers Association
2742
2743    Here are examples of graphic character sets [NAME(<F>)]:
2744         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2745         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2746         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2747         o DIMENSION2_CHARS96 -- none for the moment
2748
2749    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2750         C0 [0x00..0x1F] -- control character plane 0
2751         GL [0x20..0x7F] -- graphic character plane 0
2752         C1 [0x80..0x9F] -- control character plane 1
2753         GR [0xA0..0xFF] -- graphic character plane 1
2754
2755    A control character set is directly designated and invoked to C0 or
2756    C1 by an escape sequence.  The most common case is that:
2757    - ISO646's  control character set is designated/invoked to C0, and
2758    - ISO6429's control character set is designated/invoked to C1,
2759    and usually these designations/invocations are omitted in encoded
2760    text.  In a 7-bit environment, only C0 can be used, and a control
2761    character for C1 is encoded by an appropriate escape sequence to
2762    fit into the environment.  All control characters for C1 are
2763    defined to have corresponding escape sequences.
2764
2765    A graphic character set is at first designated to one of four
2766    graphic registers (G0 through G3), then these graphic registers are
2767    invoked to GL or GR.  These designations and invocations can be
2768    done independently.  The most common case is that G0 is invoked to
2769    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2770    these invocations and designations are omitted in encoded text.
2771    In a 7-bit environment, only GL can be used.
2772
2773    When a graphic character set of CHARS94 is invoked to GL, codes
2774    0x20 and 0x7F of the GL area work as control characters SPACE and
2775    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2776    be used.
2777
2778    There are two ways of invocation: locking-shift and single-shift.
2779    With locking-shift, the invocation lasts until the next different
2780    invocation, whereas with single-shift, the invocation affects the
2781    following character only and doesn't affect the locking-shift
2782    state.  Invocations are done by the following control characters or
2783    escape sequences:
2784
2785    ----------------------------------------------------------------------
2786    abbrev  function                  cntrl escape seq   description
2787    ----------------------------------------------------------------------
2788    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2789    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2790    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2791    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2792    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2793    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2794    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2795    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2796    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2797    ----------------------------------------------------------------------
2798    (*) These are not used by any known coding system.
2799
2800    Control characters for these functions are defined by macros
2801    ISO_CODE_XXX in `coding.h'.
2802
2803    Designations are done by the following escape sequences:
2804    ----------------------------------------------------------------------
2805    escape sequence      description
2806    ----------------------------------------------------------------------
2807    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2808    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2809    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2810    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2811    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2812    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2813    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2814    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2815    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2816    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2817    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2818    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2819    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2820    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2821    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2822    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2823    ----------------------------------------------------------------------
2824
2825    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2826    of dimension 1, chars 94, and final character <F>, etc...
2827
2828    Note (*): Although these designations are not allowed in ISO2022,
2829    Emacs accepts them on decoding, and produces them on encoding
2830    CHARS96 character sets in a coding system which is characterized as
2831    7-bit environment, non-locking-shift, and non-single-shift.
2832
2833    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2834    '(' must be omitted.  We refer to this as "short-form" hereafter.
2835
2836    Now you may notice that there are a lot of ways of encoding the
2837    same multilingual text in ISO2022.  Actually, there exist many
2838    coding systems such as Compound Text (used in X11's inter client
2839    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2840    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2841    localized platforms), and all of these are variants of ISO2022.
2842
2843    In addition to the above, Emacs handles two more kinds of escape
2844    sequences: ISO6429's direction specification and Emacs' private
2845    sequence for specifying character composition.
2846
2847    ISO6429's direction specification takes the following form:
2848         o CSI ']'      -- end of the current direction
2849         o CSI '0' ']'  -- end of the current direction
2850         o CSI '1' ']'  -- start of left-to-right text
2851         o CSI '2' ']'  -- start of right-to-left text
2852    The control character CSI (0x9B: control sequence introducer) is
2853    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2854
2855    Character composition specification takes the following form:
2856         o ESC '0' -- start relative composition
2857         o ESC '1' -- end composition
2858         o ESC '2' -- start rule-base composition (*)
2859         o ESC '3' -- start relative composition with alternate chars  (**)
2860         o ESC '4' -- start rule-base composition with alternate chars  (**)
2861   Since these are not standard escape sequences of any ISO standard,
2862   the use of them with these meanings is restricted to Emacs only.
2863
2864   (*) This form is used only in Emacs 20.7 and older versions,
2865   but newer versions can safely decode it.
2866   (**) This form is used only in Emacs 21.1 and newer versions,
2867   and older versions can't decode it.
2868
2869   Here's a list of example usages of these composition escape
2870   sequences (categorized by `enum composition_method').
2871
2872   COMPOSITION_RELATIVE:
2873         ESC 0 CHAR [ CHAR ] ESC 1
2874   COMPOSITION_WITH_RULE:
2875         ESC 2 CHAR [ RULE CHAR ] ESC 1
2876   COMPOSITION_WITH_ALTCHARS:
2877         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2878   COMPOSITION_WITH_RULE_ALTCHARS:
2879         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2880
/* Table of 256 ISO code classes.  NOTE(review): presumably indexed by
   byte value; confirm against the code that fills it in.  */
2881 static enum iso_code_class_type iso_code_class[256];
2882
/* Return nonzero if the charset whose id is ID is safe for CODING,
   i.e. ID is in the range 0 .. CODING->max_charset_id and its entry
   in CODING->safe_charsets is not 255.

   A negative ID is never safe.  The explicit lower bound matters
   because a caller may hold an unchecked negative id taken from a
   charset lookup table; without it such an id would index before the
   start of CODING->safe_charsets.

   ID is evaluated more than once, so it must not have side
   effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2886
2887 static void
2888 setup_iso_safe_charsets (Lisp_Object attrs)
2889 {
2890   Lisp_Object charset_list, safe_charsets;
2891   Lisp_Object request;
2892   Lisp_Object reg_usage;
2893   Lisp_Object tail;
2894   EMACS_INT reg94, reg96;
2895   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2896   int max_charset_id;
2897
2898   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2899   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2900       && ! EQ (charset_list, Viso_2022_charset_list))
2901     {
2902       charset_list = Viso_2022_charset_list;
2903       ASET (attrs, coding_attr_charset_list, charset_list);
2904       ASET (attrs, coding_attr_safe_charsets, Qnil);
2905     }
2906
2907   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2908     return;
2909
2910   max_charset_id = 0;
2911   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2912     {
2913       int id = XINT (XCAR (tail));
2914       if (max_charset_id < id)
2915         max_charset_id = id;
2916     }
2917
2918   safe_charsets = make_uninit_string (max_charset_id + 1);
2919   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2920   request = AREF (attrs, coding_attr_iso_request);
2921   reg_usage = AREF (attrs, coding_attr_iso_usage);
2922   reg94 = XINT (XCAR (reg_usage));
2923   reg96 = XINT (XCDR (reg_usage));
2924
2925   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2926     {
2927       Lisp_Object id;
2928       Lisp_Object reg;
2929       struct charset *charset;
2930
2931       id = XCAR (tail);
2932       charset = CHARSET_FROM_ID (XINT (id));
2933       reg = Fcdr (Fassq (id, request));
2934       if (! NILP (reg))
2935         SSET (safe_charsets, XINT (id), XINT (reg));
2936       else if (charset->iso_chars_96)
2937         {
2938           if (reg96 < 4)
2939             SSET (safe_charsets, XINT (id), reg96);
2940         }
2941       else
2942         {
2943           if (reg94 < 4)
2944             SSET (safe_charsets, XINT (id), reg94);
2945         }
2946     }
2947   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2948 }
2949
2950
2951 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2952    Return true if a text is encoded in one of ISO-2022 based coding
2953    systems.

        Scanning starts after the first CODING->head_ascii bytes of
        CODING->source.  Categories that are ruled out accumulate in
        REJECTED and candidate categories in FOUND.

        If every ISO category has been rejected, all of
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  When control reaches the `no_more_source' label
        (presumably via ONE_MORE_BYTE once the input runs out), REJECTED
        is merged into DETECT_INFO->rejected, the categories in FOUND
        that are not in REJECTED are merged into DETECT_INFO->found, and
        1 is returned.  */
2954
2955 static bool
2956 detect_coding_iso_2022 (struct coding_system *coding,
2957                         struct coding_detection_info *detect_info)
2958 {
2959   const unsigned char *src = coding->source, *src_base = src;
2960   const unsigned char *src_end = coding->source + coding->src_bytes;
2961   bool multibytep = coding->src_multibyte;
       /* Nonzero right after a single-shift code has been seen.  */
2962   bool single_shifting = 0;
2963   int id;
2964   int c, c1;
2965   ptrdiff_t consumed_chars = 0;
2966   int i;
       /* Masks of the categories ruled out / seen as candidates so far.  */
2967   int rejected = 0;
2968   int found = 0;
       /* -1 outside a composition; otherwise the count accumulated since
          the escape sequence that started the composition.  */
2969   int composition_count = -1;
2970
2971   detect_info->checked |= CATEGORY_MASK_ISO;
2972
       /* For every defined ISO category (id >= 0), point its
          safe_charsets and max_charset_id at the safe-charsets string
          stored in its attributes, recomputing that string first when
          a full-support coding system's charset list is not
          Viso_2022_charset_list.  */
2973   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2974     {
2975       struct coding_system *this = &(coding_categories[i]);
2976       Lisp_Object attrs, val;
2977
2978       if (this->id < 0)
2979         continue;
2980       attrs = CODING_ID_ATTRS (this->id);
2981       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2982           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2983         setup_iso_safe_charsets (attrs);
2984       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2985       this->max_charset_id = SCHARS (val) - 1;
2986       this->safe_charsets = SDATA (val);
2987     }
2988
2989   /* A coding system of this category is always ASCII compatible.  */
2990   src += coding->head_ascii;
2991
       /* Keep scanning until every ISO category has been rejected.  */
2992   while (rejected != CATEGORY_MASK_ISO)
2993     {
2994       src_base = src;
2995       ONE_MORE_BYTE (c);
2996       switch (c)
2997         {
2998         case ISO_CODE_ESC:
2999           if (inhibit_iso_escape_detection)
3000             break;
3001           single_shifting = 0;
3002           ONE_MORE_BYTE (c);
3003           if (c == 'N' || c == 'O')
3004             {
3005               /* ESC <Fe> for SS2 or SS3.  */
3006               single_shifting = 1;
3007               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3008             }
3009           else if (c == '1')
3010             {
3011               /* End of composition.  */
3012               if (composition_count < 0
3013                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3014                 /* Invalid */
3015                 break;
3016               composition_count = -1;
3017               found |= CATEGORY_MASK_ISO;
3018             }
3019           else if (c >= '0' && c <= '4')
3020             {
3021               /* ESC <Fp> starting a composition ('1', the end of a
                      composition, is handled above).  */
3022               composition_count = 0;
3023             }
3024           else
3025             {
3026               if (c >= '(' && c <= '/')
3027                 {
3028                   /* Designation sequence for a charset of dimension 1.  */
3029                   ONE_MORE_BYTE (c1);
3030                   if (c1 < ' ' || c1 >= 0x80
3031                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3032                     {
3033                       /* Invalid designation sequence.  Just ignore.  */
3034                       if (c1 >= 0x80)
3035                         rejected |= (CATEGORY_MASK_ISO_7BIT
3036                                      | CATEGORY_MASK_ISO_7_ELSE);
3037                       break;
3038                     }
3039                 }
3040               else if (c == '$')
3041                 {
3042                   /* Designation sequence for a charset of dimension 2.  */
3043                   ONE_MORE_BYTE (c);
3044                   if (c >= '@' && c <= 'B')
3045                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3046                     id = iso_charset_table[1][0][c];
3047                   else if (c >= '(' && c <= '/')
3048                     {
3049                       ONE_MORE_BYTE (c1);
3050                       if (c1 < ' ' || c1 >= 0x80
3051                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3052                         {
3053                           /* Invalid designation sequence.  Just ignore.  */
3054                           if (c1 >= 0x80)
3055                             rejected |= (CATEGORY_MASK_ISO_7BIT
3056                                          | CATEGORY_MASK_ISO_7_ELSE);
3057                           break;
3058                         }
3059                     }
3060                   else
3061                     {
3062                       /* Invalid designation sequence.  Just ignore it.  */
3063                       if (c >= 0x80)
3064                         rejected |= (CATEGORY_MASK_ISO_7BIT
3065                                      | CATEGORY_MASK_ISO_7_ELSE);
3066                       break;
3067                     }
3068                 }
3069               else
3070                 {
3071                   /* Invalid escape sequence.  Just ignore it.  */
3072                   if (c >= 0x80)
3073                     rejected |= (CATEGORY_MASK_ISO_7BIT
3074                                  | CATEGORY_MASK_ISO_7_ELSE);
3075                   break;
3076                 }
3077
3078               /* We found a valid designation sequence for CHARSET.  */
                   /* For each category checked below, the designated
                      charset either marks the category as found (charset
                      is safe for it) or rejects it.  */
3079               rejected |= CATEGORY_MASK_ISO_8BIT;
3080               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3081                                   id))
3082                 found |= CATEGORY_MASK_ISO_7;
3083               else
3084                 rejected |= CATEGORY_MASK_ISO_7;
3085               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3086                                   id))
3087                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3088               else
3089                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3090               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3091                                   id))
3092                 found |= CATEGORY_MASK_ISO_7_ELSE;
3093               else
3094                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3095               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3096                                   id))
3097                 found |= CATEGORY_MASK_ISO_8_ELSE;
3098               else
3099                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3100             }
3101           break;
3102
3103         case ISO_CODE_SO:
3104         case ISO_CODE_SI:
3105           /* Locking shift out/in.  */
3106           if (inhibit_iso_escape_detection)
3107             break;
3108           single_shifting = 0;
3109           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3110           break;
3111
3112         case ISO_CODE_CSI:
3113           /* Control sequence introducer.  */
3114           single_shifting = 0;
3115           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3116           found |= CATEGORY_MASK_ISO_8_ELSE;
3117           goto check_extra_latin;
3118
3119         case ISO_CODE_SS2:
3120         case ISO_CODE_SS3:
3121           /* Single shift.   */
3122           if (inhibit_iso_escape_detection)
3123             break;
3124           single_shifting = 0;
3125           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
               /* The code counts as a single shift only for the 8-bit
                  categories whose coding system enables single shift.  */
3126           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3127               & CODING_ISO_FLAG_SINGLE_SHIFT)
3128             {
3129               found |= CATEGORY_MASK_ISO_8_1;
3130               single_shifting = 1;
3131             }
3132           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3133               & CODING_ISO_FLAG_SINGLE_SHIFT)
3134             {
3135               found |= CATEGORY_MASK_ISO_8_2;
3136               single_shifting = 1;
3137             }
3138           if (single_shifting)
3139             break;
3140           goto check_extra_latin;
3141
3142         default:
               /* NOTE(review): a negative C presumably is a value
                  ONE_MORE_BYTE yields for something other than a plain
                  byte; it is skipped here.  Confirm against the macro.  */
3143           if (c < 0)
3144             continue;
3145           if (c < 0x80)
3146             {
3147               if (composition_count >= 0)
3148                 composition_count++;
3149               single_shifting = 0;
3150               break;
3151             }
3152           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3153           if (c >= 0xA0)
3154             {
3155               found |= CATEGORY_MASK_ISO_8_1;
3156               /* Check the length of succeeding codes of the range
3157                  0xA0..0xFF.  If the byte length is even, we include
3158                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3159                  only when we are not single shifting.  */
3160               if (! single_shifting
3161                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3162                 {
3163                   ptrdiff_t len = 1;
3164                   while (src < src_end)
3165                     {
3166                       src_base = src;
3167                       ONE_MORE_BYTE (c);
3168                       if (c < 0xA0)
3169                         {
                               /* Put back the byte that ended the run.  */
3170                           src = src_base;
3171                           break;
3172                         }
3173                       len++;
3174                     }
3175
                       /* An odd-length run that stopped before the end of
                          the source rejects CATEGORY_MASK_ISO_8_2; a run
                          of even length, or one reaching the end of the
                          source, keeps it as a candidate.  */
3176                   if (len & 1 && src < src_end)
3177                     {
3178                       rejected |= CATEGORY_MASK_ISO_8_2;
3179                       if (composition_count >= 0)
3180                         composition_count += len;
3181                     }
3182                   else
3183                     {
3184                       found |= CATEGORY_MASK_ISO_8_2;
3185                       if (composition_count >= 0)
3186                         composition_count += len / 2;
3187                     }
3188                 }
3189               break;
3190             }
               /* Reached with C being CSI, SS2, SS3, or another byte in
                  0x80..0x9F.  Unless Vlatin_extra_code_table is a vector
                  with a non-nil entry for C, every ISO category is
                  rejected.  */
3191         check_extra_latin:
3192           if (! VECTORP (Vlatin_extra_code_table)
3193               || NILP (AREF (Vlatin_extra_code_table, c)))
3194             {
3195               rejected = CATEGORY_MASK_ISO;
3196               break;
3197             }
3198           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199               & CODING_ISO_FLAG_LATIN_EXTRA)
3200             found |= CATEGORY_MASK_ISO_8_1;
3201           else
3202             rejected |= CATEGORY_MASK_ISO_8_1;
3203           rejected |= CATEGORY_MASK_ISO_8_2;
3204           break;
3205         }
3206     }
       /* The loop ended because every ISO category was rejected.  */
3207   detect_info->rejected |= CATEGORY_MASK_ISO;
3208   return 0;
3209
3210  no_more_source:
       /* Report what was learned; a category that was both found and
          rejected counts as rejected.  */
3211   detect_info->rejected |= rejected;
3212   detect_info->found |= (found & ~rejected);
3213   return 1;
3214 }
3215
3216
/* Record in CODING that graphic register REG is designated to the
   charset selected by DIM, CHARS_96 and the final byte FINAL.

   If FINAL is outside '0'..127, or ISO_CHARSET_TABLE has no charset
   for it, or that charset fails SAFE_CHARSET_P for CODING, the
   designation of REG becomes -2 and CHARS_96 is set to -1.
   Otherwise the charset ID becomes the designation of REG, after
   replacing JISX0201-Roman by ASCII when CODING_ISO_FLAG_USE_ROMAN is
   set, or JISX0208.1978 by JISX0208 when CODING_ISO_FLAG_USE_OLDJIS
   is set.

   CHARS_96 must be an lvalue; it is set to -1 whenever the escape
   sequence should be kept.  The macro refers to the variable `coding'
   of the expansion site.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int new_id = -1;                                                    \
    bool acceptable = false;                                            \
                                                                        \
    if ((final) >= '0' && (final) < 128)                                \
      {                                                                 \
        new_id = ISO_CHARSET_TABLE (dim, chars_96, final);              \
        acceptable = new_id >= 0 && SAFE_CHARSET_P (coding, new_id);    \
      }                                                                 \
    if (! acceptable)                                                   \
      {                                                                 \
        /* Remember that REG received an invalid designation.  */       \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int old_id = CODING_ISO_DESIGNATION (coding, reg);              \
                                                                        \
        if (new_id == charset_jisx0201_roman)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              new_id = charset_ascii;                                   \
          }                                                             \
        else if (new_id == charset_jisx0208_1978                        \
                 && (CODING_ISO_FLAGS (coding)                          \
                     & CODING_ISO_FLAG_USE_OLDJIS))                     \
          new_id = charset_jisx0208;                                    \
        CODING_ISO_DESIGNATION (coding, reg) = new_id;                  \
        /* An ASCII designation that follows an invalid designation */  \
        /* to the same register must itself be kept.  */                \
        if (old_id == -2 && new_id == charset_ascii)                    \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
3249
3250
3251 /* Handle these composition sequences (ALT: alternate char):
3252
3253    (1) relative composition: ESC 0 CHAR ... ESC 1
3254    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3257
3258    When the start sequence (ESC 0/2/3/4) is found, this annotation
3259    header is produced.
3260
3261         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3262
3263    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264    produced until the end sequence (ESC 1) is found:
3265
3266    (1) CHAR ... CHAR
3267    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3270
3271    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3272    annotation header are updated as below:
3273
3274    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3276    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3277    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3278
3279    If an error is found while composing, the annotation header is
3280    changed to:
3281
3282         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3283
3284    and the sequence [ -2 DECODED-RULE ] is changed to the original
3285    byte sequence as below:
3286         o the original byte sequence is B: [ B -1 ]
3287         o the original byte sequence is B1 B2: [ B1 B2 ]
3288    and the sequence [ -1 -1 ] is changed to the original byte
3289    sequence:
3290         [ ESC '0' ]
3291 */
3292
/* Decode the composition rule whose first byte is in the variable
   `c1' of the expansion site, and store the encoded rule in the
   lvalue RULE.

   When C1 - 32 is below 81 the rule is in the old one-byte format
   (before ver.21).  Otherwise one more byte is fetched with
   ONE_MORE_BYTE and the two bytes form a rule in the new format
   (after ver.21); 0x100 is added to the result so that the two
   formats can be told apart later.

   If the rule is invalid, goto invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second_byte, gref, nref;                                    \
                                                                        \
        ONE_MORE_BYTE (second_byte);                                    \
        gref = rule - 81;                                               \
        nref = second_byte - 32;                                        \
        if (! COMPOSITION_ENCODE_RULE_VALID (gref, nref))               \
          goto invalid_code;                                            \
        /* The 0x100 offset distinguishes it from the old format.  */   \
        rule = COMPOSITION_ENCODE_RULE (gref, nref) + 0x100;            \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = rule / 9;                                            \
        int nref = rule % 9;                                            \
                                                                        \
        gref = gref == 4 ? 10 : gref;                                   \
        nref = nref == 4 ? 10 : nref;                                   \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
3321
/* Turn the encoded composition rule RULE back into the byte(s) it was
   decoded from.  The bytes are stored at charbuf[idx] and
   charbuf[idx + 1] and counted in new_chars; all three are variables
   of the expansion site.

   RULE below 0x100 is in the old one-byte format: the byte goes to
   charbuf[idx], charbuf[idx + 1] is set to -1, and new_chars grows by
   one.  Otherwise RULE is in the new two-byte format and new_chars
   grows by two.  RULE is read before either cell is written, so it
   may be charbuf[idx + 1] itself.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int encoded = (rule);                                       \
    int low_part = encoded % 0x100;                             \
    int gref = low_part / 12;                                   \
    int nref = low_part % 12;                                   \
                                                                \
    if (encoded >= 0x100)       /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
    else                        /* old format */                \
      {                                                         \
        gref = gref == 10 ? 4 : gref;                           \
        nref = nref == 10 ? 4 : nref;                           \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
  } while (0)
3341
3342 /* Finish the current composition as invalid.  */
3343
3344 static int
3345 finish_composition (int *charbuf, struct composition_status *cmp_status)
3346 {
3347   int idx = - cmp_status->length;
3348   int new_chars;
3349
3350   /* Recover the original ESC sequence */
3351   charbuf[idx++] = ISO_CODE_ESC;
3352   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3353                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3354                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3355                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3356                     : '4');
3357   charbuf[idx++] = -2;
3358   charbuf[idx++] = 0;
3359   charbuf[idx++] = -1;
3360   new_chars = cmp_status->nchars;
3361   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3362     for (; idx < 0; idx++)
3363       {
3364         int elt = charbuf[idx];
3365
3366         if (elt == -2)
3367           {
3368             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3369             idx++;
3370           }
3371         else if (elt == -1)
3372           {
3373             charbuf[idx++] = ISO_CODE_ESC;
3374             charbuf[idx] = '0';
3375             new_chars += 2;
3376           }
3377       }
3378   cmp_status->state = COMPOSING_NO;
3379   return new_chars;
3380 }
3381
/* Do nothing when no composition is in progress (state
   COMPOSING_NO).  Otherwise abandon the current composition through
   finish_composition and add the value it returns to char_offset.
   Uses the variables cmp_status, charbuf and char_offset of the
   expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3388
/* Handle the composition start sequences ESC 0, ESC 2, ESC 3 and
   ESC 4, where C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives in state COMPOSING_COMPONENT_CHAR of a
   COMPOSITION_WITH_ALTCHARS composition, or in state
   COMPOSING_COMPONENT_RULE of a COMPOSITION_WITH_RULE_ALTCHARS
   composition, ends the component part: two -1 cells are stored and
   the state becomes COMPOSING_CHAR.

   In every other case any composition still in progress is abandoned
   with MAYBE_FINISH_COMPOSITION and a new one is started:
   ADD_COMPOSITION_DATA stores the annotation header for the method
   selected by C1, cmp_status->length is reset to
   MAX_ANNOTATION_LENGTH, the char and component counts are cleared,
   and coding->annotated is set.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    bool ends_components                                                   \
      = ((c1) == '0'                                                       \
         && ((cmp_status->method == COMPOSITION_WITH_ALTCHARS              \
              && cmp_status->state == COMPOSING_COMPONENT_CHAR)            \
             || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS      \
                 && cmp_status->state == COMPOSING_COMPONENT_RULE)));      \
                                                                           \
    if (! ends_components)                                                 \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if ((c1) == '0')                                                   \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if ((c1) == '2')                                              \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if ((c1) == '3')                                              \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        if ((c1) <= '2')                                                   \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
  } while (0)
3429
3430
/* Handle the composition end sequence ESC 1.

   The sequence is rejected when no composed char has been stored, or
   when "the state is COMPOSING_CHAR" and "the method is
   COMPOSITION_WITH_RULE" are both true or both false.  In that case
   the composition is abandoned with MAYBE_FINISH_COMPOSITION and
   control goes to invalid_code.

   Otherwise the annotation header, which starts cmp_status->length
   cells before charbuf, is completed: its first cell is decreased by
   the number of components plus 2 for COMPOSITION_WITH_ALTCHARS, or
   by three times the number of components for
   COMPOSITION_WITH_RULE_ALTCHARS; its third cell receives the number
   of composed chars.  That number is also added to char_offset, and
   the state returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    bool expecting_char = cmp_status->state == COMPOSING_CHAR;          \
    bool rule_based = cmp_status->method == COMPOSITION_WITH_RULE;      \
    int *header;                                                        \
                                                                        \
    if (cmp_status->nchars == 0 || expecting_char == rule_based)        \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header = charbuf - cmp_status->length;                              \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_ALTCHARS:                                   \
        header[0] -= cmp_status->ncomps + 2;                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        header[0] -= cmp_status->ncomps * 3;                            \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
    header[2] = cmp_status->nchars;                                     \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3450
/* Append the pair [ -2 RULE ] to charbuf, add 2 to
   cmp_status->length, and step cmp_status->state back by one.
   NOTE(review): presumably the step goes from a ...RULE state to the
   matching ...CHAR state, relying on the order of the state
   enumerators -- confirm against their definition.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->state--;                \
    cmp_status->length += 2;            \
  } while (0)
3460
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: the length grows by one, and C is counted in
   nchars when the state is COMPOSING_CHAR and in ncomps otherwise.

   The state is then advanced by one if the method is
   COMPOSITION_WITH_RULE, or if it is COMPOSITION_WITH_RULE_ALTCHARS
   and the state is COMPOSING_COMPONENT_CHAR.  NOTE(review):
   presumably that is the state in which a rule is expected next,
   relying on the order of the state enumerators -- confirm.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf += 1;                                                       \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_RULE:                                       \
        cmp_status->state++;                                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        if (cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
          cmp_status->state++;                                          \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
  } while (0)
3477
3478
3479 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3480
3481 static void
3482 decode_coding_iso_2022 (struct coding_system *coding)
3483 {
3484   const unsigned char *src = coding->source + coding->consumed;
3485   const unsigned char *src_end = coding->source + coding->src_bytes;
3486   const unsigned char *src_base;
3487   int *charbuf = coding->charbuf + coding->charbuf_used;
3488   /* We may produce two annotations (charset and composition) in one
3489      loop and one more charset annotation at the end.  */
3490   int *charbuf_end
3491     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3492   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3493   bool multibytep = coding->src_multibyte;
3494   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3495   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3496   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3497   int charset_id_2, charset_id_3;
3498   struct charset *charset;
3499   int c;
3500   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3501   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3502   ptrdiff_t char_offset = coding->produced_char;
3503   ptrdiff_t last_offset = char_offset;
3504   int last_id = charset_ascii;
3505   bool eol_dos
3506     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3507   int byte_after_cr = -1;
3508   int i;
3509
3510   setup_iso_safe_charsets (attrs);
3511   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3512
3513   if (cmp_status->state != COMPOSING_NO)
3514     {
3515       if (charbuf_end - charbuf < cmp_status->length)
3516         emacs_abort ();
3517       for (i = 0; i < cmp_status->length; i++)
3518         *charbuf++ = cmp_status->carryover[i];
3519       coding->annotated = 1;
3520     }
3521
3522   while (1)
3523     {
3524       int c1, c2, c3;
3525
3526       src_base = src;
3527       consumed_chars_base = consumed_chars;
3528
3529       if (charbuf >= charbuf_end)
3530         {
3531           if (byte_after_cr >= 0)
3532             src_base--;
3533           break;
3534         }
3535
3536       if (byte_after_cr >= 0)
3537         c1 = byte_after_cr, byte_after_cr = -1;
3538       else
3539         ONE_MORE_BYTE (c1);
3540       if (c1 < 0)
3541         goto invalid_code;
3542
3543       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3544         {
3545           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546           char_offset++;
3547           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3548           continue;
3549         }
3550
3551       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3552         {
3553           if (c1 == ISO_CODE_ESC)
3554             {
3555               if (src + 1 >= src_end)
3556                 goto no_more_source;
3557               *charbuf++ = ISO_CODE_ESC;
3558               char_offset++;
3559               if (src[0] == '%' && src[1] == '@')
3560                 {
3561                   src += 2;
3562                   consumed_chars += 2;
3563                   char_offset += 2;
3564                   /* We are sure charbuf can contain two more chars. */
3565                   *charbuf++ = '%';
3566                   *charbuf++ = '@';
3567                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3568                 }
3569             }
3570           else
3571             {
3572               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3573               char_offset++;
3574             }
3575           continue;
3576         }
3577
3578       if ((cmp_status->state == COMPOSING_RULE
3579            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3580           && c1 != ISO_CODE_ESC)
3581         {
3582           int rule;
3583
3584           DECODE_COMPOSITION_RULE (rule);
3585           STORE_COMPOSITION_RULE (rule);
3586           continue;
3587         }
3588
3589       /* We produce at most one character.  */
3590       switch (iso_code_class [c1])
3591         {
3592         case ISO_0x20_or_0x7F:
3593           if (charset_id_0 < 0
3594               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3595             /* This is SPACE or DEL.  */
3596             charset = CHARSET_FROM_ID (charset_ascii);
3597           else
3598             charset = CHARSET_FROM_ID (charset_id_0);
3599           break;
3600
3601         case ISO_graphic_plane_0:
3602           if (charset_id_0 < 0)
3603             charset = CHARSET_FROM_ID (charset_ascii);
3604           else
3605             charset = CHARSET_FROM_ID (charset_id_0);
3606           break;
3607
3608         case ISO_0xA0_or_0xFF:
3609           if (charset_id_1 < 0
3610               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3611               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3612             goto invalid_code;
3613           /* This is a graphic character, we fall down ... */
3614           FALLTHROUGH;
3615         case ISO_graphic_plane_1:
3616           if (charset_id_1 < 0)
3617             goto invalid_code;
3618           charset = CHARSET_FROM_ID (charset_id_1);
3619           break;
3620
3621         case ISO_control_0:
3622           if (eol_dos && c1 == '\r')
3623             ONE_MORE_BYTE (byte_after_cr);
3624           MAYBE_FINISH_COMPOSITION ();
3625           charset = CHARSET_FROM_ID (charset_ascii);
3626           break;
3627
3628         case ISO_control_1:
3629           goto invalid_code;
3630
3631         case ISO_shift_out:
3632           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3633               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3634             goto invalid_code;
3635           CODING_ISO_INVOCATION (coding, 0) = 1;
3636           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3637           continue;
3638
3639         case ISO_shift_in:
3640           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3641             goto invalid_code;
3642           CODING_ISO_INVOCATION (coding, 0) = 0;
3643           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644           continue;
3645
3646         case ISO_single_shift_2_7:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3648             goto invalid_code;
3649           FALLTHROUGH;
3650         case ISO_single_shift_2:
3651           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3652             goto invalid_code;
3653           /* SS2 is handled as an escape sequence of ESC 'N' */
3654           c1 = 'N';
3655           goto label_escape_sequence;
3656
3657         case ISO_single_shift_3:
3658           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3659             goto invalid_code;
3660           /* SS3 is handled as an escape sequence of ESC 'O' */
3661           c1 = 'O';
3662           goto label_escape_sequence;
3663
3664         case ISO_control_sequence_introducer:
3665           /* CSI is handled as an escape sequence of ESC '[' ...  */
3666           c1 = '[';
3667           goto label_escape_sequence;
3668
3669         case ISO_escape:
3670           ONE_MORE_BYTE (c1);
3671         label_escape_sequence:
3672           /* Escape sequences handled here are invocation,
3673              designation, direction specification, and character
3674              composition specification.  */
3675           switch (c1)
3676             {
3677             case '&':           /* revision of following character set */
3678               ONE_MORE_BYTE (c1);
3679               if (!(c1 >= '@' && c1 <= '~'))
3680                 goto invalid_code;
3681               ONE_MORE_BYTE (c1);
3682               if (c1 != ISO_CODE_ESC)
3683                 goto invalid_code;
3684               ONE_MORE_BYTE (c1);
3685               goto label_escape_sequence;
3686
3687             case '$':           /* designation of 2-byte character set */
3688               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3689                 goto invalid_code;
3690               {
3691                 int reg, chars96;
3692
3693                 ONE_MORE_BYTE (c1);
3694                 if (c1 >= '@' && c1 <= 'B')
3695                   {     /* designation of JISX0208.1978, GB2312.1980,
3696                            or JISX0208.1980 */
3697                     reg = 0, chars96 = 0;
3698                   }
3699                 else if (c1 >= 0x28 && c1 <= 0x2B)
3700                   { /* designation of DIMENSION2_CHARS94 character set */
3701                     reg = c1 - 0x28, chars96 = 0;
3702                     ONE_MORE_BYTE (c1);
3703                   }
3704                 else if (c1 >= 0x2C && c1 <= 0x2F)
3705                   { /* designation of DIMENSION2_CHARS96 character set */
3706                     reg = c1 - 0x2C, chars96 = 1;
3707                     ONE_MORE_BYTE (c1);
3708                   }
3709                 else
3710                   goto invalid_code;
3711                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3712                 /* We must update these variables now.  */
3713                 if (reg == 0)
3714                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3715                 else if (reg == 1)
3716                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3717                 if (chars96 < 0)
3718                   goto invalid_code;
3719               }
3720               continue;
3721
3722             case 'n':           /* invocation of locking-shift-2 */
3723               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3724                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3725                 goto invalid_code;
3726               CODING_ISO_INVOCATION (coding, 0) = 2;
3727               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3728               continue;
3729
3730             case 'o':           /* invocation of locking-shift-3 */
3731               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3732                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3733                 goto invalid_code;
3734               CODING_ISO_INVOCATION (coding, 0) = 3;
3735               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3736               continue;
3737
3738             case 'N':           /* invocation of single-shift-2 */
3739               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3740                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3741                 goto invalid_code;
3742               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3743               if (charset_id_2 < 0)
3744                 charset = CHARSET_FROM_ID (charset_ascii);
3745               else
3746                 charset = CHARSET_FROM_ID (charset_id_2);
3747               ONE_MORE_BYTE (c1);
3748               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3749                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3750                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3751                           ? c1 >= 0x80 : c1 < 0x80)))
3752                 goto invalid_code;
3753               break;
3754
3755             case 'O':           /* invocation of single-shift-3 */
3756               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3757                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3758                 goto invalid_code;
3759               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3760               if (charset_id_3 < 0)
3761                 charset = CHARSET_FROM_ID (charset_ascii);
3762               else
3763                 charset = CHARSET_FROM_ID (charset_id_3);
3764               ONE_MORE_BYTE (c1);
3765               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3766                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3767                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3768                           ? c1 >= 0x80 : c1 < 0x80)))
3769                 goto invalid_code;
3770               break;
3771
3772             case '0': case '2': case '3': case '4': /* start composition */
3773               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3774                 goto invalid_code;
3775               if (last_id != charset_ascii)
3776                 {
3777                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3778                   last_id = charset_ascii;
3779                   last_offset = char_offset;
3780                 }
3781               DECODE_COMPOSITION_START (c1);
3782               continue;
3783
3784             case '1':           /* end composition */
3785               if (cmp_status->state == COMPOSING_NO)
3786                 goto invalid_code;
3787               DECODE_COMPOSITION_END ();
3788               continue;
3789
3790             case '[':           /* specification of direction */
3791               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3792                 goto invalid_code;
3793               /* For the moment, nested direction is not supported.
3794                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3795                  left-to-right, and nonzero means right-to-left.  */
3796               ONE_MORE_BYTE (c1);
3797               switch (c1)
3798                 {
3799                 case ']':       /* end of the current direction */
3800                   coding->mode &= ~CODING_MODE_DIRECTION;
3801                   break;
3802
3803                 case '0':       /* end of the current direction */
3804                 case '1':       /* start of left-to-right direction */
3805                   ONE_MORE_BYTE (c1);
3806                   if (c1 == ']')
3807                     coding->mode &= ~CODING_MODE_DIRECTION;
3808                   else
3809                     goto invalid_code;
3810                   break;
3811
3812                 case '2':       /* start of right-to-left direction */
3813                   ONE_MORE_BYTE (c1);
3814                   if (c1 == ']')
3815                     coding->mode |= CODING_MODE_DIRECTION;
3816                   else
3817                     goto invalid_code;
3818                   break;
3819
3820                 default:
3821                   goto invalid_code;
3822                 }
3823               continue;
3824
3825             case '%':
3826               ONE_MORE_BYTE (c1);
3827               if (c1 == '/')
3828                 {
3829                   /* CTEXT extended segment:
3830                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3831                      We keep these bytes as is for the moment.
3832                      They may be decoded by post-read-conversion.  */
3833                   int dim, M, L;
3834                   int size;
3835
3836                   ONE_MORE_BYTE (dim);
3837                   if (dim < '0' || dim > '4')
3838                     goto invalid_code;
3839                   ONE_MORE_BYTE (M);
3840                   if (M < 128)
3841                     goto invalid_code;
3842                   ONE_MORE_BYTE (L);
3843                   if (L < 128)
3844                     goto invalid_code;
3845                   size = ((M - 128) * 128) + (L - 128);
3846                   if (charbuf + 6 > charbuf_end)
3847                     goto break_loop;
3848                   *charbuf++ = ISO_CODE_ESC;
3849                   *charbuf++ = '%';
3850                   *charbuf++ = '/';
3851                   *charbuf++ = dim;
3852                   *charbuf++ = BYTE8_TO_CHAR (M);
3853                   *charbuf++ = BYTE8_TO_CHAR (L);
3854                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3855                 }
3856               else if (c1 == 'G')
3857                 {
3858                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3859                      ESC % G --UTF-8-BYTES-- ESC % @
3860                      We keep these bytes as is for the moment.
3861                      They may be decoded by post-read-conversion.  */
3862                   if (charbuf + 3 > charbuf_end)
3863                     goto break_loop;
3864                   *charbuf++ = ISO_CODE_ESC;
3865                   *charbuf++ = '%';
3866                   *charbuf++ = 'G';
3867                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3868                 }
3869               else
3870                 goto invalid_code;
3871               continue;
3872               break;
3873
3874             default:
3875               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3876                 goto invalid_code;
3877               {
3878                 int reg, chars96;
3879
3880                 if (c1 >= 0x28 && c1 <= 0x2B)
3881                   { /* designation of DIMENSION1_CHARS94 character set */
3882                     reg = c1 - 0x28, chars96 = 0;
3883                     ONE_MORE_BYTE (c1);
3884                   }
3885                 else if (c1 >= 0x2C && c1 <= 0x2F)
3886                   { /* designation of DIMENSION1_CHARS96 character set */
3887                     reg = c1 - 0x2C, chars96 = 1;
3888                     ONE_MORE_BYTE (c1);
3889                   }
3890                 else
3891                   goto invalid_code;
3892                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3893                 /* We must update these variables now.  */
3894                 if (reg == 0)
3895                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3896                 else if (reg == 1)
3897                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3898                 if (chars96 < 0)
3899                   goto invalid_code;
3900               }
3901               continue;
3902             }
3903           break;
3904
3905         default:
3906           emacs_abort ();
3907         }
3908
3909       if (cmp_status->state == COMPOSING_NO
3910           && charset->id != charset_ascii
3911           && last_id != charset->id)
3912         {
3913           if (last_id != charset_ascii)
3914             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3915           last_id = charset->id;
3916           last_offset = char_offset;
3917         }
3918
3919       /* Now we know CHARSET and 1st position code C1 of a character.
3920          Produce a decoded character while getting 2nd and 3rd
3921          position codes C2, C3 if necessary.  */
3922       if (CHARSET_DIMENSION (charset) > 1)
3923         {
3924           ONE_MORE_BYTE (c2);
3925           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3926               || ((c1 & 0x80) != (c2 & 0x80)))
3927             /* C2 is not in a valid range.  */
3928             goto invalid_code;
3929           if (CHARSET_DIMENSION (charset) == 2)
3930             c1 = (c1 << 8) | c2;
3931           else
3932             {
3933               ONE_MORE_BYTE (c3);
3934               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3935                   || ((c1 & 0x80) != (c3 & 0x80)))
3936                 /* C3 is not in a valid range.  */
3937                 goto invalid_code;
3938               c1 = (c1 << 16) | (c2 << 8) | c2;
3939             }
3940         }
3941       c1 &= 0x7F7F7F;
3942       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3943       if (c < 0)
3944         {
3945           MAYBE_FINISH_COMPOSITION ();
3946           for (; src_base < src; src_base++, char_offset++)
3947             {
3948               if (ASCII_CHAR_P (*src_base))
3949                 *charbuf++ = *src_base;
3950               else
3951                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3952             }
3953         }
3954       else if (cmp_status->state == COMPOSING_NO)
3955         {
3956           *charbuf++ = c;
3957           char_offset++;
3958         }
3959       else if ((cmp_status->state == COMPOSING_CHAR
3960                 ? cmp_status->nchars
3961                 : cmp_status->ncomps)
3962                >= MAX_COMPOSITION_COMPONENTS)
3963         {
3964           /* Too long composition.  */
3965           MAYBE_FINISH_COMPOSITION ();
3966           *charbuf++ = c;
3967           char_offset++;
3968         }
3969       else
3970         STORE_COMPOSITION_CHAR (c);
3971       continue;
3972
3973     invalid_code:
3974       MAYBE_FINISH_COMPOSITION ();
3975       src = src_base;
3976       consumed_chars = consumed_chars_base;
3977       ONE_MORE_BYTE (c);
3978       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3979       char_offset++;
3980       /* Reset the invocation and designation status to the safest
3981          one; i.e. designate ASCII to the graphic register 0, and
3982          invoke that register to the graphic plane 0.  This typically
3983          helps the case that an designation sequence for ASCII "ESC (
3984          B" is somehow broken (e.g. broken by a newline).  */
3985       CODING_ISO_INVOCATION (coding, 0) = 0;
3986       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3987       charset_id_0 = charset_ascii;
3988       continue;
3989
3990     break_loop:
3991       break;
3992     }
3993
3994  no_more_source:
3995   if (cmp_status->state != COMPOSING_NO)
3996     {
3997       if (coding->mode & CODING_MODE_LAST_BLOCK)
3998         MAYBE_FINISH_COMPOSITION ();
3999       else
4000         {
4001           charbuf -= cmp_status->length;
4002           for (i = 0; i < cmp_status->length; i++)
4003             cmp_status->carryover[i] = charbuf[i];
4004         }
4005     }
4006   else if (last_id != charset_ascii)
4007     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4008   coding->consumed_char += consumed_chars_base;
4009   coding->consumed = src_base - coding->source;
4010   coding->charbuf_used = charbuf - coding->charbuf;
4011 }
4012
4013
4014 /* ISO2022 encoding stuff.  */
4015
4016 /*
4017    Just saying "ISO2022" is not enough for encoding; we have to
4018    specify more details.  In Emacs, each coding system of the ISO2022
4019    variant has the following specifications:
4020         1. Initial designation to G0 thru G3.
4021         2. Allows short-form designation?
4022         3. ASCII should be designated to G0 before control characters?
4023         4. ASCII should be designated to G0 at end of line?
4024         5. 7-bit environment or 8-bit environment?
4025         6. Use locking-shift?
4026         7. Use Single-shift?
4027    And the following two are only for Japanese:
4028         8. Use ASCII in place of JIS0201-1976-Roman?
4029         9. Use JISX0208-1983 in place of JISX0208-1978?
4030    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4031    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4032    details.
4033 */
4034
4035 /* Produce codes (an escape sequence) for designating CHARSET to graphic
4036    register REG at DST, increment DST, and record the designation in
4037    CODING.  If CHARSET is a multibyte 94-charset whose <final-char> is '@',
4038    'A', or 'B', REG is 0, and CODING allows, produce the short form.  */
4039
4040 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4041   do {                                                                  \
4042     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4043     const char *intermediate_char_94 = "()*+"; /* indexed by REG */     \
4044     const char *intermediate_char_96 = ",-./"; /* indexed by REG */     \
4045     int revision = -1; /* stays -1 unless the REVISION flag is set */   \
4046                                                                         \
4047     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4048       revision = CHARSET_ISO_REVISION (charset);                        \
4049                                                                         \
4050     if (revision >= 0)                                                  \
4051       {                                                                 \
4052         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); /* revision prefix */ \
4053         EMIT_ONE_BYTE ('@' + revision);                                 \
4054       }                                                                 \
4055     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4056     if (CHARSET_DIMENSION (charset) == 1)                               \
4057       {                                                                 \
4058         int b; /* intermediate byte for REG */                          \
4059         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4060           b = intermediate_char_94[reg];                                \
4061         else                                                            \
4062           b = intermediate_char_96[reg];                                \
4063         EMIT_ONE_ASCII_BYTE (b);                                        \
4064       }                                                                 \
4065     else                                                                \
4066       {                                                                 \
4067         EMIT_ONE_ASCII_BYTE ('$'); /* multibyte charset */              \
4068         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4069           { /* short form omits the intermediate byte */                \
4070             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4071                 || reg != 0                                             \
4072                 || final_char < '@' || final_char > 'B')                \
4073               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4074           }                                                             \
4075         else                                                            \
4076           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4077       }                                                                 \
4078     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4079                                                                         \
4080     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4081   } while (0)
4082
4083
4084 /* The following two macros produce the code for an ISO2022 single-shift
4085    function (single-shift-2 and single-shift-3): an escape sequence if
4086    CODING is 7-bit, else one control byte.  Both flag CODING as shifting.  */
4087
4088 #define ENCODE_SINGLE_SHIFT_2                                           \
4089   do {                                                                  \
4090     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4091       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); /* 7-bit: ESC N */      \
4092     else                                                                \
4093       EMIT_ONE_BYTE (ISO_CODE_SS2); /* 8-bit: one control byte */       \
4094     CODING_ISO_SINGLE_SHIFTING (coding) = 1; /* reset by next char */   \
4095   } while (0)
4096
4097
4098 #define ENCODE_SINGLE_SHIFT_3                                           \
4099   do {                                                                  \
4100     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4101       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); /* 7-bit: ESC O */      \
4102     else                                                                \
4103       EMIT_ONE_BYTE (ISO_CODE_SS3); /* 8-bit: one control byte */       \
4104     CODING_ISO_SINGLE_SHIFTING (coding) = 1; /* reset by next char */   \
4105   } while (0)
4106
4107
4108 /* The following four macros produce codes (control character or escape
4109    sequence) for ISO2022 locking-shift functions (shift-in, shift-out,
4110    LS2, and LS3), and record register 0..3 as invoked to graphic plane 0.  */
4111
4112 #define ENCODE_SHIFT_IN                                 \
4113   do {                                                  \
4114     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4115     CODING_ISO_INVOCATION (coding, 0) = 0; /* reg 0 */  \
4116   } while (0)
4117
4118
4119 #define ENCODE_SHIFT_OUT                                \
4120   do {                                                  \
4121     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4122     CODING_ISO_INVOCATION (coding, 0) = 1; /* reg 1 */  \
4123   } while (0)
4124
4125
4126 #define ENCODE_LOCKING_SHIFT_2                          \
4127   do {                                                  \
4128     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); /* LS2 */ \
4129     CODING_ISO_INVOCATION (coding, 0) = 2; /* reg 2 */  \
4130   } while (0)
4131
4132
4133 #define ENCODE_LOCKING_SHIFT_3  /* invoke register 3 */ \
4134   do { /* LS3 is "ESC o"; "ESC n" is LS2.  */           \
4135     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o'); /* LS3 */ \
4136     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4137   } while (0)
4138
4139
4140 /* Produce codes for a DIMENSION1 character of CHARSET whose position-code
4141    is C1, first producing designation/invocation sequences if necessary.
4142    CHARSET must be an lvalue; it is reassigned when ASCII maps to Roman.  */
4143
4144 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4145   do {                                                                  \
4146     int id = CHARSET_ID (charset);                                      \
4147                                                                         \
4148     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4149         && id == charset_ascii)                                         \
4150       {                                                                 \
4151         id = charset_jisx0201_roman;                                    \
4152         charset = CHARSET_FROM_ID (id); /* assigns to the argument */   \
4153       }                                                                 \
4154                                                                         \
4155     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4156       {                                                                 \
4157         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4158           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4159         else                                                            \
4160           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4161         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4162         break; /* leave the do-while (1) */                             \
4163       }                                                                 \
4164     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4165       {                                                                 \
4166         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F); /* via plane 0 */            \
4167         break;                                                          \
4168       }                                                                 \
4169     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4170       {                                                                 \
4171         EMIT_ONE_BYTE ((c1) | 0x80); /* via plane 1 */                  \
4172         break;                                                          \
4173       }                                                                 \
4174     else                                                                \
4175       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4176          must invoke it, or, at first, designate it to some graphic     \
4177          register.  Then repeat the loop to actually produce the        \
4178          character.  */                                                 \
4179       dst = encode_invocation_designation (charset, coding, dst,        \
4180                                            &produced_chars);            \
4181   } while (1)
4182
4183
4184 /* Produce codes for a DIMENSION2 character of CHARSET whose position-codes
4185    are C1 and C2, first producing designation/invocation codes if necessary.
4186    CHARSET must be an lvalue; the USE_OLDJIS flag may replace JISX0208.  */
4187
4188 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4189   do {                                                                  \
4190     int id = CHARSET_ID (charset);                                      \
4191                                                                         \
4192     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4193         && id == charset_jisx0208)                                      \
4194       {                                                                 \
4195         id = charset_jisx0208_1978;                                     \
4196         charset = CHARSET_FROM_ID (id); /* assigns to the argument */   \
4197       }                                                                 \
4198                                                                         \
4199     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4200       {                                                                 \
4201         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4202           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4203         else                                                            \
4204           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4205         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4206         break; /* leave the do-while (1) */                             \
4207       }                                                                 \
4208     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4209       {                                                                 \
4210         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4211         break;                                                          \
4212       }                                                                 \
4213     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4214       {                                                                 \
4215         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4216         break;                                                          \
4217       }                                                                 \
4218     else                                                                \
4219       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4220          must invoke it, or, at first, designate it to some graphic     \
4221          register.  Then repeat the loop to actually produce the        \
4222          character.  */                                                 \
4223       dst = encode_invocation_designation (charset, coding, dst,        \
4224                                            &produced_chars);            \
4225   } while (1)
4226
4227
4228 #define ENCODE_ISO_CHARACTER(charset, c) /* Encode C using CHARSET.  */    \
4229   do {                                                                     \
4230     unsigned code; /* presumably set by CODING_ENCODE_CHAR below */        \
4231     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4232                                                                            \
4233     if (CHARSET_DIMENSION (charset) == 1)                                  \
4234       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4235     else /* pass the high and low bytes of CODE as C1 and C2 */            \
4236       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4237   } while (0)
4238
4239
4240 /* Produce at DST the designation and invocation codes needed to use
4241    CHARSET.  The element `spec.iso_2022' of *CODING is updated, and
4242    *P_NCHARS is read on entry and stored back.  Return the new DST.  */
4243
4244 static unsigned char *
4245 encode_invocation_designation (struct charset *charset,
4246                                struct coding_system *coding,
4247                                unsigned char *dst, ptrdiff_t *p_nchars)
4248 {
4249   bool multibytep = coding->dst_multibyte; /* not named below; presumably used by the emit macros */
4250   ptrdiff_t produced_chars = *p_nchars; /* presumably advanced by the emit macros; stored back at the end */
4251   int reg;                      /* graphic register number */
4252   int id = CHARSET_ID (charset);
4253
4254   /* At first, check designations.  */
4255   for (reg = 0; reg < 4; reg++)
4256     if (id == CODING_ISO_DESIGNATION (coding, reg))
4257       break;
4258
4259   if (reg >= 4) /* the loop above found no register holding CHARSET */
4260     {
4261       /* CHARSET is not yet designated to any graphic registers.  */
4262       /* At first check the requested designation.  */
4263       reg = CODING_ISO_REQUEST (coding, id);
4264       if (reg < 0)
4265         /* Since CHARSET requests no special designation, designate it
4266            to graphic register 0.  */
4267         reg = 0;
4268
4269       ENCODE_DESIGNATION (charset, reg, coding);
4270     }
4271
4272   if (CODING_ISO_INVOCATION (coding, 0) != reg
4273       && CODING_ISO_INVOCATION (coding, 1) != reg)
4274     {
4275       /* Since the graphic register REG is not invoked to any graphic
4276          planes, invoke it to graphic plane 0.  */
4277       switch (reg)
4278         {
4279         case 0:                 /* graphic register 0 */
4280           ENCODE_SHIFT_IN;
4281           break;
4282
4283         case 1:                 /* graphic register 1 */
4284           ENCODE_SHIFT_OUT;
4285           break;
4286
4287         case 2:                 /* graphic register 2 */
4288           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4289             ENCODE_SINGLE_SHIFT_2;
4290           else
4291             ENCODE_LOCKING_SHIFT_2;
4292           break;
4293
4294         case 3:                 /* graphic register 3 */
4295           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4296             ENCODE_SINGLE_SHIFT_3;
4297           else
4298             ENCODE_LOCKING_SHIFT_3;
4299           break;
4300
4301         default:                /* any other REG: emit nothing */
4302           break;
4303         }
4304     }
4305
4306   *p_nchars = produced_chars;
4307   return dst;
4308 }
4309
4310
4311 /* Produce codes for designation and invocation to reset the graphic
4312    plane 0 and the four graphic registers to their initial state.  */
4313 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4314   do {                                                                  \
4315     int reg;                                                            \
4316     struct charset *charset;                                            \
4317                                                                         \
4318     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4319       ENCODE_SHIFT_IN; /* invoke register 0 again */                    \
4320     for (reg = 0; reg < 4; reg++)                                       \
4321       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4322           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4323               != CODING_ISO_INITIAL (coding, reg)))                     \
4324         { /* REG has an initial charset that is not the current one */  \
4325           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4326           ENCODE_DESIGNATION (charset, reg, coding);                    \
4327         }                                                               \
4328   } while (0)
4329
4330
4331 /* Produce designation sequences for the charsets used in the line that
4332    starts at CHARBUF (scanning stops at a newline or at CHARBUF_END), store
4333    them at DST, and return the number of produced bytes.  DST should not
4334    point directly into buffer text, which a char_charset call may relocate.
4335
4336    If the current block ends before any end-of-line, we may fail to
4337    find all the necessary designations.  */
4338
4339 static ptrdiff_t
4340 encode_designation_at_bol (struct coding_system *coding,
4341                            int *charbuf, int *charbuf_end,
4342                            unsigned char *dst)
4343 {
4344   unsigned char *orig = dst; /* start of output, for the byte count returned */
4345   struct charset *charset;
4346   /* Table of charsets to be designated to each graphic register.  */
4347   int r[4]; /* charset ID per register, or -1 while none is recorded */
4348   int c, found = 0, reg; /* FOUND counts the filled slots of R */
4349   ptrdiff_t produced_chars = 0; /* not named below; presumably used by the emit macros */
4350   bool multibytep = coding->dst_multibyte; /* likewise */
4351   Lisp_Object attrs;
4352   Lisp_Object charset_list;
4353
4354   attrs = CODING_ID_ATTRS (coding->id);
4355   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4356   if (EQ (charset_list, Qiso_2022))
4357     charset_list = Viso_2022_charset_list;
4358
4359   for (reg = 0; reg < 4; reg++)
4360     r[reg] = -1;
4361
4362   while (charbuf < charbuf_end && found < 4)
4363     {
4364       int id;
4365
4366       c = *charbuf++;
4367       if (c == '\n') /* only the current line is scanned */
4368         break;
4369       charset = char_charset (c, charset_list, NULL);
4370       id = CHARSET_ID (charset);
4371       reg = CODING_ISO_REQUEST (coding, id);
4372       if (reg >= 0 && r[reg] < 0) /* the first charset requesting REG wins */
4373         {
4374           found++;
4375           r[reg] = id;
4376         }
4377     }
4378
4379   if (found)
4380     {
4381       for (reg = 0; reg < 4; reg++)
4382         if (r[reg] >= 0
4383             && CODING_ISO_DESIGNATION (coding, reg) != r[reg]) /* skip if already designated */
4384           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4385     }
4386
4387   return dst - orig;
4388 }
4389
4390 /* Encode CODING->charbuf as ISO2022.  See "GENERAL NOTES on `encode_coding_XXX ()' functions" above.  */
4391
4392 static bool
4393 encode_coding_iso_2022 (struct coding_system *coding)
4394 {
4395   bool multibytep = coding->dst_multibyte; /* not named below; presumably used by the emit macros */
4396   int *charbuf = coding->charbuf;
4397   int *charbuf_end = charbuf + coding->charbuf_used;
4398   unsigned char *dst = coding->destination + coding->produced;
4399   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4400   int safe_room = 16; /* argument of every ASSURE_DESTINATION call below */
4401   bool bol_designation
4402     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4403        && CODING_ISO_BOL (coding));
4404   ptrdiff_t produced_chars = 0; /* added to coding->produced_char at the end */
4405   Lisp_Object attrs, eol_type, charset_list;
4406   bool ascii_compatible;
4407   int c;
4408   int preferred_charset_id = -1; /* set by a charset annotation; -1 means none */
4409
4410   CODING_GET_INFO (coding, attrs, charset_list);
4411   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4412   if (VECTORP (eol_type))
4413     eol_type = Qunix;
4414
4415   setup_iso_safe_charsets (attrs);
4416   /* Charset list may have been changed.  */
4417   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4418   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4419
4420   ascii_compatible /* if true, ASCII is emitted below without ENCODE_ISO_CHARACTER */
4421     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4422        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4423                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4424
4425   while (charbuf < charbuf_end)
4426     {
4427       ASSURE_DESTINATION (safe_room);
4428
4429       if (bol_designation)
4430         {
4431           /* We have to produce designation sequences if any now.  */
4432           unsigned char desig_buf[16]; /* NOTE(review): confirm 16 bytes always suffice for the sequences produced */
4433           ptrdiff_t nbytes;
4434           ptrdiff_t offset;
4435
4436           charset_map_loaded = 0;
4437           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4438                                               desig_buf);
4439           if (charset_map_loaded
4440               && (offset = coding_change_destination (coding)))
4441             {
4442               dst += offset; /* keep DST and DST_END in step with the changed destination */
4443               dst_end += offset;
4444             }
4445           memcpy (dst, desig_buf, nbytes);
4446           dst += nbytes;
4447           /* We are sure that designation sequences are all ASCII bytes.  */
4448           produced_chars += nbytes;
4449           bol_designation = 0;
4450           ASSURE_DESTINATION (safe_room);
4451         }
4452
4453       c = *charbuf++;
4454
4455       if (c < 0)
4456         {
4457           /* Handle an annotation.  */
4458           switch (*charbuf)
4459             {
4460             case CODING_ANNOTATE_COMPOSITION_MASK:
4461               /* Not yet implemented.  */
4462               break;
4463             case CODING_ANNOTATE_CHARSET_MASK:
4464               preferred_charset_id = charbuf[2];
4465               if (preferred_charset_id >= 0
4466                   && NILP (Fmemq (make_number (preferred_charset_id),
4467                                   charset_list)))
4468                 preferred_charset_id = -1; /* not in CHARSET_LIST: ignore the preference */
4469               break;
4470             default:
4471               emacs_abort ();
4472             }
4473           charbuf += -c - 1; /* C is negative here: skip the other -C - 1 elements */
4474           continue;
4475         }
4476
4477       /* Now encode the character C.  */
4478       if (c < 0x20 || c == 0x7F)
4479         {
4480           if (c == '\n'
4481               || (c == '\r' && EQ (eol_type, Qmac)))
4482             {
4483               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4484                 ENCODE_RESET_PLANE_AND_REGISTER ();
4485               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4486                 {
4487                   int i;
4488
4489                   for (i = 0; i < 4; i++) /* only the recorded state is reset; nothing is emitted */
4490                     CODING_ISO_DESIGNATION (coding, i)
4491                       = CODING_ISO_INITIAL (coding, i);
4492                 }
4493               bol_designation = ((CODING_ISO_FLAGS (coding)
4494                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4495                                  != 0);
4496             }
4497           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4498             ENCODE_RESET_PLANE_AND_REGISTER ();
4499           EMIT_ONE_ASCII_BYTE (c); /* the control character itself, after any reset */
4500         }
4501       else if (ASCII_CHAR_P (c))
4502         {
4503           if (ascii_compatible)
4504             EMIT_ONE_ASCII_BYTE (c);
4505           else
4506             {
4507               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4508               ENCODE_ISO_CHARACTER (charset, c);
4509             }
4510         }
4511       else if (CHAR_BYTE8_P (c))
4512         {
4513           c = CHAR_TO_BYTE8 (c);
4514           EMIT_ONE_BYTE (c);
4515         }
4516       else
4517         {
4518           struct charset *charset;
4519
4520           if (preferred_charset_id >= 0)
4521             {
4522               bool result;
4523
4524               charset = CHARSET_FROM_ID (preferred_charset_id);
4525               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4526               if (! result) /* fall back to a search of CHARSET_LIST */
4527                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4528                                      NULL, charset);
4529             }
4530           else
4531             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4532                                  NULL, charset);
4533           if (!charset) /* presumably: no charset in the list contains C */
4534             {
4535               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4536                 {
4537                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4538                   charset = CHARSET_FROM_ID (charset_ascii);
4539                 }
4540               else
4541                 {
4542                   c = coding->default_char;
4543                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4544                                        charset_list, NULL, charset);
4545                 }
4546             }
4547           ENCODE_ISO_CHARACTER (charset, c);
4548         }
4549     }
4550
4551   if (coding->mode & CODING_MODE_LAST_BLOCK
4552       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4553     {
4554       ASSURE_DESTINATION (safe_room);
4555       ENCODE_RESET_PLANE_AND_REGISTER ();
4556     }
4557   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4558   CODING_ISO_BOL (coding) = bol_designation; /* saved; read back at function entry */
4559   coding->produced_char += produced_chars;
4560   coding->produced = dst - coding->destination;
4561   return 0;
4562 }
4563
4564 \f
/*** 8. SJIS and BIG5 handlers ***/
4566
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  In the future, they may be supported only by CCL.  */
4570
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA0 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
4587
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   Big5 (1st byte)      0xA1 .. 0xFE
        (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

  */
4600
4601 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4602    Return true if a text is encoded in SJIS.  */
4603
4604 static bool
4605 detect_coding_sjis (struct coding_system *coding,
4606                     struct coding_detection_info *detect_info)
4607 {
4608   const unsigned char *src = coding->source, *src_base;
4609   const unsigned char *src_end = coding->source + coding->src_bytes;
4610   bool multibytep = coding->src_multibyte;
4611   ptrdiff_t consumed_chars = 0;
4612   int found = 0;
4613   int c;
4614   Lisp_Object attrs, charset_list;
4615   int max_first_byte_of_2_byte_code;
4616
4617   CODING_GET_INFO (coding, attrs, charset_list);
4618   max_first_byte_of_2_byte_code
4619     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4620
4621   detect_info->checked |= CATEGORY_MASK_SJIS;
4622   /* A coding system of this category is always ASCII compatible.  */
4623   src += coding->head_ascii;
4624
4625   while (1)
4626     {
4627       src_base = src;
4628       ONE_MORE_BYTE (c);
4629       if (c < 0x80)
4630         continue;
4631       if ((c >= 0x81 && c <= 0x9F)
4632           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4633         {
4634           ONE_MORE_BYTE (c);
4635           if (c < 0x40 || c == 0x7F || c > 0xFC)
4636             break;
4637           found = CATEGORY_MASK_SJIS;
4638         }
4639       else if (c >= 0xA0 && c < 0xE0)
4640         found = CATEGORY_MASK_SJIS;
4641       else
4642         break;
4643     }
4644   detect_info->rejected |= CATEGORY_MASK_SJIS;
4645   return 0;
4646
4647  no_more_source:
4648   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4649     {
4650       detect_info->rejected |= CATEGORY_MASK_SJIS;
4651       return 0;
4652     }
4653   detect_info->found |= found;
4654   return 1;
4655 }
4656
4657 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4658    Return true if a text is encoded in BIG5.  */
4659
4660 static bool
4661 detect_coding_big5 (struct coding_system *coding,
4662                     struct coding_detection_info *detect_info)
4663 {
4664   const unsigned char *src = coding->source, *src_base;
4665   const unsigned char *src_end = coding->source + coding->src_bytes;
4666   bool multibytep = coding->src_multibyte;
4667   ptrdiff_t consumed_chars = 0;
4668   int found = 0;
4669   int c;
4670
4671   detect_info->checked |= CATEGORY_MASK_BIG5;
4672   /* A coding system of this category is always ASCII compatible.  */
4673   src += coding->head_ascii;
4674
4675   while (1)
4676     {
4677       src_base = src;
4678       ONE_MORE_BYTE (c);
4679       if (c < 0x80)
4680         continue;
4681       if (c >= 0xA1)
4682         {
4683           ONE_MORE_BYTE (c);
4684           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4685             return 0;
4686           found = CATEGORY_MASK_BIG5;
4687         }
4688       else
4689         break;
4690     }
4691   detect_info->rejected |= CATEGORY_MASK_BIG5;
4692   return 0;
4693
4694  no_more_source:
4695   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4696     {
4697       detect_info->rejected |= CATEGORY_MASK_BIG5;
4698       return 0;
4699     }
4700   detect_info->found |= found;
4701   return 1;
4702 }
4703
4704 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4705
4706 static void
4707 decode_coding_sjis (struct coding_system *coding)
4708 {
4709   const unsigned char *src = coding->source + coding->consumed;
4710   const unsigned char *src_end = coding->source + coding->src_bytes;
4711   const unsigned char *src_base;
4712   int *charbuf = coding->charbuf + coding->charbuf_used;
4713   /* We may produce one charset annotation in one loop and one more at
4714      the end.  */
4715   int *charbuf_end
4716     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4717   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4718   bool multibytep = coding->src_multibyte;
4719   struct charset *charset_roman, *charset_kanji, *charset_kana;
4720   struct charset *charset_kanji2;
4721   Lisp_Object attrs, charset_list, val;
4722   ptrdiff_t char_offset = coding->produced_char;
4723   ptrdiff_t last_offset = char_offset;
4724   int last_id = charset_ascii;
4725   bool eol_dos
4726     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4727   int byte_after_cr = -1;
4728
4729   CODING_GET_INFO (coding, attrs, charset_list);
4730
4731   val = charset_list;
4732   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4733   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4734   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4735   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4736
4737   while (1)
4738     {
4739       int c, c1;
4740       struct charset *charset;
4741
4742       src_base = src;
4743       consumed_chars_base = consumed_chars;
4744
4745       if (charbuf >= charbuf_end)
4746         {
4747           if (byte_after_cr >= 0)
4748             src_base--;
4749           break;
4750         }
4751
4752       if (byte_after_cr >= 0)
4753         c = byte_after_cr, byte_after_cr = -1;
4754       else
4755         ONE_MORE_BYTE (c);
4756       if (c < 0)
4757         goto invalid_code;
4758       if (c < 0x80)
4759         {
4760           if (eol_dos && c == '\r')
4761             ONE_MORE_BYTE (byte_after_cr);
4762           charset = charset_roman;
4763         }
4764       else if (c == 0x80 || c == 0xA0)
4765         goto invalid_code;
4766       else if (c >= 0xA1 && c <= 0xDF)
4767         {
4768           /* SJIS -> JISX0201-Kana */
4769           c &= 0x7F;
4770           charset = charset_kana;
4771         }
4772       else if (c <= 0xEF)
4773         {
4774           /* SJIS -> JISX0208 */
4775           ONE_MORE_BYTE (c1);
4776           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4777             goto invalid_code;
4778           c = (c << 8) | c1;
4779           SJIS_TO_JIS (c);
4780           charset = charset_kanji;
4781         }
4782       else if (c <= 0xFC && charset_kanji2)
4783         {
4784           /* SJIS -> JISX0213-2 */
4785           ONE_MORE_BYTE (c1);
4786           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4787             goto invalid_code;
4788           c = (c << 8) | c1;
4789           SJIS_TO_JIS2 (c);
4790           charset = charset_kanji2;
4791         }
4792       else
4793         goto invalid_code;
4794       if (charset->id != charset_ascii
4795           && last_id != charset->id)
4796         {
4797           if (last_id != charset_ascii)
4798             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4799           last_id = charset->id;
4800           last_offset = char_offset;
4801         }
4802       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4803       *charbuf++ = c;
4804       char_offset++;
4805       continue;
4806
4807     invalid_code:
4808       src = src_base;
4809       consumed_chars = consumed_chars_base;
4810       ONE_MORE_BYTE (c);
4811       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4812       char_offset++;
4813     }
4814
4815  no_more_source:
4816   if (last_id != charset_ascii)
4817     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4818   coding->consumed_char += consumed_chars_base;
4819   coding->consumed = src_base - coding->source;
4820   coding->charbuf_used = charbuf - coding->charbuf;
4821 }
4822
4823 static void
4824 decode_coding_big5 (struct coding_system *coding)
4825 {
4826   const unsigned char *src = coding->source + coding->consumed;
4827   const unsigned char *src_end = coding->source + coding->src_bytes;
4828   const unsigned char *src_base;
4829   int *charbuf = coding->charbuf + coding->charbuf_used;
4830   /* We may produce one charset annotation in one loop and one more at
4831      the end.  */
4832   int *charbuf_end
4833     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4834   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4835   bool multibytep = coding->src_multibyte;
4836   struct charset *charset_roman, *charset_big5;
4837   Lisp_Object attrs, charset_list, val;
4838   ptrdiff_t char_offset = coding->produced_char;
4839   ptrdiff_t last_offset = char_offset;
4840   int last_id = charset_ascii;
4841   bool eol_dos
4842     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4843   int byte_after_cr = -1;
4844
4845   CODING_GET_INFO (coding, attrs, charset_list);
4846   val = charset_list;
4847   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4848   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4849
4850   while (1)
4851     {
4852       int c, c1;
4853       struct charset *charset;
4854
4855       src_base = src;
4856       consumed_chars_base = consumed_chars;
4857
4858       if (charbuf >= charbuf_end)
4859         {
4860           if (byte_after_cr >= 0)
4861             src_base--;
4862           break;
4863         }
4864
4865       if (byte_after_cr >= 0)
4866         c = byte_after_cr, byte_after_cr = -1;
4867       else
4868         ONE_MORE_BYTE (c);
4869
4870       if (c < 0)
4871         goto invalid_code;
4872       if (c < 0x80)
4873         {
4874           if (eol_dos && c == '\r')
4875             ONE_MORE_BYTE (byte_after_cr);
4876           charset = charset_roman;
4877         }
4878       else
4879         {
4880           /* BIG5 -> Big5 */
4881           if (c < 0xA1 || c > 0xFE)
4882             goto invalid_code;
4883           ONE_MORE_BYTE (c1);
4884           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4885             goto invalid_code;
4886           c = c << 8 | c1;
4887           charset = charset_big5;
4888         }
4889       if (charset->id != charset_ascii
4890           && last_id != charset->id)
4891         {
4892           if (last_id != charset_ascii)
4893             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4894           last_id = charset->id;
4895           last_offset = char_offset;
4896         }
4897       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4898       *charbuf++ = c;
4899       char_offset++;
4900       continue;
4901
4902     invalid_code:
4903       src = src_base;
4904       consumed_chars = consumed_chars_base;
4905       ONE_MORE_BYTE (c);
4906       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4907       char_offset++;
4908     }
4909
4910  no_more_source:
4911   if (last_id != charset_ascii)
4912     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4913   coding->consumed_char += consumed_chars_base;
4914   coding->consumed = src_base - coding->source;
4915   coding->charbuf_used = charbuf - coding->charbuf;
4916 }
4917
4918 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4919    This function can encode charsets `ascii', `katakana-jisx0201',
4920    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4921    are sure that all these charsets are registered as official charset
4922    (i.e. do not have extended leading-codes).  Characters of other
4923    charsets are produced without any encoding.  */
4924
4925 static bool
4926 encode_coding_sjis (struct coding_system *coding)
4927 {
4928   bool multibytep = coding->dst_multibyte;
4929   int *charbuf = coding->charbuf;
4930   int *charbuf_end = charbuf + coding->charbuf_used;
4931   unsigned char *dst = coding->destination + coding->produced;
4932   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4933   int safe_room = 4;
4934   ptrdiff_t produced_chars = 0;
4935   Lisp_Object attrs, charset_list, val;
4936   bool ascii_compatible;
4937   struct charset *charset_kanji, *charset_kana;
4938   struct charset *charset_kanji2;
4939   int c;
4940
4941   CODING_GET_INFO (coding, attrs, charset_list);
4942   val = XCDR (charset_list);
4943   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4944   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4945   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4946
4947   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4948
4949   while (charbuf < charbuf_end)
4950     {
4951       ASSURE_DESTINATION (safe_room);
4952       c = *charbuf++;
4953       /* Now encode the character C.  */
4954       if (ASCII_CHAR_P (c) && ascii_compatible)
4955         EMIT_ONE_ASCII_BYTE (c);
4956       else if (CHAR_BYTE8_P (c))
4957         {
4958           c = CHAR_TO_BYTE8 (c);
4959           EMIT_ONE_BYTE (c);
4960         }
4961       else
4962         {
4963           unsigned code;
4964           struct charset *charset;
4965           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4966                                &code, charset);
4967
4968           if (!charset)
4969             {
4970               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4971                 {
4972                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4973                   charset = CHARSET_FROM_ID (charset_ascii);
4974                 }
4975               else
4976                 {
4977                   c = coding->default_char;
4978                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4979                                        charset_list, &code, charset);
4980                 }
4981             }
4982           if (code == CHARSET_INVALID_CODE (charset))
4983             emacs_abort ();
4984           if (charset == charset_kanji)
4985             {
4986               int c1, c2;
4987               JIS_TO_SJIS (code);
4988               c1 = code >> 8, c2 = code & 0xFF;
4989               EMIT_TWO_BYTES (c1, c2);
4990             }
4991           else if (charset == charset_kana)
4992             EMIT_ONE_BYTE (code | 0x80);
4993           else if (charset_kanji2 && charset == charset_kanji2)
4994             {
4995               int c1, c2;
4996
4997               c1 = code >> 8;
4998               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4999                   || c1 == 0x28
5000                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5001                 {
5002                   JIS_TO_SJIS2 (code);
5003                   c1 = code >> 8, c2 = code & 0xFF;
5004                   EMIT_TWO_BYTES (c1, c2);
5005                 }
5006               else
5007                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5008             }
5009           else
5010             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5011         }
5012     }
5013   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5014   coding->produced_char += produced_chars;
5015   coding->produced = dst - coding->destination;
5016   return 0;
5017 }
5018
5019 static bool
5020 encode_coding_big5 (struct coding_system *coding)
5021 {
5022   bool multibytep = coding->dst_multibyte;
5023   int *charbuf = coding->charbuf;
5024   int *charbuf_end = charbuf + coding->charbuf_used;
5025   unsigned char *dst = coding->destination + coding->produced;
5026   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5027   int safe_room = 4;
5028   ptrdiff_t produced_chars = 0;
5029   Lisp_Object attrs, charset_list, val;
5030   bool ascii_compatible;
5031   struct charset *charset_big5;
5032   int c;
5033
5034   CODING_GET_INFO (coding, attrs, charset_list);
5035   val = XCDR (charset_list);
5036   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5037   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5038
5039   while (charbuf < charbuf_end)
5040     {
5041       ASSURE_DESTINATION (safe_room);
5042       c = *charbuf++;
5043       /* Now encode the character C.  */
5044       if (ASCII_CHAR_P (c) && ascii_compatible)
5045         EMIT_ONE_ASCII_BYTE (c);
5046       else if (CHAR_BYTE8_P (c))
5047         {
5048           c = CHAR_TO_BYTE8 (c);
5049           EMIT_ONE_BYTE (c);
5050         }
5051       else
5052         {
5053           unsigned code;
5054           struct charset *charset;
5055           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5056                                &code, charset);
5057
5058           if (! charset)
5059             {
5060               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5061                 {
5062                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5063                   charset = CHARSET_FROM_ID (charset_ascii);
5064                 }
5065               else
5066                 {
5067                   c = coding->default_char;
5068                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5069                                        charset_list, &code, charset);
5070                 }
5071             }
5072           if (code == CHARSET_INVALID_CODE (charset))
5073             emacs_abort ();
5074           if (charset == charset_big5)
5075             {
5076               int c1, c2;
5077
5078               c1 = code >> 8, c2 = code & 0xFF;
5079               EMIT_TWO_BYTES (c1, c2);
5080             }
5081           else
5082             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5083         }
5084     }
5085   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5086   coding->produced_char += produced_chars;
5087   coding->produced = dst - coding->destination;
5088   return 0;
5089 }
5090
5091 \f
/*** 9. CCL handlers ***/
5093
5094 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5095    Return true if a text is encoded in a coding system of which
5096    encoder/decoder are written in CCL program.  */
5097
5098 static bool
5099 detect_coding_ccl (struct coding_system *coding,
5100                    struct coding_detection_info *detect_info)
5101 {
5102   const unsigned char *src = coding->source, *src_base;
5103   const unsigned char *src_end = coding->source + coding->src_bytes;
5104   bool multibytep = coding->src_multibyte;
5105   ptrdiff_t consumed_chars = 0;
5106   int found = 0;
5107   unsigned char *valids;
5108   ptrdiff_t head_ascii = coding->head_ascii;
5109   Lisp_Object attrs;
5110
5111   detect_info->checked |= CATEGORY_MASK_CCL;
5112
5113   coding = &coding_categories[coding_category_ccl];
5114   valids = CODING_CCL_VALIDS (coding);
5115   attrs = CODING_ID_ATTRS (coding->id);
5116   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5117     src += head_ascii;
5118
5119   while (1)
5120     {
5121       int c;
5122
5123       src_base = src;
5124       ONE_MORE_BYTE (c);
5125       if (c < 0 || ! valids[c])
5126         break;
5127       if ((valids[c] > 1))
5128         found = CATEGORY_MASK_CCL;
5129     }
5130   detect_info->rejected |= CATEGORY_MASK_CCL;
5131   return 0;
5132
5133  no_more_source:
5134   detect_info->found |= found;
5135   return 1;
5136 }
5137
5138 static void
5139 decode_coding_ccl (struct coding_system *coding)
5140 {
5141   const unsigned char *src = coding->source + coding->consumed;
5142   const unsigned char *src_end = coding->source + coding->src_bytes;
5143   int *charbuf = coding->charbuf + coding->charbuf_used;
5144   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5145   ptrdiff_t consumed_chars = 0;
5146   bool multibytep = coding->src_multibyte;
5147   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5148   int source_charbuf[1024];
5149   int source_byteidx[1025];
5150   Lisp_Object attrs, charset_list;
5151
5152   CODING_GET_INFO (coding, attrs, charset_list);
5153
5154   while (1)
5155     {
5156       const unsigned char *p = src;
5157       ptrdiff_t offset;
5158       int i = 0;
5159
5160       if (multibytep)
5161         {
5162           while (i < 1024 && p < src_end)
5163             {
5164               source_byteidx[i] = p - src;
5165               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5166             }
5167           source_byteidx[i] = p - src;
5168         }
5169       else
5170         while (i < 1024 && p < src_end)
5171           source_charbuf[i++] = *p++;
5172
5173       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5174         ccl->last_block = true;
5175       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5176       charset_map_loaded = 0;
5177       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5178                   charset_list);
5179       if (charset_map_loaded
5180           && (offset = coding_change_source (coding)))
5181         {
5182           p += offset;
5183           src += offset;
5184           src_end += offset;
5185         }
5186       charbuf += ccl->produced;
5187       if (multibytep)
5188         src += source_byteidx[ccl->consumed];
5189       else
5190         src += ccl->consumed;
5191       consumed_chars += ccl->consumed;
5192       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5193         break;
5194     }
5195
5196   switch (ccl->status)
5197     {
5198     case CCL_STAT_SUSPEND_BY_SRC:
5199       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5200       break;
5201     case CCL_STAT_SUSPEND_BY_DST:
5202       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5203       break;
5204     case CCL_STAT_QUIT:
5205     case CCL_STAT_INVALID_CMD:
5206       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5207       break;
5208     default:
5209       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5210       break;
5211     }
5212   coding->consumed_char += consumed_chars;
5213   coding->consumed = src - coding->source;
5214   coding->charbuf_used = charbuf - coding->charbuf;
5215 }
5216
5217 static bool
5218 encode_coding_ccl (struct coding_system *coding)
5219 {
5220   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5221   bool multibytep = coding->dst_multibyte;
5222   int *charbuf = coding->charbuf;
5223   int *charbuf_end = charbuf + coding->charbuf_used;
5224   unsigned char *dst = coding->destination + coding->produced;
5225   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5226   int destination_charbuf[1024];
5227   ptrdiff_t produced_chars = 0;
5228   int i;
5229   Lisp_Object attrs, charset_list;
5230
5231   CODING_GET_INFO (coding, attrs, charset_list);
5232   if (coding->consumed_char == coding->src_chars
5233       && coding->mode & CODING_MODE_LAST_BLOCK)
5234     ccl->last_block = true;
5235
5236   do
5237     {
5238       ptrdiff_t offset;
5239
5240       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5241       charset_map_loaded = 0;
5242       ccl_driver (ccl, charbuf, destination_charbuf,
5243                   charbuf_end - charbuf, 1024, charset_list);
5244       if (charset_map_loaded
5245           && (offset = coding_change_destination (coding)))
5246         dst += offset;
5247       if (multibytep)
5248         {
5249           ASSURE_DESTINATION (ccl->produced * 2);
5250           for (i = 0; i < ccl->produced; i++)
5251             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5252         }
5253       else
5254         {
5255           ASSURE_DESTINATION (ccl->produced);
5256           for (i = 0; i < ccl->produced; i++)
5257             *dst++ = destination_charbuf[i] & 0xFF;
5258           produced_chars += ccl->produced;
5259         }
5260       charbuf += ccl->consumed;
5261       if (ccl->status == CCL_STAT_QUIT
5262           || ccl->status == CCL_STAT_INVALID_CMD)
5263         break;
5264     }
5265   while (charbuf < charbuf_end);
5266
5267   switch (ccl->status)
5268     {
5269     case CCL_STAT_SUSPEND_BY_SRC:
5270       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5271       break;
5272     case CCL_STAT_SUSPEND_BY_DST:
5273       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5274       break;
5275     case CCL_STAT_QUIT:
5276     case CCL_STAT_INVALID_CMD:
5277       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5278       break;
5279     default:
5280       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5281       break;
5282     }
5283
5284   coding->produced_char += produced_chars;
5285   coding->produced = dst - coding->destination;
5286   return 0;
5287 }
5288
5289 \f
5290 /*** 10, 11. no-conversion handlers ***/
5291
5292 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5293
5294 static void
5295 decode_coding_raw_text (struct coding_system *coding)
5296 {
5297   bool eol_dos
5298     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5299
5300   coding->chars_at_source = 1;
5301   coding->consumed_char = coding->src_chars;
5302   coding->consumed = coding->src_bytes;
5303   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5304     {
5305       coding->consumed_char--;
5306       coding->consumed--;
5307       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5308     }
5309   else
5310     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5311 }
5312
5313 static bool
5314 encode_coding_raw_text (struct coding_system *coding)
5315 {
5316   bool multibytep = coding->dst_multibyte;
5317   int *charbuf = coding->charbuf;
5318   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5319   unsigned char *dst = coding->destination + coding->produced;
5320   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5321   ptrdiff_t produced_chars = 0;
5322   int c;
5323
5324   if (multibytep)
5325     {
5326       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5327
5328       if (coding->src_multibyte)
5329         while (charbuf < charbuf_end)
5330           {
5331             ASSURE_DESTINATION (safe_room);
5332             c = *charbuf++;
5333             if (ASCII_CHAR_P (c))
5334               EMIT_ONE_ASCII_BYTE (c);
5335             else if (CHAR_BYTE8_P (c))
5336               {
5337                 c = CHAR_TO_BYTE8 (c);
5338                 EMIT_ONE_BYTE (c);
5339               }
5340             else
5341               {
5342                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5343
5344                 CHAR_STRING_ADVANCE (c, p1);
5345                 do
5346                   {
5347                     EMIT_ONE_BYTE (*p0);
5348                     p0++;
5349                   }
5350                 while (p0 < p1);
5351               }
5352           }
5353       else
5354         while (charbuf < charbuf_end)
5355           {
5356             ASSURE_DESTINATION (safe_room);
5357             c = *charbuf++;
5358             EMIT_ONE_BYTE (c);
5359           }
5360     }
5361   else
5362     {
5363       if (coding->src_multibyte)
5364         {
5365           int safe_room = MAX_MULTIBYTE_LENGTH;
5366
5367           while (charbuf < charbuf_end)
5368             {
5369               ASSURE_DESTINATION (safe_room);
5370               c = *charbuf++;
5371               if (ASCII_CHAR_P (c))
5372                 *dst++ = c;
5373               else if (CHAR_BYTE8_P (c))
5374                 *dst++ = CHAR_TO_BYTE8 (c);
5375               else
5376                 CHAR_STRING_ADVANCE (c, dst);
5377             }
5378         }
5379       else
5380         {
5381           ASSURE_DESTINATION (charbuf_end - charbuf);
5382           while (charbuf < charbuf_end && dst < dst_end)
5383             *dst++ = *charbuf++;
5384         }
5385       produced_chars = dst - (coding->destination + coding->produced);
5386     }
5387   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5388   coding->produced_char += produced_chars;
5389   coding->produced = dst - coding->destination;
5390   return 0;
5391 }
5392
5393 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5394    Return true if a text is encoded in a charset-based coding system.  */
5395
5396 static bool
5397 detect_coding_charset (struct coding_system *coding,
5398                        struct coding_detection_info *detect_info)
5399 {
5400   const unsigned char *src = coding->source, *src_base;
5401   const unsigned char *src_end = coding->source + coding->src_bytes;
5402   bool multibytep = coding->src_multibyte;
5403   ptrdiff_t consumed_chars = 0;
5404   Lisp_Object attrs, valids, name;
5405   int found = 0;
5406   ptrdiff_t head_ascii = coding->head_ascii;
5407   bool check_latin_extra = 0;
5408
5409   detect_info->checked |= CATEGORY_MASK_CHARSET;
5410
5411   coding = &coding_categories[coding_category_charset];
5412   attrs = CODING_ID_ATTRS (coding->id);
5413   valids = AREF (attrs, coding_attr_charset_valids);
5414   name = CODING_ID_NAME (coding->id);
5415   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5416                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5417       || strncmp (SSDATA (SYMBOL_NAME (name)),
5418                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5419     check_latin_extra = 1;
5420
5421   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5422     src += head_ascii;
5423
5424   while (1)
5425     {
5426       int c;
5427       Lisp_Object val;
5428       struct charset *charset;
5429       int dim, idx;
5430
5431       src_base = src;
5432       ONE_MORE_BYTE (c);
5433       if (c < 0)
5434         continue;
5435       val = AREF (valids, c);
5436       if (NILP (val))
5437         break;
5438       if (c >= 0x80)
5439         {
5440           if (c < 0xA0
5441               && check_latin_extra
5442               && (!VECTORP (Vlatin_extra_code_table)
5443                   || NILP (AREF (Vlatin_extra_code_table, c))))
5444             break;
5445           found = CATEGORY_MASK_CHARSET;
5446         }
5447       if (INTEGERP (val))
5448         {
5449           charset = CHARSET_FROM_ID (XFASTINT (val));
5450           dim = CHARSET_DIMENSION (charset);
5451           for (idx = 1; idx < dim; idx++)
5452             {
5453               if (src == src_end)
5454                 goto too_short;
5455               ONE_MORE_BYTE (c);
5456               if (c < charset->code_space[(dim - 1 - idx) * 4]
5457                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5458                 break;
5459             }
5460           if (idx < dim)
5461             break;
5462         }
5463       else
5464         {
5465           idx = 1;
5466           for (; CONSP (val); val = XCDR (val))
5467             {
5468               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5469               dim = CHARSET_DIMENSION (charset);
5470               while (idx < dim)
5471                 {
5472                   if (src == src_end)
5473                     goto too_short;
5474                   ONE_MORE_BYTE (c);
5475                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5476                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5477                     break;
5478                   idx++;
5479                 }
5480               if (idx == dim)
5481                 {
5482                   val = Qnil;
5483                   break;
5484                 }
5485             }
5486           if (CONSP (val))
5487             break;
5488         }
5489     }
5490  too_short:
5491   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5492   return 0;
5493
5494  no_more_source:
5495   detect_info->found |= found;
5496   return 1;
5497 }
5498
     /* Decode the source bytes of CODING, starting at offset
        coding->consumed, into character codes that are appended to
        coding->charbuf.  The first byte of each character indexes the
        `valids' table taken from the coding's attributes: an integer
        entry is a charset ID, a cons is a list of candidate charset IDs,
        and anything else marks the byte as invalid.  An invalid byte is
        stored as a raw byte (see `invalid_code' below).  On exit,
        coding->consumed, coding->consumed_char and coding->charbuf_used
        are updated.  */
5499 static void
5500 decode_coding_charset (struct coding_system *coding)
5501 {
5502   const unsigned char *src = coding->source + coding->consumed;
5503   const unsigned char *src_end = coding->source + coding->src_bytes;
5504   const unsigned char *src_base;
5505   int *charbuf = coding->charbuf + coding->charbuf_used;
5506   /* We may produce one charset annotation in one loop and one more at
5507      the end.  */
5508   int *charbuf_end
5509     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
       /* NOTE(review): consumed_chars and multibytep are never updated or
          read directly below; presumably the ONE_MORE_BYTE macro (defined
          outside this view) uses them by name -- confirm.  */
5510   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5511   bool multibytep = coding->src_multibyte;
5512   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5513   Lisp_Object valids;
       /* CHAR_OFFSET counts the characters produced so far; LAST_OFFSET
          and LAST_ID remember where the current run of characters of one
          non-ASCII charset started and which charset it is.  */
5514   ptrdiff_t char_offset = coding->produced_char;
5515   ptrdiff_t last_offset = char_offset;
5516   int last_id = charset_ascii;
5517   bool eol_dos
5518     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a CR, or -1 if there is none.  */
5519   int byte_after_cr = -1;
5520
5521   valids = AREF (attrs, coding_attr_charset_valids);
5522
5523   while (1)
5524     {
5525       int c;
5526       Lisp_Object val;
5527       struct charset *charset;
5528       int dim;
5529       int len = 1;
5530       unsigned code;
5531
           /* Remember where this character starts, so that the position
              can be restored on invalid input and reported on exit.  */
5532       src_base = src;
5533       consumed_chars_base = consumed_chars;
5534
           /* Stop when the output buffer (less the room reserved for
              annotations) is full.  A byte that was read ahead after a CR
              has not been decoded yet, so step SRC_BASE back over it.  */
5535       if (charbuf >= charbuf_end)
5536         {
5537           if (byte_after_cr >= 0)
5538             src_base--;
5539           break;
5540         }
5541
           /* Take the byte read ahead in the previous iteration, if any;
              otherwise read a new one and, with DOS end-of-line, also
              read ahead the byte that follows a CR.  */
5542       if (byte_after_cr >= 0)
5543         {
5544           c = byte_after_cr;
5545           byte_after_cr = -1;
5546         }
5547       else
5548         {
5549           ONE_MORE_BYTE (c);
5550           if (eol_dos && c == '\r')
5551             ONE_MORE_BYTE (byte_after_cr);
5552         }
           /* NOTE(review): a negative C is not a plain byte; presumably
              ONE_MORE_BYTE yields it for a multibyte source -- confirm.  */
5553       if (c < 0)
5554         goto invalid_code;
5555       code = c;
5556
5557       val = AREF (valids, c);
5558       if (! INTEGERP (val) && ! CONSP (val))
5559         goto invalid_code;
5560       if (INTEGERP (val))
5561         {
5562           charset = CHARSET_FROM_ID (XFASTINT (val));
5563           dim = CHARSET_DIMENSION (charset);
               /* Gather the remaining bytes of a DIM-byte code, the most
                  significant byte first.  */
5564           while (len < dim)
5565             {
5566               ONE_MORE_BYTE (c);
5567               code = (code << 8) | c;
5568               len++;
5569             }
               /* NOTE(review): presumably stores the decoded character in
                  C, negative on failure (see the tests of C below).  */
5570           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5571                               charset, code, c);
5572         }
5573       else
5574         {
5575           /* VAL is a list of charset IDs.  It is assured that the
5576              list is sorted by charset dimensions (smaller one
5577              comes first).  */
5578           while (CONSP (val))
5579             {
5580               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5581               dim = CHARSET_DIMENSION (charset);
5582               while (len < dim)
5583                 {
5584                   ONE_MORE_BYTE (c);
5585                   code = (code << 8) | c;
5586                   len++;
5587                 }
5588               CODING_DECODE_CHAR (coding, src, src_base,
5589                                   src_end, charset, code, c);
                   /* The first charset that yields a character wins.  */
5590               if (c >= 0)
5591                 break;
5592               val = XCDR (val);
5593             }
5594         }
5595       if (c < 0)
5596         goto invalid_code;
           /* On a switch to another non-ASCII charset, record an
              annotation for the run decoded with the previous non-ASCII
              charset (if any) and start a new run here.  */
5597       if (charset->id != charset_ascii
5598           && last_id != charset->id)
5599         {
5600           if (last_id != charset_ascii)
5601             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5602           last_id = charset->id;
5603           last_offset = char_offset;
5604         }
5605
5606       *charbuf++ = c;
5607       char_offset++;
5608       continue;
5609
5610     invalid_code:
           /* Go back to the start of this character and consume only its
              first byte: a negative value is stored negated, an ASCII
              byte as is, and any other byte via BYTE8_TO_CHAR.  */
5611       src = src_base;
5612       consumed_chars = consumed_chars_base;
5613       ONE_MORE_BYTE (c);
5614       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5615       char_offset++;
5616     }
5617
       /* NOTE(review): no `goto' in this function names this label;
          presumably ONE_MORE_BYTE jumps here when the source is
          exhausted -- confirm.  The loop's `break' also ends up here.  */
5618  no_more_source:
5619   if (last_id != charset_ascii)
5620     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report progress only up to the start of the last character that
          was begun.  */
5621   coding->consumed_char += consumed_chars_base;
5622   coding->consumed = src_base - coding->source;
5623   coding->charbuf_used = charbuf - coding->charbuf;
5624 }
5625
     /* Encode the character codes held in coding->charbuf into bytes
        written at coding->destination, starting at offset
        coding->produced.  A character is emitted as is when it is ASCII
        and the coding is ASCII compatible, as a raw byte when it is an
        eight-bit character, and otherwise as the code point of a charset
        from the coding's charset list.  A character with no such charset
        is replaced by a single substitution byte.  Update
        coding->produced_char and coding->produced, record
        CODING_RESULT_SUCCESS, and return 0.  */
5626 static bool
5627 encode_coding_charset (struct coding_system *coding)
5628 {
       /* NOTE(review): multibytep is not referenced directly below;
          presumably the EMIT_* macros (defined outside this view) use it,
          along with DST and PRODUCED_CHARS, by name -- confirm.  */
5629   bool multibytep = coding->dst_multibyte;
5630   int *charbuf = coding->charbuf;
5631   int *charbuf_end = charbuf + coding->charbuf_used;
5632   unsigned char *dst = coding->destination + coding->produced;
5633   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
5634   int safe_room = MAX_MULTIBYTE_LENGTH;
5635   ptrdiff_t produced_chars = 0;
5636   Lisp_Object attrs, charset_list;
5637   bool ascii_compatible;
5638   int c;
5639
5640   CODING_GET_INFO (coding, attrs, charset_list);
5641   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5642
5643   while (charbuf < charbuf_end)
5644     {
5645       struct charset *charset;
5646       unsigned code;
5647
5648       ASSURE_DESTINATION (safe_room);
5649       c = *charbuf++;
5650       if (ascii_compatible && ASCII_CHAR_P (c))
5651         EMIT_ONE_ASCII_BYTE (c);
5652       else if (CHAR_BYTE8_P (c))
5653         {
               /* An eight-bit character goes out as its raw byte.  */
5654           c = CHAR_TO_BYTE8 (c);
5655           EMIT_ONE_BYTE (c);
5656         }
5657       else
5658         {
               /* NOTE(review): presumably sets CHARSET to a charset of
                  CHARSET_LIST that contains C (null if there is none) and
                  CODE to the code point of C in it -- confirm.  */
5659           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5660                                &code, charset);
5661
5662           if (charset)
5663             {
                   /* Emit CODE in as many bytes as the charset has
                      dimensions, the most significant byte first.  */
5664               if (CHARSET_DIMENSION (charset) == 1)
5665                 EMIT_ONE_BYTE (code);
5666               else if (CHARSET_DIMENSION (charset) == 2)
5667                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5668               else if (CHARSET_DIMENSION (charset) == 3)
5669                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5670               else
5671                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5672                                  (code >> 8) & 0xFF, code & 0xFF);
5673             }
5674           else
5675             {
                   /* C cannot be encoded: emit one substitution byte, the
                      choice of which depends on the safe-encoding mode.  */
5676               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5677                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5678               else
5679                 c = coding->default_char;
5680               EMIT_ONE_BYTE (c);
5681             }
5682         }
5683     }
5684
5685   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5686   coding->produced_char += produced_chars;
5687   coding->produced = dst - coding->destination;
5688   return 0;
5689 }
5690
5691 \f
5692 /*** 10. C library functions ***/
5693
5694 /* Setup coding context CODING from information about CODING_SYSTEM.
5695    If CODING_SYSTEM is nil, `undecided' is assumed.  If
5696    CODING_SYSTEM is invalid, signal an error.

        This sets coding->id, mode, common_flags, max_charset_id,
        safe_charsets, default_char, carryover_bytes and raw_destination
        and then, depending on the type of the coding system, the
        detector, decoder and encoder functions together with the
        type-specific state.  */
5697
5698 void
5699 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5700 {
5701   Lisp_Object attrs;
5702   Lisp_Object eol_type;
5703   Lisp_Object coding_type;
5704   Lisp_Object val;
5705
5706   if (NILP (coding_system))
5707     coding_system = Qundecided;
5708
5709   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5710
5711   attrs = CODING_ID_ATTRS (coding->id);
5712   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5713
       /* The initial common flags depend on the eol type alone: a vector
          asks for decoding and detection, any type other than unix for
          decoding and encoding, and unix for nothing.
          NOTE(review): a vector presumably means that the eol format is
          not decided yet -- confirm.  */
5714   coding->mode = 0;
5715   if (VECTORP (eol_type))
5716     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5717                             | CODING_REQUIRE_DETECTION_MASK);
5718   else if (! EQ (eol_type, Qunix))
5719     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5720                             | CODING_REQUIRE_ENCODING_MASK);
5721   else
5722     coding->common_flags = 0;
       /* A post-read attribute forces decoding, a pre-write attribute
          forces encoding.  */
5723   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5724     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5725   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5726     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5727   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5728     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5729
       /* The safe-charsets attribute is a string; its length minus one
          is taken as the maximum charset ID.  */
5730   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5731   coding->max_charset_id = SCHARS (val) - 1;
5732   coding->safe_charsets = SDATA (val);
5733   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5734   coding->carryover_bytes = 0;
5735   coding->raw_destination = 0;
5736
       /* Install the handlers and the state specific to the type.  */
5737   coding_type = CODING_ATTR_TYPE (attrs);
5738   if (EQ (coding_type, Qundecided))
5739     {
           /* No detector is installed; the raw-text decoder and encoder
              are used, and detection is marked as required.  */
5740       coding->detector = NULL;
5741       coding->decoder = decode_coding_raw_text;
5742       coding->encoder = encode_coding_raw_text;
5743       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5744       coding->spec.undecided.inhibit_nbd
5745         = (encode_inhibit_flag
5746            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5747       coding->spec.undecided.inhibit_ied
5748         = (encode_inhibit_flag
5749            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5750       coding->spec.undecided.prefer_utf_8
5751         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5752     }
5753   else if (EQ (coding_type, Qiso_2022))
5754     {
5755       int i;
5756       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5757
5758       /* Invoke graphic register 0 to plane 0.  */
5759       CODING_ISO_INVOCATION (coding, 0) = 0;
5760       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5761       CODING_ISO_INVOCATION (coding, 1)
5762         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5763       /* Setup the initial status of designation.  */
5764       for (i = 0; i < 4; i++)
5765         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5766       /* Not single shifting initially.  */
5767       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5768       /* Beginning of buffer should also be regarded as bol. */
5769       CODING_ISO_BOL (coding) = 1;
5770       coding->detector = detect_coding_iso_2022;
5771       coding->decoder = decode_coding_iso_2022;
5772       coding->encoder = encode_coding_iso_2022;
5773       if (flags & CODING_ISO_FLAG_SAFE)
5774         coding->mode |= CODING_MODE_SAFE_ENCODING;
5775       coding->common_flags
5776         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5777             | CODING_REQUIRE_FLUSHING_MASK);
5778       if (flags & CODING_ISO_FLAG_COMPOSITION)
5779         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5780       if (flags & CODING_ISO_FLAG_DESIGNATION)
5781         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5782       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5783         {
               /* NOTE(review): setup_iso_safe_charsets presumably
                  refreshes the safe-charsets attribute, which is why the
                  values cached above are loaded again -- confirm.  */
5784           setup_iso_safe_charsets (attrs);
5785           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5786           coding->max_charset_id = SCHARS (val) - 1;
5787           coding->safe_charsets = SDATA (val);
5788         }
5789       CODING_ISO_FLAGS (coding) = flags;
           /* Start with no composition in progress, no extended segment
              length, and no embedded UTF-8.  */
5790       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5791       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5792       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5793       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5794     }
5795   else if (EQ (coding_type, Qcharset))
5796     {
5797       coding->detector = detect_coding_charset;
5798       coding->decoder = decode_coding_charset;
5799       coding->encoder = encode_coding_charset;
5800       coding->common_flags
5801         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5802     }
5803   else if (EQ (coding_type, Qutf_8))
5804     {
           /* The BOM attribute is a cons when the BOM is to be detected,
              t when there is a BOM, and anything else when there is
              none.  Detecting the BOM requires detection.  */
5805       val = AREF (attrs, coding_attr_utf_bom);
5806       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5807                                    : EQ (val, Qt) ? utf_with_bom
5808                                    : utf_without_bom);
5809       coding->detector = detect_coding_utf_8;
5810       coding->decoder = decode_coding_utf_8;
5811       coding->encoder = encode_coding_utf_8;
5812       coding->common_flags
5813         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5814       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5815         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5816     }
5817   else if (EQ (coding_type, Qutf_16))
5818     {
           /* The BOM attribute is handled as for UTF-8.  The endianness
              is big endian only for the attribute value `big', and the
              surrogate state starts at 0.  */
5819       val = AREF (attrs, coding_attr_utf_bom);
5820       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5821                                     : EQ (val, Qt) ? utf_with_bom
5822                                     : utf_without_bom);
5823       val = AREF (attrs, coding_attr_utf_16_endian);
5824       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5825                                        : utf_16_little_endian);
5826       CODING_UTF_16_SURROGATE (coding) = 0;
5827       coding->detector = detect_coding_utf_16;
5828       coding->decoder = decode_coding_utf_16;
5829       coding->encoder = encode_coding_utf_16;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5833         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5834     }
5835   else if (EQ (coding_type, Qccl))
5836     {
5837       coding->detector = detect_coding_ccl;
5838       coding->decoder = decode_coding_ccl;
5839       coding->encoder = encode_coding_ccl;
5840       coding->common_flags
5841         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5842             | CODING_REQUIRE_FLUSHING_MASK);
5843     }
5844   else if (EQ (coding_type, Qemacs_mule))
5845     {
5846       coding->detector = detect_coding_emacs_mule;
5847       coding->decoder = decode_coding_emacs_mule;
5848       coding->encoder = encode_coding_emacs_mule;
5849       coding->common_flags
5850         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5851       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5852           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5853         {
               /* Build a fresh safe-charsets string that is indexed by
                  charset ID: every byte is 255 except those at the IDs
                  found in Vemacs_mule_charset_list, which are 0.  */
5854           Lisp_Object tail, safe_charsets;
5855           int max_charset_id = 0;
5856
5857           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5858                tail = XCDR (tail))
5859             if (max_charset_id < XFASTINT (XCAR (tail)))
5860               max_charset_id = XFASTINT (XCAR (tail));
5861           safe_charsets = make_uninit_string (max_charset_id + 1);
5862           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5863           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5864                tail = XCDR (tail))
5865             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5866           coding->max_charset_id = max_charset_id;
5867           coding->safe_charsets = SDATA (safe_charsets);
5868         }
5869       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5870       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5871     }
5872   else if (EQ (coding_type, Qshift_jis))
5873     {
5874       coding->detector = detect_coding_sjis;
5875       coding->decoder = decode_coding_sjis;
5876       coding->encoder = encode_coding_sjis;
5877       coding->common_flags
5878         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5879     }
5880   else if (EQ (coding_type, Qbig5))
5881     {
5882       coding->detector = detect_coding_big5;
5883       coding->decoder = decode_coding_big5;
5884       coding->encoder = encode_coding_big5;
5885       coding->common_flags
5886         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5887     }
5888   else                          /* EQ (coding_type, Qraw_text) */
5889     {
           /* Raw text needs conversion only for a non-unix eol type:
              decoding always, encoding only when the eol type is not a
              vector.  */
5890       coding->detector = NULL;
5891       coding->decoder = decode_coding_raw_text;
5892       coding->encoder = encode_coding_raw_text;
5893       if (! EQ (eol_type, Qunix))
5894         {
5895           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5896           if (! VECTORP (eol_type))
5897             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5898         }
5899
5900     }
5901
5902   return;
5903 }
5904
5905 /* Give the list of charsets that CODING is able to handle.  */
5906
5907 Lisp_Object
5908 coding_charset_list (struct coding_system *coding)
5909 {
5910   Lisp_Object attrs, charset_list, type;
5911
5912   CODING_GET_INFO (coding, attrs, charset_list);
5913   type = CODING_ATTR_TYPE (attrs);
5914   /* emacs-mule codings always report the global emacs-mule list.  */
5915   if (EQ (type, Qemacs_mule))
5916     return Vemacs_mule_charset_list;
5917   if (EQ (type, Qiso_2022))
5918     {
5919       int iso_flags = XINT (AREF (attrs, coding_attr_iso_flags));
5920       /* Full-support ISO-2022 codings report the global list too.  */
5921       if (iso_flags & CODING_ISO_FLAG_FULL_SUPPORT)
5922         return Viso_2022_charset_list;
5923     }
5924   return charset_list;
5925 }
5926
5927
5928 /* Give the list of charsets that CODING-SYSTEM is able to handle.  */
5929
5930 Lisp_Object
5931 coding_system_charset_list (Lisp_Object coding_system)
5932 {
5933   ptrdiff_t id;
5934   Lisp_Object attrs, type;
5935
5936   /* Resolve CODING_SYSTEM to its ID, then to its attribute vector.  */
5937   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5938   attrs = CODING_ID_ATTRS (id);
5939   type = CODING_ATTR_TYPE (attrs);
5940
5941   /* emacs-mule codings always report the global emacs-mule list.  */
5942   if (EQ (type, Qemacs_mule))
5943     return Vemacs_mule_charset_list;
5944
5945   if (EQ (type, Qiso_2022))
5946     {
5947       int iso_flags = XINT (AREF (attrs, coding_attr_iso_flags));
5948
5949       /* Full-support ISO-2022 codings report the global list too.  */
5950       if (iso_flags & CODING_ISO_FLAG_FULL_SUPPORT)
5951         return Viso_2022_charset_list;
5952     }
5953
5954   /* Every other coding reports the charset list stored in its
5955      attributes.  */
5956   return CODING_ATTR_CHARSET_LIST (attrs);
5957 }
5958
5959
5960 /* Map CODING-SYSTEM to raw-text itself or to the raw-text subsidiary
5961    whose end-of-line type is the same as that of CODING-SYSTEM.  */
5962
5963 Lisp_Object
5964 raw_text_coding_system (Lisp_Object coding_system)
5965 {
5966   Lisp_Object spec, attrs, eol_type, subsidiaries;
5967   int slot;
5968
5969   if (NILP (coding_system))
5970     return Qraw_text;
5971
5972   spec = CODING_SYSTEM_SPEC (coding_system);
5973   attrs = AREF (spec, 0);
5974   /* A coding system that is already of type raw-text is kept as is.  */
5975   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5976     return coding_system;
5977
5978   eol_type = AREF (spec, 2);
5979   if (VECTORP (eol_type))
5980     return Qraw_text;
5981   /* Slot 0 is for unix, slot 1 for dos, slot 2 for anything else.  */
5982   subsidiaries = AREF (CODING_SYSTEM_SPEC (Qraw_text), 2);
5983   slot = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
5984   return AREF (subsidiaries, slot);
5985 }
5986
5987 /* Tell whether CODING uses both the raw-text decoder and encoder.  */
5988
5989 bool
5990 raw_text_coding_system_p (struct coding_system *coding)
5991 {
5992   bool raw_decoder = coding->decoder == decode_coding_raw_text;
5993   return raw_decoder && coding->encoder == encode_coding_raw_text;
5994 }
5995
5996
5997 /* Resolve the end-of-line format of CODING_SYSTEM.  If it already
5998    specifies one, return it unchanged.  Otherwise return its
5999    subsidiary whose format is the eol-spec of PARENT or, when PARENT
6000    is nil or specifies no format either, system_eol_type.  */
6001
6002 Lisp_Object
6003 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6004 {
6005   Lisp_Object subsidiaries, wanted_eol;
6006
6007   if (NILP (coding_system))
6008     coding_system = Qraw_text;
6009   else
6010     CHECK_CODING_SYSTEM (coding_system);
6011
6012   subsidiaries = AREF (CODING_SYSTEM_SPEC (coding_system), 2);
6013   /* Anything but a vector means the format is already decided.  */
6014   if (! VECTORP (subsidiaries))
6015     return coding_system;
6016
6017   if (NILP (parent))
6018     wanted_eol = system_eol_type;
6019   else
6020     {
6021       CHECK_CODING_SYSTEM (parent);
6022       wanted_eol = AREF (CODING_SYSTEM_SPEC (parent), 2);
6023       /* PARENT cannot help if its own format is undecided.  */
6024       if (VECTORP (wanted_eol))
6025         wanted_eol = system_eol_type;
6026     }
6027
6028   /* Slots 0, 1 and 2 hold the unix, dos and mac subsidiaries.  */
6029   if (EQ (wanted_eol, Qunix))
6030     return AREF (subsidiaries, 0);
6031   if (EQ (wanted_eol, Qdos))
6032     return AREF (subsidiaries, 1);
6033   if (EQ (wanted_eol, Qmac))
6034     return AREF (subsidiaries, 2);
6035
6036   return coding_system;
6037 }
6038
6039
6040 /* Make sure that both the text conversion and the eol conversion of
6041    CODING_SYSTEM are decided for writing to a process.  Complement
6042    whatever is undecided and return the resulting coding system.  */
6043
6044 Lisp_Object
6045 complement_process_encoding_system (Lisp_Object coding_system)
6046 {
6047   Lisp_Object conv_base = Qnil, eol_base = Qnil, spec;
6048   int attempt = 0;
6049
6050   /* Attempt 0 examines CODING_SYSTEM as given; 1 and 2 fall back to
6051      Vdefault_process_coding_system and the preferred coding system.  */
6052   while (attempt < 3)
6053     {
6054       if (attempt == 1)
6055         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6056       else if (attempt == 2)
6057         coding_system = preferred_coding_system ();
6058       spec = CODING_SYSTEM_SPEC (coding_system);
6059       if (! NILP (spec))
6060         {
6061           Lisp_Object attrs = AREF (spec, 0);
6062
6063           if (NILP (conv_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6064             conv_base = CODING_ATTR_BASE_NAME (attrs);
6065           if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6066             eol_base = coding_system;
6067           if (! NILP (conv_base) && ! NILP (eol_base))
6068             break;
6069         }
6070       attempt++;
6071     }
6072   /* A nonzero ATTEMPT means CODING_SYSTEM left something undecided.  */
6073   return (attempt > 0 ? coding_inherit_eol_type (conv_base, eol_base)
6074           : coding_system);
6075 }
6076
6077
6078 /* Emacs has a mechanism to automatically detect a coding system if it
6079    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6080    it's impossible to distinguish some coding systems accurately
6081    because they use the same range of codes.  So, at first, coding
6082    systems are grouped into the following categories:
6083
6084    o coding-category-emacs-mule
6085
6086         The category for a coding system which has the same code range
6087         as Emacs' internal format.  Assigned the coding-system (Lisp
6088         symbol) `emacs-mule' by default.
6089
6090    o coding-category-sjis
6091
6092         The category for a coding system which has the same code range
6093         as SJIS.  Assigned the coding-system (Lisp
6094         symbol) `japanese-shift-jis' by default.
6095
6096    o coding-category-iso-7
6097
6098         The category for a coding system which has the same code range
6099         as ISO2022 of 7-bit environment.  This doesn't use any locking
6100         shift and single shift functions.  This can encode/decode all
6101         charsets.  Assigned the coding-system (Lisp symbol)
6102         `iso-2022-7bit' by default.
6103
6104    o coding-category-iso-7-tight
6105
6106         Same as coding-category-iso-7 except that this can
6107         encode/decode only the specified charsets.
6108
6109    o coding-category-iso-8-1
6110
6111         The category for a coding system which has the same code range
6112         as ISO2022 of 8-bit environment and graphic plane 1 used only
6113         for DIMENSION1 charset.  This doesn't use any locking shift
6114         and single shift functions.  Assigned the coding-system (Lisp
6115         symbol) `iso-latin-1' by default.
6116
6117    o coding-category-iso-8-2
6118
6119         The category for a coding system which has the same code range
6120         as ISO2022 of 8-bit environment and graphic plane 1 used only
6121         for DIMENSION2 charset.  This doesn't use any locking shift
6122         and single shift functions.  Assigned the coding-system (Lisp
6123         symbol) `japanese-iso-8bit' by default.
6124
6125    o coding-category-iso-7-else
6126
6127         The category for a coding system which has the same code range
6128         as ISO2022 of 7-bit environment but uses locking shift or
6129         single shift functions.  Assigned the coding-system (Lisp
6130         symbol) `iso-2022-7bit-lock' by default.
6131
6132    o coding-category-iso-8-else
6133
6134         The category for a coding system which has the same code range
6135         as ISO2022 of 8-bit environment but uses locking shift or
6136         single shift functions.  Assigned the coding-system (Lisp
6137         symbol) `iso-2022-8bit-ss2' by default.
6138
6139    o coding-category-big5
6140
6141         The category for a coding system which has the same code range
6142         as BIG5.  Assigned the coding-system (Lisp symbol)
6143         `cn-big5' by default.
6144
6145    o coding-category-utf-8
6146
6147         The category for a coding system which has the same code range
6148         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6149         symbol) `utf-8' by default.
6150
6151    o coding-category-utf-16-be
6152
6153         The category for a coding system in which a text has an
6154         Unicode signature (cf. Unicode Standard) in the order of BIG
6155         endian at the head.  Assigned the coding-system (Lisp symbol)
6156         `utf-16-be' by default.
6157
6158    o coding-category-utf-16-le
6159
6160         The category for a coding system in which a text has an
6161         Unicode signature (cf. Unicode Standard) in the order of
6162         LITTLE endian at the head.  Assigned the coding-system (Lisp
6163         symbol) `utf-16-le' by default.
6164
6165    o coding-category-ccl
6166
6167         The category for a coding system of which encoder/decoder is
6168         written in CCL programs.  The default value is nil, i.e., no
6169         coding system is assigned.
6170
6171    o coding-category-binary
6172
6173         The category for a coding system not categorized in any of the
6174         above.  Assigned the coding-system (Lisp symbol)
6175         `no-conversion' by default.
6176
6177    Each of them is a Lisp symbol whose value is an actual
6178    `coding-system' (also a Lisp symbol) assigned by a user.  What
6179    Emacs actually does is to detect a category of coding system.
6180    Then, it uses the `coding-system' assigned to that category.  If
6181    Emacs can't narrow the result down to one category, it selects the
6182    category of the highest priority.  Priorities of categories are
6183    also specified by a user in a Lisp variable `coding-category-list'.
6184
6185 */
6186
6187 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6188                                            int eol_seen);
6189
6190
6191 /* Scan the leading run of ASCII bytes in the source of CODING and
6192    return its length.  By side effects, store that length in
6193    coding->head_ascii and update coding->eol_seen, a "logical or" of
6194    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF that is reliable only
6195    when all the source bytes are ASCII.  */
6196
6197 static ptrdiff_t
6198 check_ascii (struct coding_system *coding)
6199 {
6200   const unsigned char *p, *limit;
6201   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6202   int seen = coding->eol_seen;
6203
6204   coding_set_source (coding);
6205   p = coding->source;
6206   limit = p + coding->src_bytes;
6207
6208   if (! inhibit_eol_conversion && ! SYMBOLP (eol_type))
6209     {
6210       /* The EOL format has to be checked, so CR, LF and CR LF are told
6211          apart.  Stop one byte early to allow peeking after a CR.  */
6212       const unsigned char *last = limit - 1;
6213
6214       while (p < last && *p < 0x80)
6215         {
6216           int c = *p++;
6217
6218           if (c == '\n')
6219             seen |= EOL_SEEN_LF;
6220           else if (c == '\r')
6221             {
6222               if (*p != '\n')
6223                 seen |= EOL_SEEN_CR;
6224               else
6225                 {
6226                   seen |= EOL_SEEN_CRLF;
6227                   p++;
6228                 }
6229             }
6230         }
6231
6232       /* All bytes before the last one were ASCII; look at the last
6233          one by itself, as nothing can follow it.  */
6234       if (p == last && *p < 0x80)
6235         {
6236           switch (*p++)
6237             {
6238             case '\r':
6239               seen |= EOL_SEEN_CR;
6240               break;
6241             case '\n':
6242               seen |= EOL_SEEN_LF;
6243               break;
6244             default:
6245               break;
6246             }
6247         }
6248     }
6249   else
6250     {
6251       /* The EOL format needs no checking; only LF is recorded.  */
6252       for (; p < limit && *p < 0x80; p++)
6253         if (*p == '\n')
6254           seen |= EOL_SEEN_LF;
6255     }
6256   coding->head_ascii = p - coding->source;
6257   coding->eol_seen = seen;
6258   return coding->head_ascii;
6259 }
6260
6261
6262 /* Return the number of characters at the source if all the bytes are
6263    valid UTF-8 (of Unicode range); a CR LF pair counts as two.
6264    Otherwise, return -1.  If coding->head_ascii is negative, it is
6265    computed first by check_ascii.  By side effects, update
6266    coding->eol_seen, a "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and
6267    EOL_SEEN_CRLF that is reliable only for valid UTF-8 sources.  */
6268
6269 static ptrdiff_t
6270 check_utf_8 (struct coding_system *coding)
6271 {
6272   const unsigned char *src, *end;
6273
6274   if (coding->head_ascii < 0)
6275     check_ascii (coding);
6276   else
6277     coding_set_source (coding);
6278   /* Read head_ascii only now: check_ascii may just have computed it.  */
6279   ptrdiff_t nchars = coding->head_ascii;
6280   src = coding->source + nchars;
6281   /* END is the last byte, not one past it: we look ahead for CR LF.  */
6282   end = coding->source + coding->src_bytes - 1;
6283   int eol_seen = coding->eol_seen;
6284   while (src < end)
6285     {
6286       int c = *src;
6287
6288       if (UTF_8_1_OCTET_P (*src))
6289         {
6290           src++;
6291           if (c < 0x20)
6292             {
6293               if (c == '\r')
6294                 {
6295                   if (*src == '\n')
6296                     {
6297                       eol_seen |= EOL_SEEN_CRLF;
6298                       src++;
6299                       nchars++;
6300                     }
6301                   else
6302                     eol_seen |= EOL_SEEN_CR;
6303                 }
6304               else if (c == '\n')
6305                 eol_seen |= EOL_SEEN_LF;
6306             }
6307         }
6308       else if (UTF_8_2_OCTET_LEADING_P (c))
6309         {
6310           if (c < 0xC2          /* overlong sequence */
6311               || src + 1 > end  /* truncated sequence */
6312               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6313             return -1;
6314           src += 2;
6315         }
6316       else if (UTF_8_3_OCTET_LEADING_P (c))
6317         {
6318           if (src + 2 > end     /* truncated sequence */
6319               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6320                     && UTF_8_EXTRA_OCTET_P (src[2])))
6321             return -1;
6322           c = (((c & 0xF) << 12)
6323                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6324           if (c < 0x800                       /* overlong sequence */
6325               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6326             return -1;
6327           src += 3;
6328         }
6329       else if (UTF_8_4_OCTET_LEADING_P (c))
6330         {
6331           if (src + 3 > end     /* truncated sequence */
6332               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6333                     && UTF_8_EXTRA_OCTET_P (src[2])
6334                     && UTF_8_EXTRA_OCTET_P (src[3])))
6335             return -1;
6336           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6337                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6338           if (c < 0x10000       /* overlong sequence */
6339               || c >= 0x110000) /* non-Unicode character  */
6340             return -1;
6341           src += 4;
6342         }
6343       else
6344         return -1;
6345       nchars++;
6346     }
6347
6348   if (src == end)
6349     {
6350       if (! UTF_8_1_OCTET_P (*src))  /* a lone last byte is no sequence */
6351         return -1;
6352       nchars++;
6353       if (*src == '\r')
6354         eol_seen |= EOL_SEEN_CR;
6355       else if (*src  == '\n')
6356         eol_seen |= EOL_SEEN_LF;
6357     }
6358   coding->eol_seen = eol_seen;
6359   return nchars;
6360 }
6361
6362
6363 /* Return whether STRING is a valid UTF-8 string.  STRING must be a
6364    unibyte string.  */
6365
6366 bool
6367 utf8_string_p (Lisp_Object string)
6368 {
6369   eassert (!STRING_MULTIBYTE (string));
6370   struct coding_system coding;
6371   setup_coding_system (Qutf_8_unix, &coding);
6372   /* We initialize only the fields that check_utf_8 accesses.  */
6373   coding.head_ascii = -1;
6374   coding.src_pos = 0;
6375   coding.src_pos_byte = 0;
6376   coding.src_chars = SCHARS (string);
6377   coding.src_bytes = SBYTES (string);
6378   coding.src_object = string;
6379   coding.eol_seen = EOL_SEEN_NONE;
6380   return check_utf_8 (&coding) != -1;
6381 }
6382
6383
6384 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6385    SOURCE is encoded.  If CATEGORY is one of
6386    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6387    two-byte, else they are encoded by one-byte.
6388
6389    Return one of EOL_SEEN_XXX.  */
6390
6391 #define MAX_EOL_CHECK_COUNT 3
6392
6393 static int
6394 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6395             enum coding_category category)
6396 {
6397   const unsigned char *src = source, *src_end = src + src_bytes;
6398   unsigned char c;
6399   int total  = 0;
6400   int eol_seen = EOL_SEEN_NONE;
6401
6402   if ((1 << category) & CATEGORY_MASK_UTF_16)
6403     {
6404       bool msb = category == (coding_category_utf_16_le
6405                               | coding_category_utf_16_le_nosig);
6406       bool lsb = !msb;
6407
6408       while (src + 1 < src_end)
6409         {
6410           c = src[lsb];
6411           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6412             {
6413               int this_eol;
6414
6415               if (c == '\n')
6416                 this_eol = EOL_SEEN_LF;
6417               else if (src + 3 >= src_end
6418                        || src[msb + 2] != 0
6419                        || src[lsb + 2] != '\n')
6420                 this_eol = EOL_SEEN_CR;
6421               else
6422                 {
6423                   this_eol = EOL_SEEN_CRLF;
6424                   src += 2;
6425                 }
6426
6427               if (eol_seen == EOL_SEEN_NONE)
6428                 /* This is the first end-of-line.  */
6429                 eol_seen = this_eol;
6430               else if (eol_seen != this_eol)
6431                 {
6432                   /* The found type is different from what found before.
6433                      Allow for stray ^M characters in DOS EOL files.  */
6434                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6435                       || (eol_seen == EOL_SEEN_CRLF
6436                           && this_eol == EOL_SEEN_CR))
6437                     eol_seen = EOL_SEEN_CRLF;
6438                   else
6439                     {
6440                       eol_seen = EOL_SEEN_LF;
6441                       break;
6442                     }
6443                 }
6444               if (++total == MAX_EOL_CHECK_COUNT)
6445                 break;
6446             }
6447           src += 2;
6448         }
6449     }
6450   else
6451     while (src < src_end)
6452       {
6453         c = *src++;
6454         if (c == '\n' || c == '\r')
6455           {
6456             int this_eol;
6457
6458             if (c == '\n')
6459               this_eol = EOL_SEEN_LF;
6460             else if (src >= src_end || *src != '\n')
6461               this_eol = EOL_SEEN_CR;
6462             else
6463               this_eol = EOL_SEEN_CRLF, src++;
6464
6465             if (eol_seen == EOL_SEEN_NONE)
6466               /* This is the first end-of-line.  */
6467               eol_seen = this_eol;
6468             else if (eol_seen != this_eol)
6469               {
6470                 /* The found type is different from what found before.
6471                    Allow for stray ^M characters in DOS EOL files.  */
6472                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6473                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6474                   eol_seen = EOL_SEEN_CRLF;
6475                 else
6476                   {
6477                     eol_seen = EOL_SEEN_LF;
6478                     break;
6479                   }
6480               }
6481             if (++total == MAX_EOL_CHECK_COUNT)
6482               break;
6483           }
6484       }
6485   return eol_seen;
6486 }
6487
6488
6489 static Lisp_Object
6490 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6491 {
6492   Lisp_Object eol_type;
6493
6494   eol_type = CODING_ID_EOL_TYPE (coding->id);
6495   if (! VECTORP (eol_type))
6496     /* Already adjusted.  */
6497     return eol_type;
6498   if (eol_seen & EOL_SEEN_LF)
6499     {
6500       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6501       eol_type = Qunix;
6502     }
6503   else if (eol_seen & EOL_SEEN_CRLF)
6504     {
6505       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6506       eol_type = Qdos;
6507     }
6508   else if (eol_seen & EOL_SEEN_CR)
6509     {
6510       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6511       eol_type = Qmac;
6512     }
6513   return eol_type;
6514 }
6515
6516 /* Detect how a text specified in CODING is encoded.  If a coding
6517    system is detected, update fields of CODING by the detected coding
6518    system.  */
6519
6520 static void
6521 detect_coding (struct coding_system *coding)
6522 {
6523   const unsigned char *src, *src_end;
6524   unsigned int saved_mode = coding->mode;
6525   Lisp_Object found = Qnil;
6526   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6527
6528   coding->consumed = coding->consumed_char = 0;
6529   coding->produced = coding->produced_char = 0;
6530   coding_set_source (coding);
6531
6532   src_end = coding->source + coding->src_bytes;
6533
6534   coding->eol_seen = EOL_SEEN_NONE;
6535   /* If we have not yet decided the text encoding type, detect it
6536      now.  */
6537   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6538     {
6539       int c, i;
6540       struct coding_detection_info detect_info;
6541       bool null_byte_found = 0, eight_bit_found = 0;
6542       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6543                                        inhibit_null_byte_detection);
6544       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6545                                        inhibit_iso_escape_detection);
6546       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6547
6548       coding->head_ascii = 0;
6549       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6550       for (src = coding->source; src < src_end; src++)
6551         {
6552           c = *src;
6553           if (c & 0x80)
6554             {
6555               eight_bit_found = 1;
6556               if (null_byte_found)
6557                 break;
6558             }
6559           else if (c < 0x20)
6560             {
6561               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6562                   && ! inhibit_ied
6563                   && ! detect_info.checked)
6564                 {
6565                   if (detect_coding_iso_2022 (coding, &detect_info))
6566                     {
6567                       /* We have scanned the whole data.  */
6568                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6569                         {
6570                           /* We didn't find an 8-bit code.  We may
6571                              have found a null-byte, but it's very
6572                              rare that a binary file conforms to
6573                              ISO-2022.  */
6574                           src = src_end;
6575                           coding->head_ascii = src - coding->source;
6576                         }
6577                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6578                       break;
6579                     }
6580                 }
6581               else if (! c && !inhibit_nbd)
6582                 {
6583                   null_byte_found = 1;
6584                   if (eight_bit_found)
6585                     break;
6586                 }
6587               else if (! disable_ascii_optimization
6588                        && ! inhibit_eol_conversion)
6589                 {
6590                   if (c == '\r')
6591                     {
6592                       if (src < src_end && src[1] == '\n')
6593                         {
6594                           coding->eol_seen |= EOL_SEEN_CRLF;
6595                           src++;
6596                           if (! eight_bit_found)
6597                             coding->head_ascii++;
6598                         }
6599                       else
6600                         coding->eol_seen |= EOL_SEEN_CR;
6601                     }
6602                   else if (c == '\n')
6603                     {
6604                       coding->eol_seen |= EOL_SEEN_LF;
6605                     }
6606                 }
6607
6608               if (! eight_bit_found)
6609                 coding->head_ascii++;
6610             }
6611           else if (! eight_bit_found)
6612             coding->head_ascii++;
6613         }
6614
6615       if (null_byte_found || eight_bit_found
6616           || coding->head_ascii < coding->src_bytes
6617           || detect_info.found)
6618         {
6619           enum coding_category category;
6620           struct coding_system *this;
6621
6622           if (coding->head_ascii == coding->src_bytes)
6623             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6624             for (i = 0; i < coding_category_raw_text; i++)
6625               {
6626                 category = coding_priorities[i];
6627                 this = coding_categories + category;
6628                 if (detect_info.found & (1 << category))
6629                   break;
6630               }
6631           else
6632             {
6633               if (null_byte_found)
6634                 {
6635                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6636                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6637                 }
6638               else if (prefer_utf_8
6639                        && detect_coding_utf_8 (coding, &detect_info))
6640                 {
6641                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6642                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6643                 }
6644               for (i = 0; i < coding_category_raw_text; i++)
6645                 {
6646                   category = coding_priorities[i];
6647                   this = coding_categories + category;
6648                   /* Some of this->detector (e.g. detect_coding_sjis)
6649                      require this information.  */
6650                   coding->id = this->id;
6651                   if (this->id < 0)
6652                     {
6653                       /* No coding system of this category is defined.  */
6654                       detect_info.rejected |= (1 << category);
6655                     }
6656                   else if (category >= coding_category_raw_text)
6657                     continue;
6658                   else if (detect_info.checked & (1 << category))
6659                     {
6660                       if (detect_info.found & (1 << category))
6661                         break;
6662                     }
6663                   else if ((*(this->detector)) (coding, &detect_info)
6664                            && detect_info.found & (1 << category))
6665                     break;
6666                 }
6667             }
6668
6669           if (i < coding_category_raw_text)
6670             {
6671               if (category == coding_category_utf_8_auto)
6672                 {
6673                   Lisp_Object coding_systems;
6674
6675                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6676                                          coding_attr_utf_bom);
6677                   if (CONSP (coding_systems))
6678                     {
6679                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6680                         found = XCAR (coding_systems);
6681                       else
6682                         found = XCDR (coding_systems);
6683                     }
6684                   else
6685                     found = CODING_ID_NAME (this->id);
6686                 }
6687               else if (category == coding_category_utf_16_auto)
6688                 {
6689                   Lisp_Object coding_systems;
6690
6691                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6692                                          coding_attr_utf_bom);
6693                   if (CONSP (coding_systems))
6694                     {
6695                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6696                         found = XCAR (coding_systems);
6697                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6698                         found = XCDR (coding_systems);
6699                     }
6700                   else
6701                     found = CODING_ID_NAME (this->id);
6702                 }
6703               else
6704                 found = CODING_ID_NAME (this->id);
6705             }
6706           else if (null_byte_found)
6707             found = Qno_conversion;
6708           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6709                    == CATEGORY_MASK_ANY)
6710             found = Qraw_text;
6711           else if (detect_info.rejected)
6712             for (i = 0; i < coding_category_raw_text; i++)
6713               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6714                 {
6715                   this = coding_categories + coding_priorities[i];
6716                   found = CODING_ID_NAME (this->id);
6717                   break;
6718                 }
6719         }
6720     }
6721   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6722            == coding_category_utf_8_auto)
6723     {
6724       Lisp_Object coding_systems;
6725       struct coding_detection_info detect_info;
6726
6727       coding_systems
6728         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6729       detect_info.found = detect_info.rejected = 0;
6730       if (check_ascii (coding) == coding->src_bytes)
6731         {
6732           if (CONSP (coding_systems))
6733             found = XCDR (coding_systems);
6734         }
6735       else
6736         {
6737           if (CONSP (coding_systems)
6738               && detect_coding_utf_8 (coding, &detect_info))
6739             {
6740               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6741                 found = XCAR (coding_systems);
6742               else
6743                 found = XCDR (coding_systems);
6744             }
6745         }
6746     }
6747   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6748            == coding_category_utf_16_auto)
6749     {
6750       Lisp_Object coding_systems;
6751       struct coding_detection_info detect_info;
6752
6753       coding_systems
6754         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6755       detect_info.found = detect_info.rejected = 0;
6756       coding->head_ascii = 0;
6757       if (CONSP (coding_systems)
6758           && detect_coding_utf_16 (coding, &detect_info))
6759         {
6760           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6761             found = XCAR (coding_systems);
6762           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6763             found = XCDR (coding_systems);
6764         }
6765     }
6766
6767   if (! NILP (found))
6768     {
6769       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6770                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6771                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6772                            : EOL_SEEN_LF);
6773
6774       setup_coding_system (found, coding);
6775       if (specified_eol != EOL_SEEN_NONE)
6776         adjust_coding_eol_type (coding, specified_eol);
6777     }
6778
6779   coding->mode = saved_mode;
6780 }
6781
6782
6783 static void
6784 decode_eol (struct coding_system *coding)
6785 {
6786   Lisp_Object eol_type;
6787   unsigned char *p, *pbeg, *pend;
6788
6789   eol_type = CODING_ID_EOL_TYPE (coding->id);
6790   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6791     return;
6792
6793   if (NILP (coding->dst_object))
6794     pbeg = coding->destination;
6795   else
6796     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6797   pend = pbeg + coding->produced;
6798
6799   if (VECTORP (eol_type))
6800     {
6801       int eol_seen = EOL_SEEN_NONE;
6802
6803       for (p = pbeg; p < pend; p++)
6804         {
6805           if (*p == '\n')
6806             eol_seen |= EOL_SEEN_LF;
6807           else if (*p == '\r')
6808             {
6809               if (p + 1 < pend && *(p + 1) == '\n')
6810                 {
6811                   eol_seen |= EOL_SEEN_CRLF;
6812                   p++;
6813                 }
6814               else
6815                 eol_seen |= EOL_SEEN_CR;
6816             }
6817         }
6818       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6819       if ((eol_seen & EOL_SEEN_CRLF) != 0
6820           && (eol_seen & EOL_SEEN_CR) != 0
6821           && (eol_seen & EOL_SEEN_LF) == 0)
6822         eol_seen = EOL_SEEN_CRLF;
6823       else if (eol_seen != EOL_SEEN_NONE
6824           && eol_seen != EOL_SEEN_LF
6825           && eol_seen != EOL_SEEN_CRLF
6826           && eol_seen != EOL_SEEN_CR)
6827         eol_seen = EOL_SEEN_LF;
6828       if (eol_seen != EOL_SEEN_NONE)
6829         eol_type = adjust_coding_eol_type (coding, eol_seen);
6830     }
6831
6832   if (EQ (eol_type, Qmac))
6833     {
6834       for (p = pbeg; p < pend; p++)
6835         if (*p == '\r')
6836           *p = '\n';
6837     }
6838   else if (EQ (eol_type, Qdos))
6839     {
6840       ptrdiff_t n = 0;
6841       ptrdiff_t pos = coding->dst_pos;
6842       ptrdiff_t pos_byte = coding->dst_pos_byte;
6843       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6844
6845       /* This assertion is here instead of code, now deleted, that
6846          handled the NILP case, which no longer happens with the
6847          current codebase.  */
6848       eassert (!NILP (coding->dst_object));
6849
6850       while (pos_byte < pos_end)
6851         {
6852           int incr;
6853
6854           p = BYTE_POS_ADDR (pos_byte);
6855           if (coding->dst_multibyte)
6856             incr = BYTES_BY_CHAR_HEAD (*p);
6857           else
6858             incr = 1;
6859
6860           if (*p == '\r' && p[1] == '\n')
6861             {
6862               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6863               n++;
6864               pos_end--;
6865             }
6866           pos++;
6867           pos_byte += incr;
6868         }
6869       coding->produced -= n;
6870       coding->produced_char -= n;
6871     }
6872 }
6873
6874
6875 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6876    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6877    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6878 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6879
6880 /* Return a translation table (or list of them) from coding system
6881    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6882    not ENCODEP). */
6883
6884 static Lisp_Object
6885 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6886 {
6887   Lisp_Object standard, translation_table;
6888   Lisp_Object val;
6889
6890   if (NILP (Venable_character_translation))
6891     {
6892       if (max_lookup)
6893         *max_lookup = 0;
6894       return Qnil;
6895     }
6896   if (encodep)
6897     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6898       standard = Vstandard_translation_table_for_encode;
6899   else
6900     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6901       standard = Vstandard_translation_table_for_decode;
6902   if (NILP (translation_table))
6903     translation_table = standard;
6904   else
6905     {
6906       if (SYMBOLP (translation_table))
6907         translation_table = Fget (translation_table, Qtranslation_table);
6908       else if (CONSP (translation_table))
6909         {
6910           translation_table = Fcopy_sequence (translation_table);
6911           for (val = translation_table; CONSP (val); val = XCDR (val))
6912             if (SYMBOLP (XCAR (val)))
6913               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6914         }
6915       if (CHAR_TABLE_P (standard))
6916         {
6917           if (CONSP (translation_table))
6918             translation_table = nconc2 (translation_table, list1 (standard));
6919           else
6920             translation_table = list2 (translation_table, standard);
6921         }
6922     }
6923
6924   if (max_lookup)
6925     {
6926       *max_lookup = 1;
6927       if (CHAR_TABLE_P (translation_table)
6928           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6929         {
6930           val = XCHAR_TABLE (translation_table)->extras[1];
6931           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6932             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6933         }
6934       else if (CONSP (translation_table))
6935         {
6936           Lisp_Object tail;
6937
6938           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6939             if (CHAR_TABLE_P (XCAR (tail))
6940                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6941               {
6942                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6943                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6944                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6945               }
6946         }
6947     }
6948   return translation_table;
6949 }
6950
/* Look up character C in TABLE, which is a char-table or a list whose
   char-table elements are tried in order.  C and TRANS must be
   lvalues.

   Whenever a table maps C to a character, C is replaced by that
   character and TRANS is set to Qnil; in a list, the lookup then
   continues with the next table using the new C.  If a table maps C
   to some other non-nil value, TRANS is left holding that value (and,
   in a list, the lookup stops).  If TABLE is neither a char-table nor
   a cons, TRANS is set to Qnil and C is unchanged.

   TABLE and C are evaluated more than once, so they must not have
   side effects.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    (trans) = Qnil;                                             \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        (trans) = CHAR_TABLE_REF (table, c);                    \
        if (CHARACTERP (trans))                                 \
          (c) = XFASTINT (trans), (trans) = Qnil;               \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object lookup_tail_;                               \
                                                                \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);      \
             lookup_tail_ = XCDR (lookup_tail_))                \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))               \
            {                                                   \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), c); \
              if (CHARACTERP (trans))                           \
                (c) = XFASTINT (trans), (trans) = Qnil;         \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6975
6976
6977 /* Return a translation of character(s) at BUF according to TRANS.
6978    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
6979    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
6980    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
6981    found, or Qt if BUF is too short to lookup characters in FROM.  As
6982    a side effect, if a translation is found, *NCHARS is set to the
6983    number of characters being translated.  */
6984
6985 static Lisp_Object
6986 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6987 {
6988   if (INTEGERP (trans) || VECTORP (trans))
6989     {
6990       *nchars = 1;
6991       return trans;
6992     }
6993   for (; CONSP (trans); trans = XCDR (trans))
6994     {
6995       Lisp_Object val = XCAR (trans);
6996       Lisp_Object from = XCAR (val);
6997       ptrdiff_t len = ASIZE (from);
6998       ptrdiff_t i;
6999
7000       for (i = 0; i < len; i++)
7001         {
7002           if (buf + i == buf_end)
7003             return Qt;
7004           if (XINT (AREF (from, i)) != buf[i])
7005             break;
7006         }
7007       if (i == len)
7008         {
7009           *nchars = len;
7010           return XCDR (val);
7011         }
7012     }
7013   return Qnil;
7014 }
7015
7016
7017 static int
7018 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7019                bool last_block)
7020 {
7021   unsigned char *dst = coding->destination + coding->produced;
7022   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7023   ptrdiff_t produced;
7024   ptrdiff_t produced_chars = 0;
7025   int carryover = 0;
7026
7027   if (! coding->chars_at_source)
7028     {
7029       /* Source characters are in coding->charbuf.  */
7030       int *buf = coding->charbuf;
7031       int *buf_end = buf + coding->charbuf_used;
7032
7033       if (EQ (coding->src_object, coding->dst_object)
7034           && ! NILP (coding->dst_object))
7035         {
7036           eassert (growable_destination (coding));
7037           coding_set_source (coding);
7038           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7039         }
7040
7041       while (buf < buf_end)
7042         {
7043           int c = *buf;
7044           ptrdiff_t i;
7045
7046           if (c >= 0)
7047             {
7048               ptrdiff_t from_nchars = 1, to_nchars = 1;
7049               Lisp_Object trans = Qnil;
7050
7051               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7052               if (! NILP (trans))
7053                 {
7054                   trans = get_translation (trans, buf, buf_end, &from_nchars);
7055                   if (INTEGERP (trans))
7056                     c = XINT (trans);
7057                   else if (VECTORP (trans))
7058                     {
7059                       to_nchars = ASIZE (trans);
7060                       c = XINT (AREF (trans, 0));
7061                     }
7062                   else if (EQ (trans, Qt) && ! last_block)
7063                     break;
7064                 }
7065
7066               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7067                 {
7068                   eassert (growable_destination (coding));
7069                   ptrdiff_t dst_size;
7070                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7071                                           &dst_size)
7072                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7073                     memory_full (SIZE_MAX);
7074                   dst = alloc_destination (coding, dst_size, dst);
7075                   if (EQ (coding->src_object, coding->dst_object))
7076                     {
7077                       coding_set_source (coding);
7078                       dst_end = (((unsigned char *) coding->source)
7079                                  + coding->consumed);
7080                     }
7081                   else
7082                     dst_end = coding->destination + coding->dst_bytes;
7083                 }
7084
7085               for (i = 0; i < to_nchars; i++)
7086                 {
7087                   if (i > 0)
7088                     c = XINT (AREF (trans, i));
7089                   if (coding->dst_multibyte
7090                       || ! CHAR_BYTE8_P (c))
7091                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7092                   else
7093                     *dst++ = CHAR_TO_BYTE8 (c);
7094                 }
7095               produced_chars += to_nchars;
7096               buf += from_nchars;
7097             }
7098           else
7099             /* This is an annotation datum.  (-C) is the length.  */
7100             buf += -c;
7101         }
7102       carryover = buf_end - buf;
7103     }
7104   else
7105     {
7106       /* Source characters are at coding->source.  */
7107       const unsigned char *src = coding->source;
7108       const unsigned char *src_end = src + coding->consumed;
7109
7110       if (EQ (coding->dst_object, coding->src_object))
7111         {
7112           eassert (growable_destination (coding));
7113           dst_end = (unsigned char *) src;
7114         }
7115       if (coding->src_multibyte != coding->dst_multibyte)
7116         {
7117           if (coding->src_multibyte)
7118             {
7119               bool multibytep = 1;
7120               ptrdiff_t consumed_chars = 0;
7121
7122               while (1)
7123                 {
7124                   const unsigned char *src_base = src;
7125                   int c;
7126
7127                   ONE_MORE_BYTE (c);
7128                   if (dst == dst_end)
7129                     {
7130                       eassert (growable_destination (coding));
7131                       if (EQ (coding->src_object, coding->dst_object))
7132                         dst_end = (unsigned char *) src;
7133                       if (dst == dst_end)
7134                         {
7135                           ptrdiff_t offset = src - coding->source;
7136
7137                           dst = alloc_destination (coding, src_end - src + 1,
7138                                                    dst);
7139                           dst_end = coding->destination + coding->dst_bytes;
7140                           coding_set_source (coding);
7141                           src = coding->source + offset;
7142                           src_end = coding->source + coding->consumed;
7143                           if (EQ (coding->src_object, coding->dst_object))
7144                             dst_end = (unsigned char *) src;
7145                         }
7146                     }
7147                   *dst++ = c;
7148                   produced_chars++;
7149                 }
7150             no_more_source:
7151               ;
7152             }
7153           else
7154             while (src < src_end)
7155               {
7156                 bool multibytep = 1;
7157                 int c = *src++;
7158
7159                 if (dst >= dst_end - 1)
7160                   {
7161                     eassert (growable_destination (coding));
7162                     if (EQ (coding->src_object, coding->dst_object))
7163                       dst_end = (unsigned char *) src;
7164                     if (dst >= dst_end - 1)
7165                       {
7166                         ptrdiff_t offset = src - coding->source;
7167                         ptrdiff_t more_bytes;
7168
7169                         if (EQ (coding->src_object, coding->dst_object))
7170                           more_bytes = ((src_end - src) / 2) + 2;
7171                         else
7172                           more_bytes = src_end - src + 2;
7173                         dst = alloc_destination (coding, more_bytes, dst);
7174                         dst_end = coding->destination + coding->dst_bytes;
7175                         coding_set_source (coding);
7176                         src = coding->source + offset;
7177                         src_end = coding->source + coding->consumed;
7178                         if (EQ (coding->src_object, coding->dst_object))
7179                           dst_end = (unsigned char *) src;
7180                       }
7181                   }
7182                 EMIT_ONE_BYTE (c);
7183               }
7184         }
7185       else
7186         {
7187           if (!EQ (coding->src_object, coding->dst_object))
7188             {
7189               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7190
7191               if (require > 0)
7192                 {
7193                   ptrdiff_t offset = src - coding->source;
7194
7195                   dst = alloc_destination (coding, require, dst);
7196                   coding_set_source (coding);
7197                   src = coding->source + offset;
7198                   src_end = coding->source + coding->consumed;
7199                 }
7200             }
7201           produced_chars = coding->consumed_char;
7202           while (src < src_end)
7203             *dst++ = *src++;
7204         }
7205     }
7206
7207   produced = dst - (coding->destination + coding->produced);
7208   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7209     insert_from_gap (produced_chars, produced, 0);
7210   coding->produced += produced;
7211   coding->produced_char += produced_chars;
7212   return carryover;
7213 }
7214
7215 /* Compose text in CODING->object according to the annotation data at
7216    CHARBUF.  CHARBUF is an array:
7217      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7218  */
7219
7220 static void
7221 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7222 {
7223   int len;
7224   ptrdiff_t to;
7225   enum composition_method method;
7226   Lisp_Object components;
7227
7228   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7229   to = pos + charbuf[2];
7230   method = (enum composition_method) (charbuf[4]);
7231
7232   if (method == COMPOSITION_RELATIVE)
7233     components = Qnil;
7234   else
7235     {
7236       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7237       int i, j;
7238
7239       if (method == COMPOSITION_WITH_RULE)
7240         len = charbuf[2] * 3 - 2;
7241       charbuf += MAX_ANNOTATION_LENGTH;
7242       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7243       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7244         {
7245           if (charbuf[i] >= 0)
7246             args[j] = make_number (charbuf[i]);
7247           else
7248             {
7249               i++;
7250               args[j] = make_number (charbuf[i] % 0x100);
7251             }
7252         }
7253       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7254     }
7255   compose_text (pos, to, components, Qnil, coding->dst_object);
7256 }
7257
7258
7259 /* Put `charset' property on text in CODING->object according to
7260    the annotation data at CHARBUF.  CHARBUF is an array:
7261      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7262  */
7263
7264 static void
7265 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7266 {
7267   ptrdiff_t from = pos - charbuf[2];
7268   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7269
7270   Fput_text_property (make_number (from), make_number (pos),
7271                       Qcharset, CHARSET_NAME (charset),
7272                       coding->dst_object);
7273 }
7274
/* Upper bound on the number of units ALLOC_CONVERSION_WORK_AREA
   allocates for a charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the work area CODING->charbuf with SAFE_ALLOCA and record
   its length, in units of int, in CODING->charbuf_size.  The length
   is SIZE plus MAX_CHARBUF_EXTRA_SIZE, capped at MAX_CHARBUF_SIZE.
   The calling function must provide whatever SAFE_ALLOCA requires.
   CODING may be any pointer expression, but it is evaluated twice, so
   it must not have side effects.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7288
7289 static void
7290 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7291 {
7292   int *charbuf = coding->charbuf;
7293   int *charbuf_end = charbuf + coding->charbuf_used;
7294
7295   if (NILP (coding->dst_object))
7296     return;
7297
7298   while (charbuf < charbuf_end)
7299     {
7300       if (*charbuf >= 0)
7301         pos++, charbuf++;
7302       else
7303         {
7304           int len = -*charbuf;
7305
7306           if (len > 2)
7307             switch (charbuf[1])
7308               {
7309               case CODING_ANNOTATE_COMPOSITION_MASK:
7310                 produce_composition (coding, charbuf, pos);
7311                 break;
7312               case CODING_ANNOTATE_CHARSET_MASK:
7313                 produce_charset (coding, charbuf, pos);
7314                 break;
7315               default:
7316                 break;
7317               }
7318           charbuf += len;
7319         }
7320     }
7321 }
7322
7323 /* Decode the data at CODING->src_object into CODING->dst_object.
7324    CODING->src_object is a buffer, a string, or nil.
7325    CODING->dst_object is a buffer.
7326
7327    If CODING->src_object is a buffer, it must be the current buffer.
7328    In this case, if CODING->src_pos is positive, it is a position of
7329    the source text in the buffer, otherwise, the source text is in the
7330    gap area of the buffer, and CODING->src_pos specifies the offset of
7331    the text from GPT (which must be the same as PT).  If this is the
7332    same buffer as CODING->dst_object, CODING->src_pos must be
7333    negative.
7334
7335    If CODING->src_object is a string, CODING->src_pos is an index to
7336    that string.
7337
7338    If CODING->src_object is nil, CODING->source must already point to
7339    the non-relocatable memory area.  In this case, CODING->src_pos is
7340    an offset from CODING->source.
7341
7342    The decoded data is inserted at the current point of the buffer
7343    CODING->dst_object.
7344 */
7345
7346 static void
7347 decode_coding (struct coding_system *coding)
7348 {
7349   Lisp_Object attrs;
7350   Lisp_Object undo_list;
7351   Lisp_Object translation_table;
7352   struct ccl_spec cclspec;
7353   int carryover;
7354   int i;
7355
7356   USE_SAFE_ALLOCA;
7357
7358   if (BUFFERP (coding->src_object)
7359       && coding->src_pos > 0
7360       && coding->src_pos < GPT
7361       && coding->src_pos + coding->src_chars > GPT)
7362     move_gap_both (coding->src_pos, coding->src_pos_byte);
7363
7364   undo_list = Qt;
7365   if (BUFFERP (coding->dst_object))
7366     {
7367       set_buffer_internal (XBUFFER (coding->dst_object));
7368       if (GPT != PT)
7369         move_gap_both (PT, PT_BYTE);
7370
7371       /* We must disable undo_list in order to record the whole insert
7372          transaction via record_insert at the end.  But doing so also
7373          disables the recording of the first change to the undo_list.
7374          Therefore we check for first change here and record it via
7375          record_first_change if needed.  */
7376       if (MODIFF <= SAVE_MODIFF)
7377         record_first_change ();
7378
7379       undo_list = BVAR (current_buffer, undo_list);
7380       bset_undo_list (current_buffer, Qt);
7381     }
7382
7383   coding->consumed = coding->consumed_char = 0;
7384   coding->produced = coding->produced_char = 0;
7385   coding->chars_at_source = 0;
7386   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7387
7388   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7389
7390   attrs = CODING_ID_ATTRS (coding->id);
7391   translation_table = get_translation_table (attrs, 0, NULL);
7392
7393   carryover = 0;
7394   if (coding->decoder == decode_coding_ccl)
7395     {
7396       coding->spec.ccl = &cclspec;
7397       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7398     }
7399   do
7400     {
7401       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7402
7403       coding_set_source (coding);
7404       coding->annotated = 0;
7405       coding->charbuf_used = carryover;
7406       (*(coding->decoder)) (coding);
7407       coding_set_destination (coding);
7408       carryover = produce_chars (coding, translation_table, 0);
7409       if (coding->annotated)
7410         produce_annotation (coding, pos);
7411       for (i = 0; i < carryover; i++)
7412         coding->charbuf[i]
7413           = coding->charbuf[coding->charbuf_used - carryover + i];
7414     }
7415   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7416          || (coding->consumed < coding->src_bytes
7417              && (coding->result == CODING_RESULT_SUCCESS
7418                  || coding->result == CODING_RESULT_INVALID_SRC)));
7419
7420   if (carryover > 0)
7421     {
7422       coding_set_destination (coding);
7423       coding->charbuf_used = carryover;
7424       produce_chars (coding, translation_table, 1);
7425     }
7426
7427   coding->carryover_bytes = 0;
7428   if (coding->consumed < coding->src_bytes)
7429     {
7430       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7431       const unsigned char *src;
7432
7433       coding_set_source (coding);
7434       coding_set_destination (coding);
7435       src = coding->source + coding->consumed;
7436
7437       if (coding->mode & CODING_MODE_LAST_BLOCK)
7438         {
7439           /* Flush out unprocessed data as binary chars.  We are sure
7440              that the number of data is less than the size of
7441              coding->charbuf.  */
7442           coding->charbuf_used = 0;
7443           coding->chars_at_source = 0;
7444
7445           while (nbytes-- > 0)
7446             {
7447               int c = *src++;
7448
7449               if (c & 0x80)
7450                 c = BYTE8_TO_CHAR (c);
7451               coding->charbuf[coding->charbuf_used++] = c;
7452             }
7453           produce_chars (coding, Qnil, 1);
7454         }
7455       else
7456         {
7457           /* Record unprocessed bytes in coding->carryover.  We are
7458              sure that the number of data is less than the size of
7459              coding->carryover.  */
7460           unsigned char *p = coding->carryover;
7461
7462           if (nbytes > sizeof coding->carryover)
7463             nbytes = sizeof coding->carryover;
7464           coding->carryover_bytes = nbytes;
7465           while (nbytes-- > 0)
7466             *p++ = *src++;
7467         }
7468       coding->consumed = coding->src_bytes;
7469     }
7470
7471   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7472       && !inhibit_eol_conversion)
7473     decode_eol (coding);
7474   if (BUFFERP (coding->dst_object))
7475     {
7476       bset_undo_list (current_buffer, undo_list);
7477       record_insert (coding->dst_pos, coding->produced_char);
7478     }
7479
7480   SAFE_FREE ();
7481 }
7482
7483
7484 /* Extract an annotation datum from a composition starting at POS and
7485    ending before LIMIT of CODING->src_object (buffer or string), store
7486    the data in BUF, set *STOP to a starting position of the next
7487    composition (if any) or to LIMIT, and return the address of the
7488    next element of BUF.
7489
7490    If such an annotation is not found, set *STOP to a starting
7491    position of a composition after POS (if any) or to LIMIT, and
7492    return BUF.  */
7493
7494 static int *
7495 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7496                                struct coding_system *coding, int *buf,
7497                                ptrdiff_t *stop)
7498 {
7499   ptrdiff_t start, end;
7500   Lisp_Object prop;
7501
7502   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7503       || end > limit)
7504     *stop = limit;
7505   else if (start > pos)
7506     *stop = start;
7507   else
7508     {
7509       if (start == pos)
7510         {
7511           /* We found a composition.  Store the corresponding
7512              annotation data in BUF.  */
7513           int *head = buf;
7514           enum composition_method method = composition_method (prop);
7515           int nchars = COMPOSITION_LENGTH (prop);
7516
7517           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7518           if (method != COMPOSITION_RELATIVE)
7519             {
7520               Lisp_Object components;
7521               ptrdiff_t i, len, i_byte;
7522
7523               components = COMPOSITION_COMPONENTS (prop);
7524               if (VECTORP (components))
7525                 {
7526                   len = ASIZE (components);
7527                   for (i = 0; i < len; i++)
7528                     *buf++ = XINT (AREF (components, i));
7529                 }
7530               else if (STRINGP (components))
7531                 {
7532                   len = SCHARS (components);
7533                   i = i_byte = 0;
7534                   while (i < len)
7535                     {
7536                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7537                       buf++;
7538                     }
7539                 }
7540               else if (INTEGERP (components))
7541                 {
7542                   len = 1;
7543                   *buf++ = XINT (components);
7544                 }
7545               else if (CONSP (components))
7546                 {
7547                   for (len = 0; CONSP (components);
7548                        len++, components = XCDR (components))
7549                     *buf++ = XINT (XCAR (components));
7550                 }
7551               else
7552                 emacs_abort ();
7553               *head -= len;
7554             }
7555         }
7556
7557       if (find_composition (end, limit, &start, &end, &prop,
7558                             coding->src_object)
7559           && end <= limit)
7560         *stop = start;
7561       else
7562         *stop = limit;
7563     }
7564   return buf;
7565 }
7566
7567
7568 /* Extract an annotation datum from a text property `charset' at POS of
7569    CODING->src_object (buffer of string), store the data in BUF, set
7570    *STOP to the position where the value of `charset' property changes
7571    (limiting by LIMIT), and return the address of the next element of
7572    BUF.
7573
7574    If the property value is nil, set *STOP to the position where the
7575    property value is non-nil (limiting by LIMIT), and return BUF.  */
7576
7577 static int *
7578 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7579                            struct coding_system *coding, int *buf,
7580                            ptrdiff_t *stop)
7581 {
7582   Lisp_Object val, next;
7583   int id;
7584
7585   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7586   if (! NILP (val) && CHARSETP (val))
7587     id = XINT (CHARSET_SYMBOL_ID (val));
7588   else
7589     id = -1;
7590   ADD_CHARSET_DATA (buf, 0, id);
7591   next = Fnext_single_property_change (make_number (pos), Qcharset,
7592                                        coding->src_object,
7593                                        make_number (limit));
7594   *stop = XINT (next);
7595   return buf;
7596 }
7597
7598
7599 static void
7600 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7601                int max_lookup)
7602 {
7603   int *buf = coding->charbuf;
7604   int *buf_end = coding->charbuf + coding->charbuf_size;
7605   const unsigned char *src = coding->source + coding->consumed;
7606   const unsigned char *src_end = coding->source + coding->src_bytes;
7607   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7608   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7609   bool multibytep = coding->src_multibyte;
7610   Lisp_Object eol_type;
7611   int c;
7612   ptrdiff_t stop, stop_composition, stop_charset;
7613   int *lookup_buf = NULL;
7614
7615   if (! NILP (translation_table))
7616     lookup_buf = alloca (sizeof (int) * max_lookup);
7617
7618   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7619   if (VECTORP (eol_type))
7620     eol_type = Qunix;
7621
7622   /* Note: composition handling is not yet implemented.  */
7623   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7624
7625   if (NILP (coding->src_object))
7626     stop = stop_composition = stop_charset = end_pos;
7627   else
7628     {
7629       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7630         stop = stop_composition = pos;
7631       else
7632         stop = stop_composition = end_pos;
7633       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7634         stop = stop_charset = pos;
7635       else
7636         stop_charset = end_pos;
7637     }
7638
7639   /* Compensate for CRLF and conversion.  */
7640   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7641   while (buf < buf_end)
7642     {
7643       Lisp_Object trans;
7644
7645       if (pos == stop)
7646         {
7647           if (pos == end_pos)
7648             break;
7649           if (pos == stop_composition)
7650             buf = handle_composition_annotation (pos, end_pos, coding,
7651                                                  buf, &stop_composition);
7652           if (pos == stop_charset)
7653             buf = handle_charset_annotation (pos, end_pos, coding,
7654                                              buf, &stop_charset);
7655           stop = (stop_composition < stop_charset
7656                   ? stop_composition : stop_charset);
7657         }
7658
7659       if (! multibytep)
7660         {
7661           int bytes;
7662
7663           if (coding->encoder == encode_coding_raw_text
7664               || coding->encoder == encode_coding_ccl)
7665             c = *src++, pos++;
7666           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7667             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7668           else
7669             c = BYTE8_TO_CHAR (*src), src++, pos++;
7670         }
7671       else
7672         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7673       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7674         c = '\n';
7675       if (! EQ (eol_type, Qunix))
7676         {
7677           if (c == '\n')
7678             {
7679               if (EQ (eol_type, Qdos))
7680                 *buf++ = '\r';
7681               else
7682                 c = '\r';
7683             }
7684         }
7685
7686       trans = Qnil;
7687       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7688       if (NILP (trans))
7689         *buf++ = c;
7690       else
7691         {
7692           ptrdiff_t from_nchars = 1, to_nchars = 1;
7693           int *lookup_buf_end;
7694           const unsigned char *p = src;
7695           int i;
7696
7697           lookup_buf[0] = c;
7698           for (i = 1; i < max_lookup && p < src_end; i++)
7699             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7700           lookup_buf_end = lookup_buf + i;
7701           trans = get_translation (trans, lookup_buf, lookup_buf_end,
7702                                    &from_nchars);
7703           if (INTEGERP (trans))
7704             c = XINT (trans);
7705           else if (VECTORP (trans))
7706             {
7707               to_nchars = ASIZE (trans);
7708               if (buf_end - buf < to_nchars)
7709                 break;
7710               c = XINT (AREF (trans, 0));
7711             }
7712           else
7713             break;
7714           *buf++ = c;
7715           for (i = 1; i < to_nchars; i++)
7716             *buf++ = XINT (AREF (trans, i));
7717           for (i = 1; i < from_nchars; i++, pos++)
7718             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7719         }
7720     }
7721
7722   coding->consumed = src - coding->source;
7723   coding->consumed_char = pos - coding->src_pos;
7724   coding->charbuf_used = buf - coding->charbuf;
7725   coding->chars_at_source = 0;
7726 }
7727
7728
7729 /* Encode the text at CODING->src_object into CODING->dst_object.
7730    CODING->src_object is a buffer or a string.
7731    CODING->dst_object is a buffer or nil.
7732
7733    If CODING->src_object is a buffer, it must be the current buffer.
7734    In this case, if CODING->src_pos is positive, it is a position of
7735    the source text in the buffer, otherwise. the source text is in the
7736    gap area of the buffer, and coding->src_pos specifies the offset of
7737    the text from GPT (which must be the same as PT).  If this is the
7738    same buffer as CODING->dst_object, CODING->src_pos must be
7739    negative and CODING should not have `pre-write-conversion'.
7740
7741    If CODING->src_object is a string, CODING should not have
7742    `pre-write-conversion'.
7743
7744    If CODING->dst_object is a buffer, the encoded data is inserted at
7745    the current point of that buffer.
7746
7747    If CODING->dst_object is nil, the encoded data is placed at the
7748    memory area specified by CODING->destination.  */
7749
7750 static void
7751 encode_coding (struct coding_system *coding)
7752 {
7753   Lisp_Object attrs;
7754   Lisp_Object translation_table;
7755   int max_lookup;
7756   struct ccl_spec cclspec;
7757
7758   USE_SAFE_ALLOCA;
7759
7760   attrs = CODING_ID_ATTRS (coding->id);
7761   if (coding->encoder == encode_coding_raw_text)
7762     translation_table = Qnil, max_lookup = 0;
7763   else
7764     translation_table = get_translation_table (attrs, 1, &max_lookup);
7765
7766   if (BUFFERP (coding->dst_object))
7767     {
7768       set_buffer_internal (XBUFFER (coding->dst_object));
7769       coding->dst_multibyte
7770         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7771     }
7772
7773   coding->consumed = coding->consumed_char = 0;
7774   coding->produced = coding->produced_char = 0;
7775   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7776
7777   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7778
7779   if (coding->encoder == encode_coding_ccl)
7780     {
7781       coding->spec.ccl = &cclspec;
7782       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7783     }
7784   do {
7785     coding_set_source (coding);
7786     consume_chars (coding, translation_table, max_lookup);
7787     coding_set_destination (coding);
7788     (*(coding->encoder)) (coding);
7789   } while (coding->consumed_char < coding->src_chars);
7790
7791   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7792     insert_from_gap (coding->produced_char, coding->produced, 0);
7793
7794   SAFE_FREE ();
7795 }
7796
7797
/* Name (or base name) of work buffer for code conversion.
   make_conversion_work_buffer uses it as is for the reused work
   buffer and as the base of a generated name for any other one.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set
   by make_conversion_work_buffer when it hands out that buffer and
   cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
7810
7811
7812 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7813    multibyteness of returning buffer.  */
7814
7815 static Lisp_Object
7816 make_conversion_work_buffer (bool multibyte)
7817 {
7818   Lisp_Object name, workbuf;
7819   struct buffer *current;
7820
7821   if (reused_workbuf_in_use)
7822     {
7823       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7824       workbuf = Fget_buffer_create (name);
7825     }
7826   else
7827     {
7828       reused_workbuf_in_use = 1;
7829       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7830         Vcode_conversion_reused_workbuf
7831           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7832       workbuf = Vcode_conversion_reused_workbuf;
7833     }
7834   current = current_buffer;
7835   set_buffer_internal (XBUFFER (workbuf));
7836   /* We can't allow modification hooks to run in the work buffer.  For
7837      instance, directory_files_internal assumes that file decoding
7838      doesn't compile new regexps.  */
7839   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7840   Ferase_buffer ();
7841   bset_undo_list (current_buffer, Qt);
7842   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7843   set_buffer_internal (current);
7844   return workbuf;
7845 }
7846
7847
7848 static void
7849 code_conversion_restore (Lisp_Object arg)
7850 {
7851   Lisp_Object current, workbuf;
7852
7853   current = XCAR (arg);
7854   workbuf = XCDR (arg);
7855   if (! NILP (workbuf))
7856     {
7857       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7858         reused_workbuf_in_use = 0;
7859       else
7860         Fkill_buffer (workbuf);
7861     }
7862   set_buffer_internal (XBUFFER (current));
7863 }
7864
7865 Lisp_Object
7866 code_conversion_save (bool with_work_buf, bool multibyte)
7867 {
7868   Lisp_Object workbuf = Qnil;
7869
7870   if (with_work_buf)
7871     workbuf = make_conversion_work_buffer (multibyte);
7872   record_unwind_protect (code_conversion_restore,
7873                          Fcons (Fcurrent_buffer (), workbuf));
7874   return workbuf;
7875 }
7876
7877 static void
7878 coding_restore_undo_list (Lisp_Object arg)
7879 {
7880   Lisp_Object undo_list = XCAR (arg);
7881   struct buffer *buf = XBUFFER (XCDR (arg));
7882
7883   bset_undo_list (buf, undo_list);
7884 }
7885
7886 void
7887 decode_coding_gap (struct coding_system *coding,
7888                    ptrdiff_t chars, ptrdiff_t bytes)
7889 {
7890   ptrdiff_t count = SPECPDL_INDEX ();
7891   Lisp_Object attrs;
7892
7893   coding->src_object = Fcurrent_buffer ();
7894   coding->src_chars = chars;
7895   coding->src_bytes = bytes;
7896   coding->src_pos = -chars;
7897   coding->src_pos_byte = -bytes;
7898   coding->src_multibyte = chars < bytes;
7899   coding->dst_object = coding->src_object;
7900   coding->dst_pos = PT;
7901   coding->dst_pos_byte = PT_BYTE;
7902   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7903
7904   coding->head_ascii = -1;
7905   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7906   coding->eol_seen = EOL_SEEN_NONE;
7907   if (CODING_REQUIRE_DETECTION (coding))
7908     detect_coding (coding);
7909   attrs = CODING_ID_ATTRS (coding->id);
7910   if (! disable_ascii_optimization
7911       && ! coding->src_multibyte
7912       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7913       && NILP (CODING_ATTR_POST_READ (attrs))
7914       && NILP (get_translation_table (attrs, 0, NULL)))
7915     {
7916       chars = coding->head_ascii;
7917       if (chars < 0)
7918         chars = check_ascii (coding);
7919       if (chars != bytes)
7920         {
7921           /* There exists a non-ASCII byte.  */
7922           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7923               && coding->detected_utf8_bytes == coding->src_bytes)
7924             {
7925               if (coding->detected_utf8_chars >= 0)
7926                 chars = coding->detected_utf8_chars;
7927               else
7928                 chars = check_utf_8 (coding);
7929               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7930                   && coding->head_ascii == 0
7931                   && coding->source[0] == UTF_8_BOM_1
7932                   && coding->source[1] == UTF_8_BOM_2
7933                   && coding->source[2] == UTF_8_BOM_3)
7934                 {
7935                   chars--;
7936                   bytes -= 3;
7937                   coding->src_bytes -= 3;
7938                 }
7939             }
7940           else
7941             chars = -1;
7942         }
7943       if (chars >= 0)
7944         {
7945           Lisp_Object eol_type;
7946
7947           eol_type = CODING_ID_EOL_TYPE (coding->id);
7948           if (VECTORP (eol_type))
7949             {
7950               if (coding->eol_seen != EOL_SEEN_NONE)
7951                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7952             }
7953           if (EQ (eol_type, Qmac))
7954             {
7955               unsigned char *src_end = GAP_END_ADDR;
7956               unsigned char *src = src_end - coding->src_bytes;
7957
7958               while (src < src_end)
7959                 {
7960                   if (*src++ == '\r')
7961                     src[-1] = '\n';
7962                 }
7963             }
7964           else if (EQ (eol_type, Qdos))
7965             {
7966               unsigned char *src = GAP_END_ADDR;
7967               unsigned char *src_beg = src - coding->src_bytes;
7968               unsigned char *dst = src;
7969               ptrdiff_t diff;
7970
7971               while (src_beg < src)
7972                 {
7973                   *--dst = *--src;
7974                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7975                     src--;
7976                 }
7977               diff = dst - src;
7978               bytes -= diff;
7979               chars -= diff;
7980             }
7981           coding->produced = bytes;
7982           coding->produced_char = chars;
7983           insert_from_gap (chars, bytes, 1);
7984           return;
7985         }
7986     }
7987   code_conversion_save (0, 0);
7988
7989   coding->mode |= CODING_MODE_LAST_BLOCK;
7990   current_buffer->text->inhibit_shrinking = 1;
7991   decode_coding (coding);
7992   current_buffer->text->inhibit_shrinking = 0;
7993
7994   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7995     {
7996       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7997       Lisp_Object val;
7998       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
7999       ptrdiff_t count1 = SPECPDL_INDEX ();
8000
8001       record_unwind_protect (coding_restore_undo_list,
8002                              Fcons (undo_list, Fcurrent_buffer ()));
8003       bset_undo_list (current_buffer, Qt);
8004       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8005       val = call1 (CODING_ATTR_POST_READ (attrs),
8006                    make_number (coding->produced_char));
8007       CHECK_NATNUM (val);
8008       coding->produced_char += Z - prev_Z;
8009       coding->produced += Z_BYTE - prev_Z_BYTE;
8010       unbind_to (count1, Qnil);
8011     }
8012
8013   unbind_to (count, Qnil);
8014 }
8015
8016
8017 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8018    SRC_OBJECT into DST_OBJECT by coding context CODING.
8019
8020    SRC_OBJECT is a buffer, a string, or Qnil.
8021
8022    If it is a buffer, the text is at point of the buffer.  FROM and TO
8023    are positions in the buffer.
8024
8025    If it is a string, the text is at the beginning of the string.
8026    FROM and TO are indices to the string.
8027
8028    If it is nil, the text is at coding->source.  FROM and TO are
8029    indices to coding->source.
8030
8031    DST_OBJECT is a buffer, Qt, or Qnil.
8032
8033    If it is a buffer, the decoded text is inserted at point of the
8034    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8035    is deleted.
8036
8037    If it is Qt, a string is made from the decoded text, and
8038    set in CODING->dst_object.
8039
8040    If it is Qnil, the decoded text is stored at CODING->destination.
8041    The caller must allocate CODING->dst_bytes bytes at
8042    CODING->destination by xmalloc.  If the decoded text is longer than
8043    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8044  */
8045
8046 void
8047 decode_coding_object (struct coding_system *coding,
8048                       Lisp_Object src_object,
8049                       ptrdiff_t from, ptrdiff_t from_byte,
8050                       ptrdiff_t to, ptrdiff_t to_byte,
8051                       Lisp_Object dst_object)
8052 {
8053   ptrdiff_t count = SPECPDL_INDEX ();
8054   unsigned char *destination UNINIT;
8055   ptrdiff_t dst_bytes UNINIT;
8056   ptrdiff_t chars = to - from;
8057   ptrdiff_t bytes = to_byte - from_byte;
8058   Lisp_Object attrs;
8059   ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
8060   bool need_marker_adjustment = 0;
8061   Lisp_Object old_deactivate_mark;
8062
8063   old_deactivate_mark = Vdeactivate_mark;
8064
8065   if (NILP (dst_object))
8066     {
8067       destination = coding->destination;
8068       dst_bytes = coding->dst_bytes;
8069     }
8070
8071   coding->src_object = src_object;
8072   coding->src_chars = chars;
8073   coding->src_bytes = bytes;
8074   coding->src_multibyte = chars < bytes;
8075
8076   if (STRINGP (src_object))
8077     {
8078       coding->src_pos = from;
8079       coding->src_pos_byte = from_byte;
8080     }
8081   else if (BUFFERP (src_object))
8082     {
8083       set_buffer_internal (XBUFFER (src_object));
8084       if (from != GPT)
8085         move_gap_both (from, from_byte);
8086       if (EQ (src_object, dst_object))
8087         {
8088           struct Lisp_Marker *tail;
8089
8090           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8091             {
8092               tail->need_adjustment
8093                 = tail->charpos == (tail->insertion_type ? from : to);
8094               need_marker_adjustment |= tail->need_adjustment;
8095             }
8096           saved_pt = PT, saved_pt_byte = PT_BYTE;
8097           TEMP_SET_PT_BOTH (from, from_byte);
8098           current_buffer->text->inhibit_shrinking = 1;
8099           del_range_both (from, from_byte, to, to_byte, 1);
8100           coding->src_pos = -chars;
8101           coding->src_pos_byte = -bytes;
8102         }
8103       else
8104         {
8105           coding->src_pos = from;
8106           coding->src_pos_byte = from_byte;
8107         }
8108     }
8109
8110   if (CODING_REQUIRE_DETECTION (coding))
8111     detect_coding (coding);
8112   attrs = CODING_ID_ATTRS (coding->id);
8113
8114   if (EQ (dst_object, Qt)
8115       || (! NILP (CODING_ATTR_POST_READ (attrs))
8116           && NILP (dst_object)))
8117     {
8118       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8119       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8120       coding->dst_pos = BEG;
8121       coding->dst_pos_byte = BEG_BYTE;
8122     }
8123   else if (BUFFERP (dst_object))
8124     {
8125       code_conversion_save (0, 0);
8126       coding->dst_object = dst_object;
8127       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8128       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8129       coding->dst_multibyte
8130         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8131     }
8132   else
8133     {
8134       code_conversion_save (0, 0);
8135       coding->dst_object = Qnil;
8136       /* Most callers presume this will return a multibyte result, and they
8137          won't use `binary' or `raw-text' anyway, so let's not worry about
8138          CODING_FOR_UNIBYTE.  */
8139       coding->dst_multibyte = 1;
8140     }
8141
8142   decode_coding (coding);
8143
8144   if (BUFFERP (coding->dst_object))
8145     set_buffer_internal (XBUFFER (coding->dst_object));
8146
8147   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8148     {
8149       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8150       Lisp_Object val;
8151       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
8152       ptrdiff_t count1 = SPECPDL_INDEX ();
8153
8154       record_unwind_protect (coding_restore_undo_list,
8155                              Fcons (undo_list, Fcurrent_buffer ()));
8156       bset_undo_list (current_buffer, Qt);
8157       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8158       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8159                         make_number (coding->produced_char));
8160       CHECK_NATNUM (val);
8161       coding->produced_char += Z - prev_Z;
8162       coding->produced += Z_BYTE - prev_Z_BYTE;
8163       unbind_to (count1, Qnil);
8164     }
8165
8166   if (EQ (dst_object, Qt))
8167     {
8168       coding->dst_object = Fbuffer_string ();
8169     }
8170   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8171     {
8172       set_buffer_internal (XBUFFER (coding->dst_object));
8173       if (dst_bytes < coding->produced)
8174         {
8175           eassert (coding->produced > 0);
8176           destination = xrealloc (destination, coding->produced);
8177           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8178             move_gap_both (BEGV, BEGV_BYTE);
8179           memcpy (destination, BEGV_ADDR, coding->produced);
8180           coding->destination = destination;
8181         }
8182     }
8183
8184   if (saved_pt >= 0)
8185     {
8186       /* This is the case of:
8187          (BUFFERP (src_object) && EQ (src_object, dst_object))
8188          As we have moved PT while replacing the original buffer
8189          contents, we must recover it now.  */
8190       set_buffer_internal (XBUFFER (src_object));
8191       current_buffer->text->inhibit_shrinking = 0;
8192       if (saved_pt < from)
8193         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8194       else if (saved_pt < from + chars)
8195         TEMP_SET_PT_BOTH (from, from_byte);
8196       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8197         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8198                           saved_pt_byte + (coding->produced - bytes));
8199       else
8200         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8201                           saved_pt_byte + (coding->produced - bytes));
8202
8203       if (need_marker_adjustment)
8204         {
8205           struct Lisp_Marker *tail;
8206
8207           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8208             if (tail->need_adjustment)
8209               {
8210                 tail->need_adjustment = 0;
8211                 if (tail->insertion_type)
8212                   {
8213                     tail->bytepos = from_byte;
8214                     tail->charpos = from;
8215                   }
8216                 else
8217                   {
8218                     tail->bytepos = from_byte + coding->produced;
8219                     tail->charpos
8220                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8221                          ? tail->bytepos : from + coding->produced_char);
8222                   }
8223               }
8224         }
8225     }
8226
8227   Vdeactivate_mark = old_deactivate_mark;
8228   unbind_to (count, coding->dst_object);
8229 }
8230
8231
8232 void
8233 encode_coding_object (struct coding_system *coding,
8234                       Lisp_Object src_object,
8235                       ptrdiff_t from, ptrdiff_t from_byte,
8236                       ptrdiff_t to, ptrdiff_t to_byte,
8237                       Lisp_Object dst_object)
8238 {
8239   ptrdiff_t count = SPECPDL_INDEX ();
8240   ptrdiff_t chars = to - from;
8241   ptrdiff_t bytes = to_byte - from_byte;
8242   Lisp_Object attrs;
8243   ptrdiff_t saved_pt = -1, saved_pt_byte;
8244   bool need_marker_adjustment = 0;
8245   bool kill_src_buffer = 0;
8246   Lisp_Object old_deactivate_mark;
8247
8248   old_deactivate_mark = Vdeactivate_mark;
8249
8250   coding->src_object = src_object;
8251   coding->src_chars = chars;
8252   coding->src_bytes = bytes;
8253   coding->src_multibyte = chars < bytes;
8254
8255   attrs = CODING_ID_ATTRS (coding->id);
8256
8257   if (EQ (src_object, dst_object))
8258     {
8259       struct Lisp_Marker *tail;
8260
8261       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8262         {
8263           tail->need_adjustment
8264             = tail->charpos == (tail->insertion_type ? from : to);
8265           need_marker_adjustment |= tail->need_adjustment;
8266         }
8267     }
8268
8269   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8270     {
8271       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8272       set_buffer_internal (XBUFFER (coding->src_object));
8273       if (STRINGP (src_object))
8274         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8275       else if (BUFFERP (src_object))
8276         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8277       else
8278         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8279
8280       if (EQ (src_object, dst_object))
8281         {
8282           set_buffer_internal (XBUFFER (src_object));
8283           saved_pt = PT, saved_pt_byte = PT_BYTE;
8284           del_range_both (from, from_byte, to, to_byte, 1);
8285           set_buffer_internal (XBUFFER (coding->src_object));
8286         }
8287
8288       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8289                   make_number (BEG), make_number (Z));
8290       if (XBUFFER (coding->src_object) != current_buffer)
8291         kill_src_buffer = 1;
8292       coding->src_object = Fcurrent_buffer ();
8293       if (BEG != GPT)
8294         move_gap_both (BEG, BEG_BYTE);
8295       coding->src_chars = Z - BEG;
8296       coding->src_bytes = Z_BYTE - BEG_BYTE;
8297       coding->src_pos = BEG;
8298       coding->src_pos_byte = BEG_BYTE;
8299       coding->src_multibyte = Z < Z_BYTE;
8300     }
8301   else if (STRINGP (src_object))
8302     {
8303       code_conversion_save (0, 0);
8304       coding->src_pos = from;
8305       coding->src_pos_byte = from_byte;
8306     }
8307   else if (BUFFERP (src_object))
8308     {
8309       code_conversion_save (0, 0);
8310       set_buffer_internal (XBUFFER (src_object));
8311       if (EQ (src_object, dst_object))
8312         {
8313           saved_pt = PT, saved_pt_byte = PT_BYTE;
8314           coding->src_object = del_range_1 (from, to, 1, 1);
8315           coding->src_pos = 0;
8316           coding->src_pos_byte = 0;
8317         }
8318       else
8319         {
8320           if (from < GPT && to >= GPT)
8321             move_gap_both (from, from_byte);
8322           coding->src_pos = from;
8323           coding->src_pos_byte = from_byte;
8324         }
8325     }
8326   else
8327     {
8328       code_conversion_save (0, 0);
8329       coding->src_pos = from;
8330       coding->src_pos_byte = from_byte;
8331     }
8332
8333   if (BUFFERP (dst_object))
8334     {
8335       coding->dst_object = dst_object;
8336       if (EQ (src_object, dst_object))
8337         {
8338           coding->dst_pos = from;
8339           coding->dst_pos_byte = from_byte;
8340         }
8341       else
8342         {
8343           struct buffer *current = current_buffer;
8344
8345           set_buffer_temp (XBUFFER (dst_object));
8346           coding->dst_pos = PT;
8347           coding->dst_pos_byte = PT_BYTE;
8348           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8349           set_buffer_temp (current);
8350         }
8351       coding->dst_multibyte
8352         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8353     }
8354   else if (EQ (dst_object, Qt))
8355     {
8356       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8357       coding->dst_object = Qnil;
8358       coding->destination = xmalloc (dst_bytes);
8359       coding->dst_bytes = dst_bytes;
8360       coding->dst_multibyte = 0;
8361     }
8362   else
8363     {
8364       coding->dst_object = Qnil;
8365       coding->dst_multibyte = 0;
8366     }
8367
8368   encode_coding (coding);
8369
8370   if (EQ (dst_object, Qt))
8371     {
8372       if (BUFFERP (coding->dst_object))
8373         coding->dst_object = Fbuffer_string ();
8374       else if (coding->raw_destination)
8375         /* This is used to avoid creating huge Lisp string.
8376            NOTE: caller who sets `raw_destination' is also
8377            responsible for freeing `destination' buffer.  */
8378         coding->dst_object = Qnil;
8379       else
8380         {
8381           coding->dst_object
8382             = make_unibyte_string ((char *) coding->destination,
8383                                    coding->produced);
8384           xfree (coding->destination);
8385         }
8386     }
8387
8388   if (saved_pt >= 0)
8389     {
8390       /* This is the case of:
8391          (BUFFERP (src_object) && EQ (src_object, dst_object))
8392          As we have moved PT while replacing the original buffer
8393          contents, we must recover it now.  */
8394       set_buffer_internal (XBUFFER (src_object));
8395       if (saved_pt < from)
8396         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8397       else if (saved_pt < from + chars)
8398         TEMP_SET_PT_BOTH (from, from_byte);
8399       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8400         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8401                           saved_pt_byte + (coding->produced - bytes));
8402       else
8403         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8404                           saved_pt_byte + (coding->produced - bytes));
8405
8406       if (need_marker_adjustment)
8407         {
8408           struct Lisp_Marker *tail;
8409
8410           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8411             if (tail->need_adjustment)
8412               {
8413                 tail->need_adjustment = 0;
8414                 if (tail->insertion_type)
8415                   {
8416                     tail->bytepos = from_byte;
8417                     tail->charpos = from;
8418                   }
8419                 else
8420                   {
8421                     tail->bytepos = from_byte + coding->produced;
8422                     tail->charpos
8423                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8424                          ? tail->bytepos : from + coding->produced_char);
8425                   }
8426               }
8427         }
8428     }
8429
8430   if (kill_src_buffer)
8431     Fkill_buffer (coding->src_object);
8432
8433   Vdeactivate_mark = old_deactivate_mark;
8434   unbind_to (count, Qnil);
8435 }
8436
8437
8438 Lisp_Object
8439 preferred_coding_system (void)
8440 {
8441   int id = coding_categories[coding_priorities[0]].id;
8442
8443   return CODING_ID_NAME (id);
8444 }
8445
8446 #if defined (WINDOWSNT) || defined (CYGWIN)
8447
8448 Lisp_Object
8449 from_unicode (Lisp_Object str)
8450 {
8451   CHECK_STRING (str);
8452   if (!STRING_MULTIBYTE (str) &&
8453       SBYTES (str) & 1)
8454     {
8455       str = Fsubstring (str, make_number (0), make_number (-1));
8456     }
8457
8458   return code_convert_string_norecord (str, Qutf_16le, 0);
8459 }
8460
8461 Lisp_Object
8462 from_unicode_buffer (const wchar_t *wstr)
8463 {
8464   /* We get one of the two final null bytes for free.  */
8465   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
8466   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
8467   return from_unicode (str);
8468 }
8469
8470 wchar_t *
8471 to_unicode (Lisp_Object str, Lisp_Object *buf)
8472 {
8473   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8474   /* We need to make another copy (in addition to the one made by
8475      code_convert_string_norecord) to ensure that the final string is
8476      _doubly_ zero terminated --- that is, that the string is
8477      terminated by two zero bytes and one utf-16le null character.
8478      Because strings are already terminated with a single zero byte,
8479      we just add one additional zero. */
8480   str = make_uninit_string (SBYTES (*buf) + 1);
8481   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8482   SDATA (str) [SBYTES (*buf)] = '\0';
8483   *buf = str;
8484   return WCSDATA (*buf);
8485 }
8486
8487 #endif /* WINDOWSNT || CYGWIN */
8488
8489 \f
8490 #ifdef emacs
8491 /*** 11. Emacs Lisp library functions ***/
8492
8493 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8494        doc: /* Return t if OBJECT is nil or a coding-system.
8495 See the documentation of `define-coding-system' for information
8496 about coding-system objects.  */)
8497   (Lisp_Object object)
8498 {
8499   if (NILP (object)
8500       || CODING_SYSTEM_ID (object) >= 0)
8501     return Qt;
8502   if (! SYMBOLP (object)
8503       || NILP (Fget (object, Qcoding_system_define_form)))
8504     return Qnil;
8505   return Qt;
8506 }
8507
8508 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8509        Sread_non_nil_coding_system, 1, 1, 0,
8510        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8511   (Lisp_Object prompt)
8512 {
8513   Lisp_Object val;
8514   do
8515     {
8516       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8517                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8518     }
8519   while (SCHARS (val) == 0);
8520   return (Fintern (val, Qnil));
8521 }
8522
8523 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8524        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8525 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8526 Ignores case when completing coding systems (all Emacs coding systems
8527 are lower-case).  */)
8528   (Lisp_Object prompt, Lisp_Object default_coding_system)
8529 {
8530   Lisp_Object val;
8531   ptrdiff_t count = SPECPDL_INDEX ();
8532
8533   if (SYMBOLP (default_coding_system))
8534     default_coding_system = SYMBOL_NAME (default_coding_system);
8535   specbind (Qcompletion_ignore_case, Qt);
8536   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8537                           Qt, Qnil, Qcoding_system_history,
8538                           default_coding_system, Qnil);
8539   unbind_to (count, Qnil);
8540   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8541 }
8542
8543 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8544        1, 1, 0,
8545        doc: /* Check validity of CODING-SYSTEM.
8546 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8547 It is valid if it is nil or a symbol defined as a coding system by the
8548 function `define-coding-system'.  */)
8549   (Lisp_Object coding_system)
8550 {
8551   Lisp_Object define_form;
8552
8553   define_form = Fget (coding_system, Qcoding_system_define_form);
8554   if (! NILP (define_form))
8555     {
8556       Fput (coding_system, Qcoding_system_define_form, Qnil);
8557       safe_eval (define_form);
8558     }
8559   if (!NILP (Fcoding_system_p (coding_system)))
8560     return coding_system;
8561   xsignal1 (Qcoding_system_error, coding_system);
8562 }
8563
8564 \f
8565 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8566    HIGHEST, return the coding system of the highest
8567    priority among the detected coding systems.  Otherwise return a
8568    list of detected coding systems sorted by their priorities.  If
8569    MULTIBYTEP, it is assumed that the bytes are in correct
8570    multibyte form but contain only ASCII and eight-bit chars.
8571    Otherwise, the bytes are raw bytes.
8572
8573    CODING-SYSTEM controls the detection as below:
8574
8575    If it is nil, detect both text-format and eol-format.  If the
8576    text-format part of CODING-SYSTEM is already specified
8577    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8578    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8579    detect only text-format.  */
8580
8581 Lisp_Object
8582 detect_coding_system (const unsigned char *src,
8583                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8584                       bool highest, bool multibytep,
8585                       Lisp_Object coding_system)
8586 {
8587   const unsigned char *src_end = src + src_bytes;
8588   Lisp_Object attrs, eol_type;
8589   Lisp_Object val = Qnil;
8590   struct coding_system coding;
8591   ptrdiff_t id;
8592   struct coding_detection_info detect_info;
8593   enum coding_category base_category;
8594   bool null_byte_found = 0, eight_bit_found = 0;
8595
8596   if (NILP (coding_system))
8597     coding_system = Qundecided;
8598   setup_coding_system (coding_system, &coding);
8599   attrs = CODING_ID_ATTRS (coding.id);
8600   eol_type = CODING_ID_EOL_TYPE (coding.id);
8601   coding_system = CODING_ATTR_BASE_NAME (attrs);
8602
8603   coding.source = src;
8604   coding.src_chars = src_chars;
8605   coding.src_bytes = src_bytes;
8606   coding.src_multibyte = multibytep;
8607   coding.consumed = 0;
8608   coding.mode |= CODING_MODE_LAST_BLOCK;
8609   coding.head_ascii = 0;
8610
8611   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8612
8613   /* At first, detect text-format if necessary.  */
8614   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8615   if (base_category == coding_category_undecided)
8616     {
8617       enum coding_category category UNINIT;
8618       struct coding_system *this UNINIT;
8619       int c, i;
8620       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8621                                        inhibit_null_byte_detection);
8622       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8623                                        inhibit_iso_escape_detection);
8624       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8625
8626       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8627       for (; src < src_end; src++)
8628         {
8629           c = *src;
8630           if (c & 0x80)
8631             {
8632               eight_bit_found = 1;
8633               if (null_byte_found)
8634                 break;
8635             }
8636           else if (c < 0x20)
8637             {
8638               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8639                   && ! inhibit_ied
8640                   && ! detect_info.checked)
8641                 {
8642                   if (detect_coding_iso_2022 (&coding, &detect_info))
8643                     {
8644                       /* We have scanned the whole data.  */
8645                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8646                         {
8647                           /* We didn't find an 8-bit code.  We may
8648                              have found a null-byte, but it's very
8649                              rare that a binary file conforms to
8650                              ISO-2022.  */
8651                           src = src_end;
8652                           coding.head_ascii = src - coding.source;
8653                         }
8654                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8655                       break;
8656                     }
8657                 }
8658               else if (! c && !inhibit_nbd)
8659                 {
8660                   null_byte_found = 1;
8661                   if (eight_bit_found)
8662                     break;
8663                 }
8664               if (! eight_bit_found)
8665                 coding.head_ascii++;
8666             }
8667           else if (! eight_bit_found)
8668             coding.head_ascii++;
8669         }
8670
8671       if (null_byte_found || eight_bit_found
8672           || coding.head_ascii < coding.src_bytes
8673           || detect_info.found)
8674         {
8675           if (coding.head_ascii == coding.src_bytes)
8676             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8677             for (i = 0; i < coding_category_raw_text; i++)
8678               {
8679                 category = coding_priorities[i];
8680                 this = coding_categories + category;
8681                 if (detect_info.found & (1 << category))
8682                   break;
8683               }
8684           else
8685             {
8686               if (null_byte_found)
8687                 {
8688                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8689                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8690                 }
8691               else if (prefer_utf_8
8692                        && detect_coding_utf_8 (&coding, &detect_info))
8693                 {
8694                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8695                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8696                 }
8697               for (i = 0; i < coding_category_raw_text; i++)
8698                 {
8699                   category = coding_priorities[i];
8700                   this = coding_categories + category;
8701
8702                   if (this->id < 0)
8703                     {
8704                       /* No coding system of this category is defined.  */
8705                       detect_info.rejected |= (1 << category);
8706                     }
8707                   else if (category >= coding_category_raw_text)
8708                     continue;
8709                   else if (detect_info.checked & (1 << category))
8710                     {
8711                       if (highest
8712                           && (detect_info.found & (1 << category)))
8713                         break;
8714                     }
8715                   else if ((*(this->detector)) (&coding, &detect_info)
8716                            && highest
8717                            && (detect_info.found & (1 << category)))
8718                     {
8719                       if (category == coding_category_utf_16_auto)
8720                         {
8721                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8722                             category = coding_category_utf_16_le;
8723                           else
8724                             category = coding_category_utf_16_be;
8725                         }
8726                       break;
8727                     }
8728                 }
8729             }
8730         }
8731
8732       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8733           || null_byte_found)
8734         {
8735           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8736           id = CODING_SYSTEM_ID (Qno_conversion);
8737           val = list1 (make_number (id));
8738         }
8739       else if (! detect_info.rejected && ! detect_info.found)
8740         {
8741           detect_info.found = CATEGORY_MASK_ANY;
8742           id = coding_categories[coding_category_undecided].id;
8743           val = list1 (make_number (id));
8744         }
8745       else if (highest)
8746         {
8747           if (detect_info.found)
8748             {
8749               detect_info.found = 1 << category;
8750               val = list1 (make_number (this->id));
8751             }
8752           else
8753             for (i = 0; i < coding_category_raw_text; i++)
8754               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8755                 {
8756                   detect_info.found = 1 << coding_priorities[i];
8757                   id = coding_categories[coding_priorities[i]].id;
8758                   val = list1 (make_number (id));
8759                   break;
8760                 }
8761         }
8762       else
8763         {
8764           int mask = detect_info.rejected | detect_info.found;
8765           int found = 0;
8766
8767           for (i = coding_category_raw_text - 1; i >= 0; i--)
8768             {
8769               category = coding_priorities[i];
8770               if (! (mask & (1 << category)))
8771                 {
8772                   found |= 1 << category;
8773                   id = coding_categories[category].id;
8774                   if (id >= 0)
8775                     val = list1 (make_number (id));
8776                 }
8777             }
8778           for (i = coding_category_raw_text - 1; i >= 0; i--)
8779             {
8780               category = coding_priorities[i];
8781               if (detect_info.found & (1 << category))
8782                 {
8783                   id = coding_categories[category].id;
8784                   val = Fcons (make_number (id), val);
8785                 }
8786             }
8787           detect_info.found |= found;
8788         }
8789     }
8790   else if (base_category == coding_category_utf_8_auto)
8791     {
8792       if (detect_coding_utf_8 (&coding, &detect_info))
8793         {
8794           struct coding_system *this;
8795
8796           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8797             this = coding_categories + coding_category_utf_8_sig;
8798           else
8799             this = coding_categories + coding_category_utf_8_nosig;
8800           val = list1 (make_number (this->id));
8801         }
8802     }
8803   else if (base_category == coding_category_utf_16_auto)
8804     {
8805       if (detect_coding_utf_16 (&coding, &detect_info))
8806         {
8807           struct coding_system *this;
8808
8809           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8810             this = coding_categories + coding_category_utf_16_le;
8811           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8812             this = coding_categories + coding_category_utf_16_be;
8813           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8814             this = coding_categories + coding_category_utf_16_be_nosig;
8815           else
8816             this = coding_categories + coding_category_utf_16_le_nosig;
8817           val = list1 (make_number (this->id));
8818         }
8819     }
8820   else
8821     {
8822       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8823       val = list1 (make_number (coding.id));
8824     }
8825
8826   /* Then, detect eol-format if necessary.  */
8827   {
8828     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8829     Lisp_Object tail;
8830
8831     if (VECTORP (eol_type))
8832       {
8833         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8834           {
8835             if (null_byte_found)
8836               normal_eol = EOL_SEEN_LF;
8837             else
8838               normal_eol = detect_eol (coding.source, src_bytes,
8839                                        coding_category_raw_text);
8840           }
8841         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8842                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8843           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8844                                       coding_category_utf_16_be);
8845         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8846                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8847           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8848                                       coding_category_utf_16_le);
8849       }
8850     else
8851       {
8852         if (EQ (eol_type, Qunix))
8853           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8854         else if (EQ (eol_type, Qdos))
8855           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8856         else
8857           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8858       }
8859
8860     for (tail = val; CONSP (tail); tail = XCDR (tail))
8861       {
8862         enum coding_category category;
8863         int this_eol;
8864
8865         id = XINT (XCAR (tail));
8866         attrs = CODING_ID_ATTRS (id);
8867         category = XINT (CODING_ATTR_CATEGORY (attrs));
8868         eol_type = CODING_ID_EOL_TYPE (id);
8869         if (VECTORP (eol_type))
8870           {
8871             if (category == coding_category_utf_16_be
8872                 || category == coding_category_utf_16_be_nosig)
8873               this_eol = utf_16_be_eol;
8874             else if (category == coding_category_utf_16_le
8875                      || category == coding_category_utf_16_le_nosig)
8876               this_eol = utf_16_le_eol;
8877             else
8878               this_eol = normal_eol;
8879
8880             if (this_eol == EOL_SEEN_LF)
8881               XSETCAR (tail, AREF (eol_type, 0));
8882             else if (this_eol == EOL_SEEN_CRLF)
8883               XSETCAR (tail, AREF (eol_type, 1));
8884             else if (this_eol == EOL_SEEN_CR)
8885               XSETCAR (tail, AREF (eol_type, 2));
8886             else
8887               XSETCAR (tail, CODING_ID_NAME (id));
8888           }
8889         else
8890           XSETCAR (tail, CODING_ID_NAME (id));
8891       }
8892   }
8893
8894   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8895 }
8896
8897
8898 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8899        2, 3, 0,
8900        doc: /* Detect coding system of the text in the region between START and END.
8901 Return a list of possible coding systems ordered by priority.
8902 The coding systems to try and their priorities follows what
8903 the function `coding-system-priority-list' (which see) returns.
8904
8905 If only ASCII characters are found (except for such ISO-2022 control
8906 characters as ESC), it returns a list of single element `undecided'
8907 or its subsidiary coding system according to a detected end-of-line
8908 format.
8909
8910 If optional argument HIGHEST is non-nil, return the coding system of
8911 highest priority.  */)
8912   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8913 {
8914   ptrdiff_t from, to;
8915   ptrdiff_t from_byte, to_byte;
8916
8917   validate_region (&start, &end);
8918   from = XINT (start), to = XINT (end);
8919   from_byte = CHAR_TO_BYTE (from);
8920   to_byte = CHAR_TO_BYTE (to);
8921
8922   if (from < GPT && to >= GPT)
8923     move_gap_both (to, to_byte);
8924
8925   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8926                                to - from, to_byte - from_byte,
8927                                !NILP (highest),
8928                                !NILP (BVAR (current_buffer
8929                                       , enable_multibyte_characters)),
8930                                Qnil);
8931 }
8932
8933 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8934        1, 2, 0,
8935        doc: /* Detect coding system of the text in STRING.
8936 Return a list of possible coding systems ordered by priority.
8937 The coding systems to try and their priorities follows what
8938 the function `coding-system-priority-list' (which see) returns.
8939
8940 If only ASCII characters are found (except for such ISO-2022 control
8941 characters as ESC), it returns a list of single element `undecided'
8942 or its subsidiary coding system according to a detected end-of-line
8943 format.
8944
8945 If optional argument HIGHEST is non-nil, return the coding system of
8946 highest priority.  */)
8947   (Lisp_Object string, Lisp_Object highest)
8948 {
8949   CHECK_STRING (string);
8950
8951   return detect_coding_system (SDATA (string),
8952                                SCHARS (string), SBYTES (string),
8953                                !NILP (highest), STRING_MULTIBYTE (string),
8954                                Qnil);
8955 }
8956
8957
8958 static bool
8959 char_encodable_p (int c, Lisp_Object attrs)
8960 {
8961   Lisp_Object tail;
8962   struct charset *charset;
8963   Lisp_Object translation_table;
8964
8965   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8966   if (! NILP (translation_table))
8967     c = translate_char (translation_table, c);
8968   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8969        CONSP (tail); tail = XCDR (tail))
8970     {
8971       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8972       if (CHAR_CHARSET_P (c, charset))
8973         break;
8974     }
8975   return (! NILP (tail));
8976 }
8977
8978
8979 /* Return a list of coding systems that safely encode the text between
8980    START and END.  If EXCLUDE is non-nil, it is a list of coding
8981    systems not to check.  The returned list doesn't contain any such
8982    coding systems.  In any case, if the text contains only ASCII or is
8983    unibyte, return t.  */
8984
8985 DEFUN ("find-coding-systems-region-internal",
8986        Ffind_coding_systems_region_internal,
8987        Sfind_coding_systems_region_internal, 2, 3, 0,
8988        doc: /* Internal use only.  */)
8989   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8990 {
8991   Lisp_Object coding_attrs_list, safe_codings;
8992   ptrdiff_t start_byte, end_byte;
8993   const unsigned char *p, *pbeg, *pend;
8994   int c;
8995   Lisp_Object tail, elt, work_table;
8996
8997   if (STRINGP (start))
8998     {
8999       if (!STRING_MULTIBYTE (start)
9000           || SCHARS (start) == SBYTES (start))
9001         return Qt;
9002       start_byte = 0;
9003       end_byte = SBYTES (start);
9004     }
9005   else
9006     {
9007       CHECK_NUMBER_COERCE_MARKER (start);
9008       CHECK_NUMBER_COERCE_MARKER (end);
9009       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9010         args_out_of_range (start, end);
9011       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9012         return Qt;
9013       start_byte = CHAR_TO_BYTE (XINT (start));
9014       end_byte = CHAR_TO_BYTE (XINT (end));
9015       if (XINT (end) - XINT (start) == end_byte - start_byte)
9016         return Qt;
9017
9018       if (XINT (start) < GPT && XINT (end) > GPT)
9019         {
9020           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9021             move_gap_both (XINT (start), start_byte);
9022           else
9023             move_gap_both (XINT (end), end_byte);
9024         }
9025     }
9026
9027   coding_attrs_list = Qnil;
9028   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9029     if (NILP (exclude)
9030         || NILP (Fmemq (XCAR (tail), exclude)))
9031       {
9032         Lisp_Object attrs;
9033
9034         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9035         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9036           {
9037             ASET (attrs, coding_attr_trans_tbl,
9038                   get_translation_table (attrs, 1, NULL));
9039             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9040           }
9041       }
9042
9043   if (STRINGP (start))
9044     p = pbeg = SDATA (start);
9045   else
9046     p = pbeg = BYTE_POS_ADDR (start_byte);
9047   pend = p + (end_byte - start_byte);
9048
9049   while (p < pend && ASCII_CHAR_P (*p)) p++;
9050   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9051
9052   work_table = Fmake_char_table (Qnil, Qnil);
9053   while (p < pend)
9054     {
9055       if (ASCII_CHAR_P (*p))
9056         p++;
9057       else
9058         {
9059           c = STRING_CHAR_ADVANCE (p);
9060           if (!NILP (char_table_ref (work_table, c)))
9061             /* This character was already checked.  Ignore it.  */
9062             continue;
9063
9064           charset_map_loaded = 0;
9065           for (tail = coding_attrs_list; CONSP (tail);)
9066             {
9067               elt = XCAR (tail);
9068               if (NILP (elt))
9069                 tail = XCDR (tail);
9070               else if (char_encodable_p (c, elt))
9071                 tail = XCDR (tail);
9072               else if (CONSP (XCDR (tail)))
9073                 {
9074                   XSETCAR (tail, XCAR (XCDR (tail)));
9075                   XSETCDR (tail, XCDR (XCDR (tail)));
9076                 }
9077               else
9078                 {
9079                   XSETCAR (tail, Qnil);
9080                   tail = XCDR (tail);
9081                 }
9082             }
9083           if (charset_map_loaded)
9084             {
9085               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9086
9087               if (STRINGP (start))
9088                 pbeg = SDATA (start);
9089               else
9090                 pbeg = BYTE_POS_ADDR (start_byte);
9091               p = pbeg + p_offset;
9092               pend = pbeg + pend_offset;
9093             }
9094           char_table_set (work_table, c, Qt);
9095         }
9096     }
9097
9098   safe_codings = list2 (Qraw_text, Qno_conversion);
9099   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9100     if (! NILP (XCAR (tail)))
9101       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9102
9103   return safe_codings;
9104 }
9105
9106
9107 DEFUN ("unencodable-char-position", Funencodable_char_position,
9108        Sunencodable_char_position, 3, 5, 0,
9109        doc: /* Return position of first un-encodable character in a region.
9110 START and END specify the region and CODING-SYSTEM specifies the
9111 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9112
9113 If optional 4th argument COUNT is non-nil, it specifies at most how
9114 many un-encodable characters to search.  In this case, the value is a
9115 list of positions.
9116
9117 If optional 5th argument STRING is non-nil, it is a string to search
9118 for un-encodable characters.  In that case, START and END are indexes
9119 to the string and treated as in `substring'.  */)
9120   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9121    Lisp_Object count, Lisp_Object string)
9122 {
9123   EMACS_INT n;
9124   struct coding_system coding;
9125   Lisp_Object attrs, charset_list, translation_table;
9126   Lisp_Object positions;
9127   ptrdiff_t from, to;
9128   const unsigned char *p, *stop, *pend;
9129   bool ascii_compatible;
9130
9131   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9132   attrs = CODING_ID_ATTRS (coding.id);
9133   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9134     return Qnil;
9135   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9136   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9137   translation_table = get_translation_table (attrs, 1, NULL);
9138
9139   if (NILP (string))
9140     {
9141       validate_region (&start, &end);
9142       from = XINT (start);
9143       to = XINT (end);
9144       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9145           || (ascii_compatible
9146               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9147         return Qnil;
9148       p = CHAR_POS_ADDR (from);
9149       pend = CHAR_POS_ADDR (to);
9150       if (from < GPT && to >= GPT)
9151         stop = GPT_ADDR;
9152       else
9153         stop = pend;
9154     }
9155   else
9156     {
9157       CHECK_STRING (string);
9158       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9159       if (! STRING_MULTIBYTE (string))
9160         return Qnil;
9161       p = SDATA (string) + string_char_to_byte (string, from);
9162       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9163       if (ascii_compatible && (to - from) == (pend - p))
9164         return Qnil;
9165     }
9166
9167   if (NILP (count))
9168     n = 1;
9169   else
9170     {
9171       CHECK_NATNUM (count);
9172       n = XINT (count);
9173     }
9174
9175   positions = Qnil;
9176   charset_map_loaded = 0;
9177   while (1)
9178     {
9179       int c;
9180
9181       if (ascii_compatible)
9182         while (p < stop && ASCII_CHAR_P (*p))
9183           p++, from++;
9184       if (p >= stop)
9185         {
9186           if (p >= pend)
9187             break;
9188           stop = pend;
9189           p = GAP_END_ADDR;
9190         }
9191
9192       c = STRING_CHAR_ADVANCE (p);
9193       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9194           && ! char_charset (translate_char (translation_table, c),
9195                              charset_list, NULL))
9196         {
9197           positions = Fcons (make_number (from), positions);
9198           n--;
9199           if (n == 0)
9200             break;
9201         }
9202
9203       from++;
9204       if (charset_map_loaded && NILP (string))
9205         {
9206           p = CHAR_POS_ADDR (from);
9207           pend = CHAR_POS_ADDR (to);
9208           if (from < GPT && to >= GPT)
9209             stop = GPT_ADDR;
9210           else
9211             stop = pend;
9212           charset_map_loaded = 0;
9213         }
9214     }
9215
9216   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9217 }
9218
9219
9220 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9221        Scheck_coding_systems_region, 3, 3, 0,
9222        doc: /* Check if the region is encodable by coding systems.
9223
9224 START and END are buffer positions specifying the region.
9225 CODING-SYSTEM-LIST is a list of coding systems to check.
9226
9227 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9228 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9229 whole region, POS0, POS1, ... are buffer positions where non-encodable
9230 characters are found.
9231
9232 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9233 value is nil.
9234
9235 START may be a string.  In that case, check if the string is
9236 encodable, and the value contains indices to the string instead of
9237 buffer positions.  END is ignored.
9238
9239 If the current buffer (or START if it is a string) is unibyte, the value
9240 is nil.  */)
9241   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9242 {
9243   Lisp_Object list;
9244   ptrdiff_t start_byte, end_byte;
9245   ptrdiff_t pos;
9246   const unsigned char *p, *pbeg, *pend;
9247   int c;
9248   Lisp_Object tail, elt, attrs;
9249
9250   if (STRINGP (start))
9251     {
9252       if (!STRING_MULTIBYTE (start)
9253           || SCHARS (start) == SBYTES (start))
9254         return Qnil;
9255       start_byte = 0;
9256       end_byte = SBYTES (start);
9257       pos = 0;
9258     }
9259   else
9260     {
9261       CHECK_NUMBER_COERCE_MARKER (start);
9262       CHECK_NUMBER_COERCE_MARKER (end);
9263       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9264         args_out_of_range (start, end);
9265       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9266         return Qnil;
9267       start_byte = CHAR_TO_BYTE (XINT (start));
9268       end_byte = CHAR_TO_BYTE (XINT (end));
9269       if (XINT (end) - XINT (start) == end_byte - start_byte)
9270         return Qnil;
9271
9272       if (XINT (start) < GPT && XINT (end) > GPT)
9273         {
9274           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9275             move_gap_both (XINT (start), start_byte);
9276           else
9277             move_gap_both (XINT (end), end_byte);
9278         }
9279       pos = XINT (start);
9280     }
9281
9282   list = Qnil;
9283   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9284     {
9285       elt = XCAR (tail);
9286       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9287       ASET (attrs, coding_attr_trans_tbl,
9288             get_translation_table (attrs, 1, NULL));
9289       list = Fcons (list2 (elt, attrs), list);
9290     }
9291
9292   if (STRINGP (start))
9293     p = pbeg = SDATA (start);
9294   else
9295     p = pbeg = BYTE_POS_ADDR (start_byte);
9296   pend = p + (end_byte - start_byte);
9297
9298   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9299   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9300
9301   while (p < pend)
9302     {
9303       if (ASCII_CHAR_P (*p))
9304         p++;
9305       else
9306         {
9307           c = STRING_CHAR_ADVANCE (p);
9308
9309           charset_map_loaded = 0;
9310           for (tail = list; CONSP (tail); tail = XCDR (tail))
9311             {
9312               elt = XCDR (XCAR (tail));
9313               if (! char_encodable_p (c, XCAR (elt)))
9314                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9315             }
9316           if (charset_map_loaded)
9317             {
9318               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9319
9320               if (STRINGP (start))
9321                 pbeg = SDATA (start);
9322               else
9323                 pbeg = BYTE_POS_ADDR (start_byte);
9324               p = pbeg + p_offset;
9325               pend = pbeg + pend_offset;
9326             }
9327         }
9328       pos++;
9329     }
9330
9331   tail = list;
9332   list = Qnil;
9333   for (; CONSP (tail); tail = XCDR (tail))
9334     {
9335       elt = XCAR (tail);
9336       if (CONSP (XCDR (XCDR (elt))))
9337         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9338                       list);
9339     }
9340
9341   return list;
9342 }
9343
9344
9345 static Lisp_Object
9346 code_convert_region (Lisp_Object start, Lisp_Object end,
9347                      Lisp_Object coding_system, Lisp_Object dst_object,
9348                      bool encodep, bool norecord)
9349 {
9350   struct coding_system coding;
9351   ptrdiff_t from, from_byte, to, to_byte;
9352   Lisp_Object src_object;
9353
9354   if (NILP (coding_system))
9355     coding_system = Qno_conversion;
9356   else
9357     CHECK_CODING_SYSTEM (coding_system);
9358   src_object = Fcurrent_buffer ();
9359   if (NILP (dst_object))
9360     dst_object = src_object;
9361   else if (! EQ (dst_object, Qt))
9362     CHECK_BUFFER (dst_object);
9363
9364   validate_region (&start, &end);
9365   from = XFASTINT (start);
9366   from_byte = CHAR_TO_BYTE (from);
9367   to = XFASTINT (end);
9368   to_byte = CHAR_TO_BYTE (to);
9369
9370   setup_coding_system (coding_system, &coding);
9371   coding.mode |= CODING_MODE_LAST_BLOCK;
9372
9373   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9374     {
9375       struct buffer *buf = XBUFFER (dst_object);
9376       ptrdiff_t buf_pt = BUF_PT (buf);
9377
9378       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9379     }
9380
9381   if (encodep)
9382     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9383                           dst_object);
9384   else
9385     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9386                           dst_object);
9387   if (! norecord)
9388     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9389
9390   return (BUFFERP (dst_object)
9391           ? make_number (coding.produced_char)
9392           : coding.dst_object);
9393 }
9394
9395
9396 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9397        3, 4, "r\nzCoding system: ",
9398        doc: /* Decode the current region from the specified coding system.
9399 When called from a program, takes four arguments:
9400         START, END, CODING-SYSTEM, and DESTINATION.
9401 START and END are buffer positions.
9402
9403 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9404 If nil, the region between START and END is replaced by the decoded text.
9405 If buffer, the decoded text is inserted in that buffer after point (point
9406 does not move).
9407 In those cases, the length of the decoded text is returned.
9408 If DESTINATION is t, the decoded text is returned.
9409
9410 This function sets `last-coding-system-used' to the precise coding system
9411 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9412 not fully specified.)  */)
9413   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9414 {
9415   return code_convert_region (start, end, coding_system, destination, 0, 0);
9416 }
9417
9418 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9419        3, 4, "r\nzCoding system: ",
9420        doc: /* Encode the current region by specified coding system.
9421 When called from a program, takes four arguments:
9422         START, END, CODING-SYSTEM and DESTINATION.
9423 START and END are buffer positions.
9424
9425 Optional 4th argument DESTINATION specifies where the encoded text goes.
9426 If nil, the region between START and END is replaced by the encoded text.
9427 If buffer, the encoded text is inserted in that buffer after point (point
9428 does not move).
9429 In those cases, the length of the encoded text is returned.
9430 If DESTINATION is t, the encoded text is returned.
9431
9432 This function sets `last-coding-system-used' to the precise coding system
9433 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9434 not fully specified.)  */)
9435   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9436 {
9437   return code_convert_region (start, end, coding_system, destination, 1, 0);
9438 }
9439
9440 Lisp_Object
9441 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9442                      Lisp_Object dst_object, bool encodep, bool nocopy,
9443                      bool norecord)
9444 {
9445   struct coding_system coding;
9446   ptrdiff_t chars, bytes;
9447
9448   CHECK_STRING (string);
9449   if (NILP (coding_system))
9450     {
9451       if (! norecord)
9452         Vlast_coding_system_used = Qno_conversion;
9453       if (NILP (dst_object))
9454         return (nocopy ? Fcopy_sequence (string) : string);
9455     }
9456
9457   if (NILP (coding_system))
9458     coding_system = Qno_conversion;
9459   else
9460     CHECK_CODING_SYSTEM (coding_system);
9461   if (NILP (dst_object))
9462     dst_object = Qt;
9463   else if (! EQ (dst_object, Qt))
9464     CHECK_BUFFER (dst_object);
9465
9466   setup_coding_system (coding_system, &coding);
9467   coding.mode |= CODING_MODE_LAST_BLOCK;
9468   chars = SCHARS (string);
9469   bytes = SBYTES (string);
9470
9471   if (BUFFERP (dst_object))
9472     {
9473       struct buffer *buf = XBUFFER (dst_object);
9474       ptrdiff_t buf_pt = BUF_PT (buf);
9475
9476       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9477     }
9478
9479   if (encodep)
9480     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9481   else
9482     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9483   if (! norecord)
9484     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9485
9486   return (BUFFERP (dst_object)
9487           ? make_number (coding.produced_char)
9488           : coding.dst_object);
9489 }
9490
9491
9492 /* Encode or decode STRING according to CODING_SYSTEM.
9493    Do not set Vlast_coding_system_used.
9494
9495    This function is called only from macros DECODE_FILE and
9496    ENCODE_FILE, thus we ignore character composition.  */
9497
9498 Lisp_Object
9499 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9500                               bool encodep)
9501 {
9502   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9503 }
9504
9505 /* Encode or decode a file name, to or from a unibyte string suitable
9506    for passing to C library functions.  */
9507 Lisp_Object
9508 decode_file_name (Lisp_Object fname)
9509 {
9510 #ifdef WINDOWSNT
9511   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9512      converts the file names either to UTF-16LE or to the system ANSI
9513      codepage internally, depending on the underlying OS; see w32.c.  */
9514   if (! NILP (Fcoding_system_p (Qutf_8)))
9515     return code_convert_string_norecord (fname, Qutf_8, 0);
9516   return fname;
9517 #else  /* !WINDOWSNT */
9518   if (! NILP (Vfile_name_coding_system))
9519     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9520   else if (! NILP (Vdefault_file_name_coding_system))
9521     return code_convert_string_norecord (fname,
9522                                          Vdefault_file_name_coding_system, 0);
9523   else
9524     return fname;
9525 #endif
9526 }
9527
9528 Lisp_Object
9529 encode_file_name (Lisp_Object fname)
9530 {
9531   /* This is especially important during bootstrap and dumping, when
9532      file-name encoding is not yet known, and therefore any non-ASCII
9533      file names are unibyte strings, and could only be thrashed if we
9534      try to encode them.  */
9535   if (!STRING_MULTIBYTE (fname))
9536     return fname;
9537 #ifdef WINDOWSNT
9538   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9539      converts the file names either to UTF-16LE or to the system ANSI
9540      codepage internally, depending on the underlying OS; see w32.c.  */
9541   if (! NILP (Fcoding_system_p (Qutf_8)))
9542     return code_convert_string_norecord (fname, Qutf_8, 1);
9543   return fname;
9544 #else  /* !WINDOWSNT */
9545   if (! NILP (Vfile_name_coding_system))
9546     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9547   else if (! NILP (Vdefault_file_name_coding_system))
9548     return code_convert_string_norecord (fname,
9549                                          Vdefault_file_name_coding_system, 1);
9550   else
9551     return fname;
9552 #endif
9553 }
9554
9555 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9556        2, 4, 0,
9557        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9558
9559 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9560 if the decoding operation is trivial.
9561
9562 Optional fourth arg BUFFER non-nil means that the decoded text is
9563 inserted in that buffer after point (point does not move).  In this
9564 case, the return value is the length of the decoded text.
9565
9566 This function sets `last-coding-system-used' to the precise coding system
9567 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9568 not fully specified.)  */)
9569   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9570 {
9571   return code_convert_string (string, coding_system, buffer,
9572                               0, ! NILP (nocopy), 0);
9573 }
9574
9575 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9576        2, 4, 0,
9577        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9578
9579 Optional third arg NOCOPY non-nil means it is OK to return STRING
9580 itself if the encoding operation is trivial.
9581
9582 Optional fourth arg BUFFER non-nil means that the encoded text is
9583 inserted in that buffer after point (point does not move).  In this
9584 case, the return value is the length of the encoded text.
9585
9586 This function sets `last-coding-system-used' to the precise coding system
9587 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9588 not fully specified.)  */)
9589   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9590 {
9591   return code_convert_string (string, coding_system, buffer,
9592                               1, ! NILP (nocopy), 0);
9593 }
9594
9595 \f
9596 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9597        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9598 Return the corresponding character.  */)
9599   (Lisp_Object code)
9600 {
9601   Lisp_Object spec, attrs, val;
9602   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9603   EMACS_INT ch;
9604   int c;
9605
9606   CHECK_NATNUM (code);
9607   ch = XFASTINT (code);
9608   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9609   attrs = AREF (spec, 0);
9610
9611   if (ASCII_CHAR_P (ch)
9612       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9613     return code;
9614
9615   val = CODING_ATTR_CHARSET_LIST (attrs);
9616   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9617   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9618   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9619
9620   if (ch <= 0x7F)
9621     {
9622       c = ch;
9623       charset = charset_roman;
9624     }
9625   else if (ch >= 0xA0 && ch < 0xDF)
9626     {
9627       c = ch - 0x80;
9628       charset = charset_kana;
9629     }
9630   else
9631     {
9632       EMACS_INT c1 = ch >> 8;
9633       int c2 = ch & 0xFF;
9634
9635       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9636           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9637         error ("Invalid code: %"pI"d", ch);
9638       c = ch;
9639       SJIS_TO_JIS (c);
9640       charset = charset_kanji;
9641     }
9642   c = DECODE_CHAR (charset, c);
9643   if (c < 0)
9644     error ("Invalid code: %"pI"d", ch);
9645   return make_number (c);
9646 }
9647
9648
9649 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9650        doc: /* Encode a Japanese character CH to shift_jis encoding.
9651 Return the corresponding code in SJIS.  */)
9652   (Lisp_Object ch)
9653 {
9654   Lisp_Object spec, attrs, charset_list;
9655   int c;
9656   struct charset *charset;
9657   unsigned code;
9658
9659   CHECK_CHARACTER (ch);
9660   c = XFASTINT (ch);
9661   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9662   attrs = AREF (spec, 0);
9663
9664   if (ASCII_CHAR_P (c)
9665       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9666     return ch;
9667
9668   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9669   charset = char_charset (c, charset_list, &code);
9670   if (code == CHARSET_INVALID_CODE (charset))
9671     error ("Can't encode by shift_jis encoding: %c", c);
9672   JIS_TO_SJIS (code);
9673
9674   return make_number (code);
9675 }
9676
9677 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9678        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9679 Return the corresponding character.  */)
9680   (Lisp_Object code)
9681 {
9682   Lisp_Object spec, attrs, val;
9683   struct charset *charset_roman, *charset_big5, *charset;
9684   EMACS_INT ch;
9685   int c;
9686
9687   CHECK_NATNUM (code);
9688   ch = XFASTINT (code);
       /* SPEC receives the spec of the coding system named by
          Vbig5_coding_system; its element 0 is the attribute vector.  */
9689   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9690   attrs = AREF (spec, 0);
9691
       /* With an ASCII compatible coding system, an ASCII code decodes
          to itself.  */
9692   if (ASCII_CHAR_P (ch)
9693       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9694     return code;
9695
       /* The charset list of a big5 coding system has two entries: the
          one-byte charset first, then the two-byte Big5 charset.  */
9696   val = CODING_ATTR_CHARSET_LIST (attrs);
9697   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9698   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9699
9700   if (ch <= 0x7F)
9701     {
9702       c = ch;
9703       charset = charset_roman;
9704     }
9705   else
9706     {
           /* Split CH into lead byte B1 and trail byte B2.  The trail
              byte must be taken with a full 8-bit mask: a 7-bit mask
              folds 0xA1..0xFE onto 0x21..0x7E, which made the upper
              range tests below unreachable and rejected the valid trail
              bytes 0xA1..0xBF.  */
9707       EMACS_INT b1 = ch >> 8;
9708       int b2 = ch & 0xFF;
           /* Accept lead bytes 0xA1..0xFE and trail bytes 0x40..0x7E or
              0xA1..0xFE; anything else is not a Big5 code.  */
9709       if (b1 < 0xA1 || b1 > 0xFE
9710           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9711         error ("Invalid code: %"pI"d", ch);
9712       c = ch;
9713       charset = charset_big5;
9714     }
       /* A negative result from DECODE_CHAR means the code has no
          character in the chosen charset.  */
9715   c = DECODE_CHAR (charset, c);
9716   if (c < 0)
9717     error ("Invalid code: %"pI"d", ch);
9718   return make_number (c);
9719 }
9720
9721 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9722        doc: /* Encode the Big5 character CH to BIG5 coding system.
9723 Return the corresponding character code in Big5.  */)
9724   (Lisp_Object ch)
9725 {
9726   Lisp_Object spec, attrs;
9727   struct charset *found;
9728   unsigned encoded;
9729   int chr;
9730
9731   CHECK_CHARACTER (ch);
9732   chr = XFASTINT (ch);
       /* SPEC receives the spec of the coding system named by
          Vbig5_coding_system; its element 0 is the attribute vector.  */
9733   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9734   attrs = AREF (spec, 0);
9735
       /* With an ASCII compatible coding system, an ASCII character is
          returned unchanged.  */
9736   if (ASCII_CHAR_P (chr) && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9737     return ch;
9738
       /* Look CHR up in the coding system's charset list; ENCODED
          receives its code in the charset that was found.  */
9739   found = char_charset (chr, CODING_ATTR_CHARSET_LIST (attrs),
9740                         &encoded);
9741   if (encoded == CHARSET_INVALID_CODE (found))
9742     error ("Can't encode by Big5 encoding: %c", chr);
9743
9744   return make_number (encoded);
9745 }
9746
9747 \f
9748 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9749        Sset_terminal_coding_system_internal, 1, 2, 0,
9750        doc: /* Internal use only.  */)
9751   (Lisp_Object coding_system, Lisp_Object terminal)
9752 {
       /* TERMINAL is resolved before CODING_SYSTEM is checked, so a bad
          TERMINAL is reported first.  */
9753   struct terminal *term = decode_live_terminal (terminal);
9754   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9755   CHECK_SYMBOL (coding_system);
       /* Set up the terminal's coding_system struct from CODING_SYSTEM.
          The field adjustments below are applied on top of whatever this
          call establishes, so they must stay after it.  */
9756   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9757   /* We had better not send unsafe characters to terminal.  */
9758   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9759   /* Character composition should be disabled.  */
9760   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9761   terminal_coding->src_multibyte = 1;
9762   terminal_coding->dst_multibyte = 0;
       /* Record a charset list on TERM: the coding system's own list
          when its CODING_REQUIRE_ENCODING_MASK flag is set, otherwise a
          one-element list holding charset_ascii.  */
9763   tset_charset_list
9764     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9765             ? coding_charset_list (terminal_coding)
9766             : list1 (make_number (charset_ascii))));
9767   return Qnil;
9768 }
9769
9770 DEFUN ("set-safe-terminal-coding-system-internal",
9771        Fset_safe_terminal_coding_system_internal,
9772        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9773        doc: /* Internal use only.  */)
9774   (Lisp_Object coding_system)
9775 {
9776   CHECK_SYMBOL (coding_system);
       /* Set up safe_terminal_coding (an object defined outside this
          function) from CODING_SYSTEM.  The adjustments below are
          applied on top of whatever this call establishes.  */
9777   setup_coding_system (Fcheck_coding_system (coding_system),
9778                        &safe_terminal_coding);
9779   /* Character composition should be disabled.  */
9780   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
       /* Unlike set-terminal-coding-system-internal, this function does
          not touch the `mode' field or any terminal's charset list.  */
9781   safe_terminal_coding.src_multibyte = 1;
9782   safe_terminal_coding.dst_multibyte = 0;
9783   return Qnil;
9784 }
9785
9786 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9787        Sterminal_coding_system, 0, 1, 0,
9788        doc: /* Return coding system specified for terminal output on the given terminal.
9789 TERMINAL may be a terminal object, a frame, or nil for the selected
9790 frame's terminal device.  */)
9791   (Lisp_Object terminal)
9792 {
9793   struct terminal *term = decode_live_terminal (terminal);
9794   Lisp_Object name = CODING_ID_NAME (TERMINAL_TERMINAL_CODING (term)->id);
9795
9796   /* Backward compatibility: a terminal coding system of `undecided'
9797      is reported as nil; any other name is returned as is.  */
9798   return EQ (name, Qundecided) ? Qnil : name;
9799 }
9800
9801 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9802        Sset_keyboard_coding_system_internal, 1, 2, 0,
9803        doc: /* Internal use only.  */)
9804   (Lisp_Object coding_system, Lisp_Object terminal)
9805 {
9806   struct terminal *term = decode_live_terminal (terminal);
9807   struct coding_system *keyboard_coding = TERMINAL_KEYBOARD_CODING (term);
9808   CHECK_SYMBOL (coding_system);
       /* A nil CODING_SYSTEM stands for `no-conversion'; anything else
          is passed through Fcheck_coding_system first.  */
9809   if (! NILP (coding_system))
9810     Fcheck_coding_system (coding_system);
9811   else
9812     coding_system = Qno_conversion;
9813   setup_coding_system (coding_system, keyboard_coding);
9814   /* Character composition should be disabled.  */
9815   keyboard_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9816   return Qnil;
9817 }
9818
9819 DEFUN ("keyboard-coding-system",
9820        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9821        doc: /* Return coding system specified for decoding keyboard input.  */)
9822   (Lisp_Object terminal)
9823 {
       /* Resolve TERMINAL, then report the name registered for the id
          of that terminal's keyboard coding system.  */
9824   struct terminal *term = decode_live_terminal (terminal);
9825   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING (term)->id);
9826 }
9827
9828 \f
9829 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9830        Sfind_operation_coding_system,  1, MANY, 0,
9831        doc: /* Choose a coding system for an operation based on the target name.
9832 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9833 DECODING-SYSTEM is the coding system to use for decoding
9834 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9835 for encoding (in case OPERATION does encoding).
9836
9837 The first argument OPERATION specifies an I/O primitive:
9838   For file I/O, `insert-file-contents' or `write-region'.
9839   For process I/O, `call-process', `call-process-region', or `start-process'.
9840   For network I/O, `open-network-stream'.
9841
9842 The remaining arguments should be the same arguments that were passed
9843 to the primitive.  Depending on which primitive, one of those arguments
9844 is selected as the TARGET.  For example, if OPERATION does file I/O,
9845 whichever argument specifies the file name is TARGET.
9846
9847 TARGET has a meaning which depends on OPERATION:
9848   For file I/O, TARGET is a file name (except for the special case below).
9849   For process I/O, TARGET is a process name.
9850   For network I/O, TARGET is a service name or a port number.
9851
9852 This function looks up what is specified for TARGET in
9853 `file-coding-system-alist', `process-coding-system-alist',
9854 or `network-coding-system-alist' depending on OPERATION.
9855 They may specify a coding system, a cons of coding systems,
9856 or a function symbol to call.
9857 In the last case, we call the function with one argument,
9858 which is a list of all the arguments given to this function.
9859 If the function can't decide a coding system, it can return
9860 `undecided' so that the normal code-detection is performed.
9861
9862 If OPERATION is `insert-file-contents', the argument corresponding to
9863 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9864 file name to look up, and BUFFER is a buffer that contains the file's
9865 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9866 function to call for FILENAME, that function should examine the
9867 contents of BUFFER instead of reading the file.
9868
9869 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9870   (ptrdiff_t nargs, Lisp_Object *args)
9871 {
9872   Lisp_Object operation, target_idx, target, val;
9873   register Lisp_Object chain;
9874
       /* OPERATION and at least one further argument are required, even
          though the DEFUN above declares a minimum of one argument.  */
9875   if (nargs < 2)
9876     error ("Too few arguments");
9877   operation = args[0];
       /* OPERATION must be a symbol whose Qtarget_idx property is a
          non-negative integer.  That integer is the position of TARGET
          among the arguments that follow OPERATION.  */
9878   if (!SYMBOLP (operation)
9879       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9880     error ("Invalid first argument");
9881   if (nargs <= 1 + XFASTINT (target_idx))
9882     error ("Too few arguments for operation `%s'",
9883            SDATA (SYMBOL_NAME (operation)));
9884   target = args[XFASTINT (target_idx) + 1];
       /* TARGET must be a string, except that Qinsert_file_contents also
          accepts a cons (STRING . BUFFER) and Qopen_network_stream also
          accepts an integer or t.  */
9885   if (!(STRINGP (target)
9886         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9887             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9888         || (EQ (operation, Qopen_network_stream)
9889             && (INTEGERP (target) || EQ (target, Qt)))))
9890     error ("Invalid argument %"pI"d of operation `%s'",
9891            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
       /* For the cons form only its car (the file name) is looked up.  */
9892   if (CONSP (target))
9893     target = XCAR (target);
9894
       /* Pick the alist to search: the file alist for
          Qinsert_file_contents and Qwrite_region, the network alist for
          Qopen_network_stream, and the process alist for every other
          operation.  */
9895   chain = ((EQ (operation, Qinsert_file_contents)
9896             || EQ (operation, Qwrite_region))
9897            ? Vfile_coding_system_alist
9898            : (EQ (operation, Qopen_network_stream)
9899               ? Vnetwork_coding_system_alist
9900               : Vprocess_coding_system_alist));
9901   if (NILP (chain))
9902     return Qnil;
9903
       /* The first element whose key matches TARGET decides the result;
          later elements are never consulted.  A string key is matched
          against a string TARGET with fast_string_match, and an integer
          TARGET must be EQ to the key.  A TARGET of t is neither, so it
          matches nothing and the function returns nil.  */
9904   for (; CONSP (chain); chain = XCDR (chain))
9905     {
9906       Lisp_Object elt;
9907
9908       elt = XCAR (chain);
9909       if (CONSP (elt)
9910           && ((STRINGP (target)
9911                && STRINGP (XCAR (elt))
9912                && fast_string_match (XCAR (elt), target) >= 0)
9913               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9914         {
9915           val = XCDR (elt);
9916           /* Here, if VAL is both a valid coding system and a valid
9917              function symbol, we return VAL as a coding system.  */
               /* A cons is returned as is; a coding system symbol CS is
                  returned as (CS . CS); a value that is not a symbol
                  yields nil.  */
9918           if (CONSP (val))
9919             return val;
9920           if (! SYMBOLP (val))
9921             return Qnil;
9922           if (! NILP (Fcoding_system_p (val)))
9923             return Fcons (val, val);
9924           if (! NILP (Ffboundp (val)))
9925             {
9926               /* We use call1 rather than safe_call1
9927                  so as to get bug reports about functions called here
9928                  which don't handle the current interface.  */
9929               val = call1 (val, Flist (nargs, args));
                   /* The function's result is used only when it is a
                      cons or a coding system symbol; anything else
                      falls through to nil below.  */
9930               if (CONSP (val))
9931                 return val;
9932               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9933                 return Fcons (val, val);
9934             }
9935           return Qnil;
9936         }
9937     }
9938   return Qnil;
9939 }
9940
9941 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9942        Sset_coding_system_priority, 0, MANY, 0,
9943        doc: /* Assign higher priority to the coding systems given as arguments.
9944 If multiple coding systems belong to the same category,
9945 all but the first one are ignored.
9946
9947 usage: (set-coding-system-priority &rest coding-systems)  */)
9948   (ptrdiff_t nargs, Lisp_Object *args)
9949 {
9950   ptrdiff_t i, j;
       /* changed[C] becomes true once an argument has claimed category C.  */
9951   bool changed[coding_category_max];
       /* The new priority order; it is built here and copied over
          coding_priorities only after it is complete.  */
9952   enum coding_category priorities[coding_category_max];
9953
9954   memset (changed, 0, sizeof changed);
9955
       /* Pass 1: I walks the arguments, J counts the categories placed
          so far.  Each argument's category is appended to PRIORITIES
          unless an earlier argument already claimed that category.  */
9956   for (i = j = 0; i < nargs; i++)
9957     {
9958       enum coding_category category;
9959       Lisp_Object spec, attrs;
9960
9961       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9962       attrs = AREF (spec, 0);
           /* NOTE(review): CATEGORY indexes CHANGED and coding_categories
              below, so it is assumed to be less than coding_category_max
              -- confirm.  */
9963       category = XINT (CODING_ATTR_CATEGORY (attrs));
9964       if (changed[category])
9965         /* Ignore this coding system because a coding system of the
9966            same category already had a higher priority.  */
9967         continue;
9968       changed[category] = 1;
9969       priorities[j++] = category;
           /* If the category already has a coding system (non-negative
              id) and it is not this argument, set the category's
              coding_system struct up again from the argument.  */
9970       if (coding_categories[category].id >= 0
9971           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9972         setup_coding_system (args[i], &coding_categories[category]);
           /* Set the symbol stored for this category in
              Vcoding_category_table to the argument.  */
9973       Fset (AREF (Vcoding_category_table, category), args[i]);
9974     }
9975
9976   /* Now we have decided top J priorities.  Reflect the order of the
9977      original priorities to the remaining priorities.  */
9978
       /* Pass 2: I now indexes the next free slot of PRIORITIES and J
          scans the old coding_priorities from the start, skipping the
          categories claimed in pass 1.  Running out of old entries
          before PRIORITIES is full aborts.  */
9979   for (i = j, j = 0; i < coding_category_max; i++, j++)
9980     {
9981       while (j < coding_category_max
9982              && changed[coding_priorities[j]])
9983         j++;
9984       if (j == coding_category_max)
9985         emacs_abort ();
9986       priorities[i] = coding_priorities[j];
9987     }
9988
9989   memcpy (coding_priorities, priorities, sizeof priorities);
9990
9991   /* Update `coding-category-list'.  */
       /* The list is consed from the last priority to the first, so it
          ends up in priority order.  */
9992   Vcoding_category_list = Qnil;
9993   for (i = coding_category_max; i-- > 0; )
9994     Vcoding_category_list
9995       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9996                Vcoding_category_list);
9997
9998   return Qnil;
9999 }
10000
10001 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10002        Scoding_system_priority_list, 0, 1, 0,
10003        doc: /* Return a list of coding systems ordered by their priorities.
10004 The list contains a subset of coding systems; i.e. coding systems
10005 assigned to each coding category (see `coding-category-list').
10006
10007 HIGHESTP non-nil means just return the highest priority one.  */)
10008   (Lisp_Object highestp)
10009 {
10010   bool first_only = ! NILP (highestp);
10011   Lisp_Object names = Qnil;
10012   int rank;
10013
        /* Visit the categories in coding_priorities order.  */
10014   for (rank = 0; rank < coding_category_max; rank++)
10015     {
10016       int id = coding_categories[coding_priorities[rank]].id;
            /* Categories whose id is negative are skipped.  */
10017       if (id >= 0)
10018         {
10019           Lisp_Object base = CODING_ATTR_BASE_NAME (CODING_ID_ATTRS (id));
10020
10021           if (first_only)
10022             return base;
10023           names = Fcons (base, names);
10024         }
10025     }
        /* NAMES was built back to front; with HIGHESTP non-nil and no
           usable category this returns the empty list.  */
10026   return Fnreverse (names);
10027 }
10028
10029 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10030
      /* Return a vector of three symbols, interned from the name of BASE
         followed by each entry of `suffixes' in order.  */
10031 static Lisp_Object
10032 make_subsidiaries (Lisp_Object base)
10033 {
10034   Lisp_Object base_name = SYMBOL_NAME (base), vec;
10035   ptrdiff_t prefix_len = SBYTES (base_name);
10036   USE_SAFE_ALLOCA;
        /* Room for the base name, the longest suffix (5 bytes) and the
           terminating NUL.  */
10037   char *name = SAFE_ALLOCA (prefix_len + 6);
10038   int n;
10039
10040   memcpy (name, SDATA (base_name), prefix_len);
10041   vec = make_uninit_vector (3);
10042   for (n = 0; n < 3; n++)
10043     {
            /* Overwrite the previous suffix, NUL included, then intern.  */
10044       strcpy (name + prefix_len, suffixes[n]);
10045       ASET (vec, n, intern (name));
10046     }
10047   SAFE_FREE ();
10048   return vec;
10049 }
10050
10051
10052 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10053        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10054        doc: /* For internal use only.
10055 usage: (define-coding-system-internal ...)  */)
10056   (ptrdiff_t nargs, Lisp_Object *args)
10057 {
10058   Lisp_Object name;
10059   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10060   Lisp_Object attrs;            /* Vector of attributes.  */
10061   Lisp_Object eol_type;
10062   Lisp_Object aliases;
10063   Lisp_Object coding_type, charset_list, safe_charsets;
10064   enum coding_category category;
10065   Lisp_Object tail, val;
10066   int max_charset_id = 0;
10067   int i;
10068
10069   if (nargs < coding_arg_max)
10070     goto short_args;
10071
10072   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10073
10074   name = args[coding_arg_name];
10075   CHECK_SYMBOL (name);
10076   ASET (attrs, coding_attr_base_name, name);
10077
10078   val = args[coding_arg_mnemonic];
10079   if (! STRINGP (val))
10080     CHECK_CHARACTER (val);
10081   ASET (attrs, coding_attr_mnemonic, val);
10082
10083   coding_type = args[coding_arg_coding_type];
10084   CHECK_SYMBOL (coding_type);
10085   ASET (attrs, coding_attr_type, coding_type);
10086
10087   charset_list = args[coding_arg_charset_list];
10088   if (SYMBOLP (charset_list))
10089     {
10090       if (EQ (charset_list, Qiso_2022))
10091         {
10092           if (! EQ (coding_type, Qiso_2022))
10093             error ("Invalid charset-list");
10094           charset_list = Viso_2022_charset_list;
10095         }
10096       else if (EQ (charset_list, Qemacs_mule))
10097         {
10098           if (! EQ (coding_type, Qemacs_mule))
10099             error ("Invalid charset-list");
10100           charset_list = Vemacs_mule_charset_list;
10101         }
10102       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10103         {
10104           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10105             error ("Invalid charset-list");
10106           if (max_charset_id < XFASTINT (XCAR (tail)))
10107             max_charset_id = XFASTINT (XCAR (tail));
10108         }
10109     }
10110   else
10111     {
10112       charset_list = Fcopy_sequence (charset_list);
10113       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10114         {
10115           struct charset *charset;
10116
10117           val = XCAR (tail);
10118           CHECK_CHARSET_GET_CHARSET (val, charset);
10119           if (EQ (coding_type, Qiso_2022)
10120               ? CHARSET_ISO_FINAL (charset) < 0
10121               : EQ (coding_type, Qemacs_mule)
10122               ? CHARSET_EMACS_MULE_ID (charset) < 0
10123               : 0)
10124             error ("Can't handle charset `%s'",
10125                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10126
10127           XSETCAR (tail, make_number (charset->id));
10128           if (max_charset_id < charset->id)
10129             max_charset_id = charset->id;
10130         }
10131     }
10132   ASET (attrs, coding_attr_charset_list, charset_list);
10133
10134   safe_charsets = make_uninit_string (max_charset_id + 1);
10135   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10136   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10137     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10138   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10139
10140   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10141
10142   val = args[coding_arg_decode_translation_table];
10143   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10144     CHECK_SYMBOL (val);
10145   ASET (attrs, coding_attr_decode_tbl, val);
10146
10147   val = args[coding_arg_encode_translation_table];
10148   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10149     CHECK_SYMBOL (val);
10150   ASET (attrs, coding_attr_encode_tbl, val);
10151
10152   val = args[coding_arg_post_read_conversion];
10153   CHECK_SYMBOL (val);
10154   ASET (attrs, coding_attr_post_read, val);
10155
10156   val = args[coding_arg_pre_write_conversion];
10157   CHECK_SYMBOL (val);
10158   ASET (attrs, coding_attr_pre_write, val);
10159
10160   val = args[coding_arg_default_char];
10161   if (NILP (val))
10162     ASET (attrs, coding_attr_default_char, make_number (' '));
10163   else
10164     {
10165       CHECK_CHARACTER (val);
10166       ASET (attrs, coding_attr_default_char, val);
10167     }
10168
10169   val = args[coding_arg_for_unibyte];
10170   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10171
10172   val = args[coding_arg_plist];
10173   CHECK_LIST (val);
10174   ASET (attrs, coding_attr_plist, val);
10175
10176   if (EQ (coding_type, Qcharset))
10177     {
10178       /* Generate a lisp vector of 256 elements.  Each element is nil,
10179          integer, or a list of charset IDs.
10180
10181          If Nth element is nil, the byte code N is invalid in this
10182          coding system.
10183
10184          If Nth element is a number NUM, N is the first byte of a
10185          charset whose ID is NUM.
10186
10187          If Nth element is a list of charset IDs, N is the first byte
10188          of one of them.  The list is sorted by dimensions of the
10189          charsets.  A charset of smaller dimension comes first. */
10190       val = Fmake_vector (make_number (256), Qnil);
10191
10192       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10193         {
10194           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10195           int dim = CHARSET_DIMENSION (charset);
10196           int idx = (dim - 1) * 4;
10197
10198           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10199             ASET (attrs, coding_attr_ascii_compat, Qt);
10200
10201           for (i = charset->code_space[idx];
10202                i <= charset->code_space[idx + 1]; i++)
10203             {
10204               Lisp_Object tmp, tmp2;
10205               int dim2;
10206
10207               tmp = AREF (val, i);
10208               if (NILP (tmp))
10209                 tmp = XCAR (tail);
10210               else if (NUMBERP (tmp))
10211                 {
10212                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10213                   if (dim < dim2)
10214                     tmp = list2 (XCAR (tail), tmp);
10215                   else
10216                     tmp = list2 (tmp, XCAR (tail));
10217                 }
10218               else
10219                 {
10220                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10221                     {
10222                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10223                       if (dim < dim2)
10224                         break;
10225                     }
10226                   if (NILP (tmp2))
10227                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10228                   else
10229                     {
10230                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10231                       XSETCAR (tmp2, XCAR (tail));
10232                     }
10233                 }
10234               ASET (val, i, tmp);
10235             }
10236         }
10237       ASET (attrs, coding_attr_charset_valids, val);
10238       category = coding_category_charset;
10239     }
10240   else if (EQ (coding_type, Qccl))
10241     {
10242       Lisp_Object valids;
10243
10244       if (nargs < coding_arg_ccl_max)
10245         goto short_args;
10246
10247       val = args[coding_arg_ccl_decoder];
10248       CHECK_CCL_PROGRAM (val);
10249       if (VECTORP (val))
10250         val = Fcopy_sequence (val);
10251       ASET (attrs, coding_attr_ccl_decoder, val);
10252
10253       val = args[coding_arg_ccl_encoder];
10254       CHECK_CCL_PROGRAM (val);
10255       if (VECTORP (val))
10256         val = Fcopy_sequence (val);
10257       ASET (attrs, coding_attr_ccl_encoder, val);
10258
10259       val = args[coding_arg_ccl_valids];
10260       valids = Fmake_string (make_number (256), make_number (0), Qnil);
10261       for (tail = val; CONSP (tail); tail = XCDR (tail))
10262         {
10263           int from, to;
10264
10265           val = XCAR (tail);
10266           if (INTEGERP (val))
10267             {
10268               if (! (0 <= XINT (val) && XINT (val) <= 255))
10269                 args_out_of_range_3 (val, make_number (0), make_number (255));
10270               from = to = XINT (val);
10271             }
10272           else
10273             {
10274               CHECK_CONS (val);
10275               CHECK_NATNUM_CAR (val);
10276               CHECK_NUMBER_CDR (val);
10277               if (XINT (XCAR (val)) > 255)
10278                 args_out_of_range_3 (XCAR (val),
10279                                      make_number (0), make_number (255));
10280               from = XINT (XCAR (val));
10281               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10282                 args_out_of_range_3 (XCDR (val),
10283                                      XCAR (val), make_number (255));
10284               to = XINT (XCDR (val));
10285             }
10286           for (i = from; i <= to; i++)
10287             SSET (valids, i, 1);
10288         }
10289       ASET (attrs, coding_attr_ccl_valids, valids);
10290
10291       category = coding_category_ccl;
10292     }
10293   else if (EQ (coding_type, Qutf_16))
10294     {
10295       Lisp_Object bom, endian;
10296
10297       ASET (attrs, coding_attr_ascii_compat, Qnil);
10298
10299       if (nargs < coding_arg_utf16_max)
10300         goto short_args;
10301
10302       bom = args[coding_arg_utf16_bom];
10303       if (! NILP (bom) && ! EQ (bom, Qt))
10304         {
10305           CHECK_CONS (bom);
10306           val = XCAR (bom);
10307           CHECK_CODING_SYSTEM (val);
10308           val = XCDR (bom);
10309           CHECK_CODING_SYSTEM (val);
10310         }
10311       ASET (attrs, coding_attr_utf_bom, bom);
10312
10313       endian = args[coding_arg_utf16_endian];
10314       CHECK_SYMBOL (endian);
10315       if (NILP (endian))
10316         endian = Qbig;
10317       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10318         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10319       ASET (attrs, coding_attr_utf_16_endian, endian);
10320
10321       category = (CONSP (bom)
10322                   ? coding_category_utf_16_auto
10323                   : NILP (bom)
10324                   ? (EQ (endian, Qbig)
10325                      ? coding_category_utf_16_be_nosig
10326                      : coding_category_utf_16_le_nosig)
10327                   : (EQ (endian, Qbig)
10328                      ? coding_category_utf_16_be
10329                      : coding_category_utf_16_le));
10330     }
10331   else if (EQ (coding_type, Qiso_2022))
10332     {
10333       Lisp_Object initial, reg_usage, request, flags;
10334
10335       if (nargs < coding_arg_iso2022_max)
10336         goto short_args;
10337
10338       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10339       CHECK_VECTOR (initial);
10340       for (i = 0; i < 4; i++)
10341         {
10342           val = AREF (initial, i);
10343           if (! NILP (val))
10344             {
10345               struct charset *charset;
10346
10347               CHECK_CHARSET_GET_CHARSET (val, charset);
10348               ASET (initial, i, make_number (CHARSET_ID (charset)));
10349               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10350                 ASET (attrs, coding_attr_ascii_compat, Qt);
10351             }
10352           else
10353             ASET (initial, i, make_number (-1));
10354         }
10355
10356       reg_usage = args[coding_arg_iso2022_reg_usage];
10357       CHECK_CONS (reg_usage);
10358       CHECK_NUMBER_CAR (reg_usage);
10359       CHECK_NUMBER_CDR (reg_usage);
10360
10361       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10362       for (tail = request; CONSP (tail); tail = XCDR (tail))
10363         {
10364           int id;
10365           Lisp_Object tmp1;
10366
10367           val = XCAR (tail);
10368           CHECK_CONS (val);
10369           tmp1 = XCAR (val);
10370           CHECK_CHARSET_GET_ID (tmp1, id);
10371           CHECK_NATNUM_CDR (val);
10372           if (XINT (XCDR (val)) >= 4)
10373             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10374           XSETCAR (val, make_number (id));
10375         }
10376
10377       flags = args[coding_arg_iso2022_flags];
10378       CHECK_NATNUM (flags);
10379       i = XINT (flags) & INT_MAX;
10380       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10381         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10382       flags = make_number (i);
10383
10384       ASET (attrs, coding_attr_iso_initial, initial);
10385       ASET (attrs, coding_attr_iso_usage, reg_usage);
10386       ASET (attrs, coding_attr_iso_request, request);
10387       ASET (attrs, coding_attr_iso_flags, flags);
10388       setup_iso_safe_charsets (attrs);
10389
10390       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10391         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10392                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10393                     ? coding_category_iso_7_else
10394                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10395                     ? coding_category_iso_7
10396                     : coding_category_iso_7_tight);
10397       else
10398         {
10399           int id = XINT (AREF (initial, 1));
10400
10401           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10402                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10403                        || id < 0)
10404                       ? coding_category_iso_8_else
10405                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10406                       ? coding_category_iso_8_1
10407                       : coding_category_iso_8_2);
10408         }
10409       if (category != coding_category_iso_8_1
10410           && category != coding_category_iso_8_2)
10411         ASET (attrs, coding_attr_ascii_compat, Qnil);
10412     }
10413   else if (EQ (coding_type, Qemacs_mule))
10414     {
10415       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10416         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10417       ASET (attrs, coding_attr_ascii_compat, Qt);
10418       category = coding_category_emacs_mule;
10419     }
10420   else if (EQ (coding_type, Qshift_jis))
10421     {
10422
10423       struct charset *charset;
10424
10425       if (XINT (Flength (charset_list)) != 3
10426           && XINT (Flength (charset_list)) != 4)
10427         error ("There should be three or four charsets");
10428
10429       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10430       if (CHARSET_DIMENSION (charset) != 1)
10431         error ("Dimension of charset %s is not one",
10432                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10433       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10434         ASET (attrs, coding_attr_ascii_compat, Qt);
10435
10436       charset_list = XCDR (charset_list);
10437       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10438       if (CHARSET_DIMENSION (charset) != 1)
10439         error ("Dimension of charset %s is not one",
10440                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10441
10442       charset_list = XCDR (charset_list);
10443       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10444       if (CHARSET_DIMENSION (charset) != 2)
10445         error ("Dimension of charset %s is not two",
10446                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10447
10448       charset_list = XCDR (charset_list);
10449       if (! NILP (charset_list))
10450         {
10451           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10452           if (CHARSET_DIMENSION (charset) != 2)
10453             error ("Dimension of charset %s is not two",
10454                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10455         }
10456
10457       category = coding_category_sjis;
10458       Vsjis_coding_system = name;
10459     }
10460   else if (EQ (coding_type, Qbig5))
10461     {
10462       struct charset *charset;
10463
10464       if (XINT (Flength (charset_list)) != 2)
10465         error ("There should be just two charsets");
10466
10467       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10468       if (CHARSET_DIMENSION (charset) != 1)
10469         error ("Dimension of charset %s is not one",
10470                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10471       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10472         ASET (attrs, coding_attr_ascii_compat, Qt);
10473
10474       charset_list = XCDR (charset_list);
10475       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10476       if (CHARSET_DIMENSION (charset) != 2)
10477         error ("Dimension of charset %s is not two",
10478                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10479
10480       category = coding_category_big5;
10481       Vbig5_coding_system = name;
10482     }
10483   else if (EQ (coding_type, Qraw_text))
10484     {
10485       category = coding_category_raw_text;
10486       ASET (attrs, coding_attr_ascii_compat, Qt);
10487     }
10488   else if (EQ (coding_type, Qutf_8))
10489     {
10490       Lisp_Object bom;
10491
10492       if (nargs < coding_arg_utf8_max)
10493         goto short_args;
10494
10495       bom = args[coding_arg_utf8_bom];
10496       if (! NILP (bom) && ! EQ (bom, Qt))
10497         {
10498           CHECK_CONS (bom);
10499           val = XCAR (bom);
10500           CHECK_CODING_SYSTEM (val);
10501           val = XCDR (bom);
10502           CHECK_CODING_SYSTEM (val);
10503         }
10504       ASET (attrs, coding_attr_utf_bom, bom);
10505       if (NILP (bom))
10506         ASET (attrs, coding_attr_ascii_compat, Qt);
10507
10508       category = (CONSP (bom) ? coding_category_utf_8_auto
10509                   : NILP (bom) ? coding_category_utf_8_nosig
10510                   : coding_category_utf_8_sig);
10511     }
10512   else if (EQ (coding_type, Qundecided))
10513     {
10514       if (nargs < coding_arg_undecided_max)
10515         goto short_args;
10516       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10517             args[coding_arg_undecided_inhibit_null_byte_detection]);
10518       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10519             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10520       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10521             args[coding_arg_undecided_prefer_utf_8]);
10522       category = coding_category_undecided;
10523     }
10524   else
10525     error ("Invalid coding system type: %s",
10526            SDATA (SYMBOL_NAME (coding_type)));
10527
10528   ASET (attrs, coding_attr_category, make_number (category));
10529   ASET (attrs, coding_attr_plist,
10530         Fcons (QCcategory,
10531                Fcons (AREF (Vcoding_category_table, category),
10532                       CODING_ATTR_PLIST (attrs))));
10533   ASET (attrs, coding_attr_plist,
10534         Fcons (QCascii_compatible_p,
10535                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10536                       CODING_ATTR_PLIST (attrs))));
10537
10538   eol_type = args[coding_arg_eol_type];
10539   if (! NILP (eol_type)
10540       && ! EQ (eol_type, Qunix)
10541       && ! EQ (eol_type, Qdos)
10542       && ! EQ (eol_type, Qmac))
10543     error ("Invalid eol-type");
10544
10545   aliases = list1 (name);
10546
10547   if (NILP (eol_type))
10548     {
10549       eol_type = make_subsidiaries (name);
10550       for (i = 0; i < 3; i++)
10551         {
10552           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10553
10554           this_name = AREF (eol_type, i);
10555           this_aliases = list1 (this_name);
10556           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10557           this_spec = make_uninit_vector (3);
10558           ASET (this_spec, 0, attrs);
10559           ASET (this_spec, 1, this_aliases);
10560           ASET (this_spec, 2, this_eol_type);
10561           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10562           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10563           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist, Qnil);
10564           if (NILP (val))
10565             Vcoding_system_alist
10566               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10567                        Vcoding_system_alist);
10568         }
10569     }
10570
10571   spec_vec = make_uninit_vector (3);
10572   ASET (spec_vec, 0, attrs);
10573   ASET (spec_vec, 1, aliases);
10574   ASET (spec_vec, 2, eol_type);
10575
10576   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10577   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10578   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist, Qnil);
10579   if (NILP (val))
10580     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10581                                   Vcoding_system_alist);
10582
10583   {
10584     int id = coding_categories[category].id;
10585
10586     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10587       setup_coding_system (name, &coding_categories[category]);
10588   }
10589
10590   return Qnil;
10591
10592  short_args:
10593   Fsignal (Qwrong_number_of_arguments,
10594            Fcons (intern ("define-coding-system-internal"),
10595                   make_number (nargs)));
10596 }
10597
10598
10599 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10600        3, 3, 0,
10601        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10602   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10603 {
10604   Lisp_Object spec, attrs;
10605
10606   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10607   attrs = AREF (spec, 0);
10608   if (EQ (prop, QCmnemonic))
10609     {
10610       if (! STRINGP (val))
10611         CHECK_CHARACTER (val);
10612       ASET (attrs, coding_attr_mnemonic, val);
10613     }
10614   else if (EQ (prop, QCdefault_char))
10615     {
10616       if (NILP (val))
10617         val = make_number (' ');
10618       else
10619         CHECK_CHARACTER (val);
10620       ASET (attrs, coding_attr_default_char, val);
10621     }
10622   else if (EQ (prop, QCdecode_translation_table))
10623     {
10624       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10625         CHECK_SYMBOL (val);
10626       ASET (attrs, coding_attr_decode_tbl, val);
10627     }
10628   else if (EQ (prop, QCencode_translation_table))
10629     {
10630       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10631         CHECK_SYMBOL (val);
10632       ASET (attrs, coding_attr_encode_tbl, val);
10633     }
10634   else if (EQ (prop, QCpost_read_conversion))
10635     {
10636       CHECK_SYMBOL (val);
10637       ASET (attrs, coding_attr_post_read, val);
10638     }
10639   else if (EQ (prop, QCpre_write_conversion))
10640     {
10641       CHECK_SYMBOL (val);
10642       ASET (attrs, coding_attr_pre_write, val);
10643     }
10644   else if (EQ (prop, QCascii_compatible_p))
10645     {
10646       ASET (attrs, coding_attr_ascii_compat, val);
10647     }
10648
10649   ASET (attrs, coding_attr_plist,
10650         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10651   return val;
10652 }
10653
10654
10655 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10656        Sdefine_coding_system_alias, 2, 2, 0,
10657        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10658   (Lisp_Object alias, Lisp_Object coding_system)
10659 {
10660   Lisp_Object spec, aliases, eol_type, val;
10661
10662   CHECK_SYMBOL (alias);
10663   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10664   aliases = AREF (spec, 1);
10665   /* ALIASES should be a list of length more than zero, and the first
10666      element is a base coding system.  Append ALIAS at the tail of the
10667      list.  */
10668   while (!NILP (XCDR (aliases)))
10669     aliases = XCDR (aliases);
10670   XSETCDR (aliases, list1 (alias));
10671
10672   eol_type = AREF (spec, 2);
10673   if (VECTORP (eol_type))
10674     {
10675       Lisp_Object subsidiaries;
10676       int i;
10677
10678       subsidiaries = make_subsidiaries (alias);
10679       for (i = 0; i < 3; i++)
10680         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10681                                      AREF (eol_type, i));
10682     }
10683
10684   Fputhash (alias, spec, Vcoding_system_hash_table);
10685   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10686   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist, Qnil);
10687   if (NILP (val))
10688     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10689                                   Vcoding_system_alist);
10690
10691   return Qnil;
10692 }
10693
10694 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10695        1, 1, 0,
10696        doc: /* Return the base of CODING-SYSTEM.
10697 Any alias or subsidiary coding system is not a base coding system.  */)
10698   (Lisp_Object coding_system)
10699 {
10700   Lisp_Object spec, attrs;
10701
10702   if (NILP (coding_system))
10703     return (Qno_conversion);
10704   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10705   attrs = AREF (spec, 0);
10706   return CODING_ATTR_BASE_NAME (attrs);
10707 }
10708
10709 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10710        1, 1, 0,
10711        doc: /* Return the property list of CODING-SYSTEM.  */)
10712   (Lisp_Object coding_system)
10713 {
10714   Lisp_Object spec, attrs;
10715
10716   if (NILP (coding_system))
10717     coding_system = Qno_conversion;
10718   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10719   attrs = AREF (spec, 0);
10720   return CODING_ATTR_PLIST (attrs);
10721 }
10722
10723
10724 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10725        1, 1, 0,
10726        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10727   (Lisp_Object coding_system)
10728 {
10729   Lisp_Object spec;
10730
10731   if (NILP (coding_system))
10732     coding_system = Qno_conversion;
10733   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10734   return AREF (spec, 1);
10735 }
10736
10737 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10738        Scoding_system_eol_type, 1, 1, 0,
10739        doc: /* Return eol-type of CODING-SYSTEM.
10740 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10741
10742 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10743 and CR respectively.
10744
10745 A vector value indicates that a format of end-of-line should be
10746 detected automatically.  Nth element of the vector is the subsidiary
10747 coding system whose eol-type is N.  */)
10748   (Lisp_Object coding_system)
10749 {
10750   Lisp_Object spec, eol_type;
10751   int n;
10752
10753   if (NILP (coding_system))
10754     coding_system = Qno_conversion;
10755   if (! CODING_SYSTEM_P (coding_system))
10756     return Qnil;
10757   spec = CODING_SYSTEM_SPEC (coding_system);
10758   eol_type = AREF (spec, 2);
10759   if (VECTORP (eol_type))
10760     return Fcopy_sequence (eol_type);
10761   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10762   return make_number (n);
10763 }
10764
10765 #endif /* emacs */
10766
10767 \f
/*** 12. Postamble ***/
10769
10770 void
10771 init_coding_once (void)
10772 {
10773   int i;
10774
10775   for (i = 0; i < coding_category_max; i++)
10776     {
10777       coding_categories[i].id = -1;
10778       coding_priorities[i] = i;
10779     }
10780
10781   /* ISO2022 specific initialize routine.  */
10782   for (i = 0; i < 0x20; i++)
10783     iso_code_class[i] = ISO_control_0;
10784   for (i = 0x21; i < 0x7F; i++)
10785     iso_code_class[i] = ISO_graphic_plane_0;
10786   for (i = 0x80; i < 0xA0; i++)
10787     iso_code_class[i] = ISO_control_1;
10788   for (i = 0xA1; i < 0xFF; i++)
10789     iso_code_class[i] = ISO_graphic_plane_1;
10790   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10791   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10792   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10793   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10794   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10795   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10796   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10797   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10798   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10799
10800   for (i = 0; i < 256; i++)
10801     {
10802       emacs_mule_bytes[i] = 1;
10803     }
10804   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10805   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10806   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10807   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10808 }
10809
10810 #ifdef emacs
10811
10812 void
10813 syms_of_coding (void)
10814 {
10815   staticpro (&Vcoding_system_hash_table);
10816   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10817
10818   staticpro (&Vsjis_coding_system);
10819   Vsjis_coding_system = Qnil;
10820
10821   staticpro (&Vbig5_coding_system);
10822   Vbig5_coding_system = Qnil;
10823
10824   staticpro (&Vcode_conversion_reused_workbuf);
10825   Vcode_conversion_reused_workbuf = Qnil;
10826
10827   staticpro (&Vcode_conversion_workbuf_name);
10828   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10829
10830   reused_workbuf_in_use = 0;
10831
10832   DEFSYM (Qcharset, "charset");
10833   DEFSYM (Qtarget_idx, "target-idx");
10834   DEFSYM (Qcoding_system_history, "coding-system-history");
10835   Fset (Qcoding_system_history, Qnil);
10836
10837   /* Target FILENAME is the first argument.  */
10838   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10839   /* Target FILENAME is the third argument.  */
10840   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10841
10842   DEFSYM (Qcall_process, "call-process");
10843   /* Target PROGRAM is the first argument.  */
10844   Fput (Qcall_process, Qtarget_idx, make_number (0));
10845
10846   DEFSYM (Qcall_process_region, "call-process-region");
10847   /* Target PROGRAM is the third argument.  */
10848   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10849
10850   DEFSYM (Qstart_process, "start-process");
10851   /* Target PROGRAM is the third argument.  */
10852   Fput (Qstart_process, Qtarget_idx, make_number (2));
10853
10854   DEFSYM (Qopen_network_stream, "open-network-stream");
10855   /* Target SERVICE is the fourth argument.  */
10856   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10857
10858   DEFSYM (Qunix, "unix");
10859   DEFSYM (Qdos, "dos");
10860   DEFSYM (Qmac, "mac");
10861
10862   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10863   DEFSYM (Qundecided, "undecided");
10864   DEFSYM (Qno_conversion, "no-conversion");
10865   DEFSYM (Qraw_text, "raw-text");
10866
10867   DEFSYM (Qiso_2022, "iso-2022");
10868
10869   DEFSYM (Qutf_8, "utf-8");
10870   DEFSYM (Qutf_8_unix, "utf-8-unix");
10871   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10872
10873 #if defined (WINDOWSNT) || defined (CYGWIN)
10874   /* No, not utf-16-le: that one has a BOM.  */
10875   DEFSYM (Qutf_16le, "utf-16le");
10876 #endif
10877
10878   DEFSYM (Qutf_16, "utf-16");
10879   DEFSYM (Qbig, "big");
10880   DEFSYM (Qlittle, "little");
10881
10882   DEFSYM (Qshift_jis, "shift-jis");
10883   DEFSYM (Qbig5, "big5");
10884
10885   DEFSYM (Qcoding_system_p, "coding-system-p");
10886
10887   /* Error signaled when there's a problem with detecting a coding system.  */
10888   DEFSYM (Qcoding_system_error, "coding-system-error");
10889   Fput (Qcoding_system_error, Qerror_conditions,
10890         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10891   Fput (Qcoding_system_error, Qerror_message,
10892         build_pure_c_string ("Invalid coding system"));
10893
10894   DEFSYM (Qtranslation_table, "translation-table");
10895   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10896   DEFSYM (Qtranslation_table_id, "translation-table-id");
10897
10898   /* Coding system emacs-mule and raw-text are for converting only
10899      end-of-line format.  */
10900   DEFSYM (Qemacs_mule, "emacs-mule");
10901
10902   DEFSYM (QCcategory, ":category");
10903   DEFSYM (QCmnemonic, ":mnemonic");
10904   DEFSYM (QCdefault_char, ":default-char");
10905   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10906   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10907   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10908   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10909   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10910
10911   Vcoding_category_table
10912     = Fmake_vector (make_number (coding_category_max), Qnil);
10913   staticpro (&Vcoding_category_table);
10914   /* Followings are target of code detection.  */
10915   ASET (Vcoding_category_table, coding_category_iso_7,
10916         intern_c_string ("coding-category-iso-7"));
10917   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10918         intern_c_string ("coding-category-iso-7-tight"));
10919   ASET (Vcoding_category_table, coding_category_iso_8_1,
10920         intern_c_string ("coding-category-iso-8-1"));
10921   ASET (Vcoding_category_table, coding_category_iso_8_2,
10922         intern_c_string ("coding-category-iso-8-2"));
10923   ASET (Vcoding_category_table, coding_category_iso_7_else,
10924         intern_c_string ("coding-category-iso-7-else"));
10925   ASET (Vcoding_category_table, coding_category_iso_8_else,
10926         intern_c_string ("coding-category-iso-8-else"));
10927   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10928         intern_c_string ("coding-category-utf-8-auto"));
10929   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10930         intern_c_string ("coding-category-utf-8"));
10931   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10932         intern_c_string ("coding-category-utf-8-sig"));
10933   ASET (Vcoding_category_table, coding_category_utf_16_be,
10934         intern_c_string ("coding-category-utf-16-be"));
10935   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10936         intern_c_string ("coding-category-utf-16-auto"));
10937   ASET (Vcoding_category_table, coding_category_utf_16_le,
10938         intern_c_string ("coding-category-utf-16-le"));
10939   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10940         intern_c_string ("coding-category-utf-16-be-nosig"));
10941   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10942         intern_c_string ("coding-category-utf-16-le-nosig"));
10943   ASET (Vcoding_category_table, coding_category_charset,
10944         intern_c_string ("coding-category-charset"));
10945   ASET (Vcoding_category_table, coding_category_sjis,
10946         intern_c_string ("coding-category-sjis"));
10947   ASET (Vcoding_category_table, coding_category_big5,
10948         intern_c_string ("coding-category-big5"));
10949   ASET (Vcoding_category_table, coding_category_ccl,
10950         intern_c_string ("coding-category-ccl"));
10951   ASET (Vcoding_category_table, coding_category_emacs_mule,
10952         intern_c_string ("coding-category-emacs-mule"));
10953   /* Followings are NOT target of code detection.  */
10954   ASET (Vcoding_category_table, coding_category_raw_text,
10955         intern_c_string ("coding-category-raw-text"));
10956   ASET (Vcoding_category_table, coding_category_undecided,
10957         intern_c_string ("coding-category-undecided"));
10958
10959   DEFSYM (Qinsufficient_source, "insufficient-source");
10960   DEFSYM (Qinvalid_source, "invalid-source");
10961   DEFSYM (Qinterrupted, "interrupted");
10962
10963   /* If a symbol has this property, evaluate the value to define the
10964      symbol as a coding system.  */
10965   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10966
10967   defsubr (&Scoding_system_p);
10968   defsubr (&Sread_coding_system);
10969   defsubr (&Sread_non_nil_coding_system);
10970   defsubr (&Scheck_coding_system);
10971   defsubr (&Sdetect_coding_region);
10972   defsubr (&Sdetect_coding_string);
10973   defsubr (&Sfind_coding_systems_region_internal);
10974   defsubr (&Sunencodable_char_position);
10975   defsubr (&Scheck_coding_systems_region);
10976   defsubr (&Sdecode_coding_region);
10977   defsubr (&Sencode_coding_region);
10978   defsubr (&Sdecode_coding_string);
10979   defsubr (&Sencode_coding_string);
10980   defsubr (&Sdecode_sjis_char);
10981   defsubr (&Sencode_sjis_char);
10982   defsubr (&Sdecode_big5_char);
10983   defsubr (&Sencode_big5_char);
10984   defsubr (&Sset_terminal_coding_system_internal);
10985   defsubr (&Sset_safe_terminal_coding_system_internal);
10986   defsubr (&Sterminal_coding_system);
10987   defsubr (&Sset_keyboard_coding_system_internal);
10988   defsubr (&Skeyboard_coding_system);
10989   defsubr (&Sfind_operation_coding_system);
10990   defsubr (&Sset_coding_system_priority);
10991   defsubr (&Sdefine_coding_system_internal);
10992   defsubr (&Sdefine_coding_system_alias);
10993   defsubr (&Scoding_system_put);
10994   defsubr (&Scoding_system_base);
10995   defsubr (&Scoding_system_plist);
10996   defsubr (&Scoding_system_aliases);
10997   defsubr (&Scoding_system_eol_type);
10998   defsubr (&Scoding_system_priority_list);
10999
11000   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11001                doc: /* List of coding systems.
11002
11003 Do not alter the value of this variable manually.  This variable should be
11004 updated by the functions `define-coding-system' and
11005 `define-coding-system-alias'.  */);
11006   Vcoding_system_list = Qnil;
11007
11008   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11009                doc: /* Alist of coding system names.
11010 Each element is one element list of coding system name.
11011 This variable is given to `completing-read' as COLLECTION argument.
11012
11013 Do not alter the value of this variable manually.  This variable should be
11014 updated by the functions `make-coding-system' and
11015 `define-coding-system-alias'.  */);
11016   Vcoding_system_alist = Qnil;
11017
11018   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11019                doc: /* List of coding-categories (symbols) ordered by priority.
11020
11021 On detecting a coding system, Emacs tries code detection algorithms
11022 associated with each coding-category one by one in this order.  When
11023 one algorithm agrees with a byte sequence of source text, the coding
11024 system bound to the corresponding coding-category is selected.
11025
11026 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11027   {
11028     int i;
11029
11030     Vcoding_category_list = Qnil;
11031     for (i = coding_category_max - 1; i >= 0; i--)
11032       Vcoding_category_list
11033         = Fcons (AREF (Vcoding_category_table, i),
11034                  Vcoding_category_list);
11035   }
11036
11037   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11038                doc: /* Specify the coding system for read operations.
11039 It is useful to bind this variable with `let', but do not set it globally.
11040 If the value is a coding system, it is used for decoding on read operation.
11041 If not, an appropriate element is used from one of the coding system alists.
11042 There are three such tables: `file-coding-system-alist',
11043 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11044   Vcoding_system_for_read = Qnil;
11045
11046   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11047                doc: /* Specify the coding system for write operations.
11048 Programs bind this variable with `let', but you should not set it globally.
11049 If the value is a coding system, it is used for encoding of output,
11050 when writing it to a file and when sending it to a file or subprocess.
11051
11052 If this does not specify a coding system, an appropriate element
11053 is used from one of the coding system alists.
11054 There are three such tables: `file-coding-system-alist',
11055 `process-coding-system-alist', and `network-coding-system-alist'.
11056 For output to files, if the above procedure does not specify a coding system,
11057 the value of `buffer-file-coding-system' is used.  */);
11058   Vcoding_system_for_write = Qnil;
11059
11060   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11061                doc: /*
11062 Coding system used in the latest file or process I/O.  */);
11063   Vlast_coding_system_used = Qnil;
11064
11065   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11066                doc: /*
11067 Error status of the last code conversion.
11068
11069 When an error was detected in the last code conversion, this variable
11070 is set to one of the following symbols.
11071   `insufficient-source'
11072   `inconsistent-eol'
11073   `invalid-source'
11074   `interrupted'
11075   `insufficient-memory'
11076 When no error was detected, the value doesn't change.  So, to check
11077 the error status of a code conversion by this variable, you must
11078 explicitly set this variable to nil before performing code
11079 conversion.  */);
11080   Vlast_code_conversion_error = Qnil;
11081
11082   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11083                doc: /*
11084 Non-nil means always inhibit code conversion of end-of-line format.
11085 See info node `Coding Systems' and info node `Text and Binary' concerning
11086 such conversion.  */);
11087   inhibit_eol_conversion = 0;
11088
11089   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11090                doc: /*
11091 Non-nil means process buffer inherits coding system of process output.
11092 Bind it to t if the process output is to be treated as if it were a file
11093 read from some filesystem.  */);
11094   inherit_process_coding_system = 0;
11095
11096   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11097                doc: /*
11098 Alist to decide a coding system to use for a file I/O operation.
11099 The format is ((PATTERN . VAL) ...),
11100 where PATTERN is a regular expression matching a file name,
11101 VAL is a coding system, a cons of coding systems, or a function symbol.
11102 If VAL is a coding system, it is used for both decoding and encoding
11103 the file contents.
11104 If VAL is a cons of coding systems, the car part is used for decoding,
11105 and the cdr part is used for encoding.
11106 If VAL is a function symbol, the function must return a coding system
11107 or a cons of coding systems which are used as above.  The function is
11108 called with an argument that is a list of the arguments with which
11109 `find-operation-coding-system' was called.  If the function can't decide
11110 a coding system, it can return `undecided' so that the normal
11111 code-detection is performed.
11112
11113 See also the function `find-operation-coding-system'
11114 and the variable `auto-coding-alist'.  */);
11115   Vfile_coding_system_alist = Qnil;
11116
11117   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11118                doc: /*
11119 Alist to decide a coding system to use for a process I/O operation.
11120 The format is ((PATTERN . VAL) ...),
11121 where PATTERN is a regular expression matching a program name,
11122 VAL is a coding system, a cons of coding systems, or a function symbol.
11123 If VAL is a coding system, it is used for both decoding what received
11124 from the program and encoding what sent to the program.
11125 If VAL is a cons of coding systems, the car part is used for decoding,
11126 and the cdr part is used for encoding.
11127 If VAL is a function symbol, the function must return a coding system
11128 or a cons of coding systems which are used as above.
11129
11130 See also the function `find-operation-coding-system'.  */);
11131   Vprocess_coding_system_alist = Qnil;
11132
11133   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11134                doc: /*
11135 Alist to decide a coding system to use for a network I/O operation.
11136 The format is ((PATTERN . VAL) ...),
11137 where PATTERN is a regular expression matching a network service name
11138 or is a port number to connect to,
11139 VAL is a coding system, a cons of coding systems, or a function symbol.
11140 If VAL is a coding system, it is used for both decoding what received
11141 from the network stream and encoding what sent to the network stream.
11142 If VAL is a cons of coding systems, the car part is used for decoding,
11143 and the cdr part is used for encoding.
11144 If VAL is a function symbol, the function must return a coding system
11145 or a cons of coding systems which are used as above.
11146
11147 See also the function `find-operation-coding-system'.  */);
11148   Vnetwork_coding_system_alist = Qnil;
11149
11150   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11151                doc: /* Coding system to use with system messages.
11152 Also used for decoding keyboard input on X Window system, and for
11153 encoding standard output and error streams.  */);
11154   Vlocale_coding_system = Qnil;
11155
11156   /* The eol mnemonics are reset in startup.el system-dependently.  */
11157   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11158                doc: /*
11159 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11160   eol_mnemonic_unix = build_pure_c_string (":");
11161
11162   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11163                doc: /*
11164 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11165   eol_mnemonic_dos = build_pure_c_string ("\\");
11166
11167   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11168                doc: /*
11169 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11170   eol_mnemonic_mac = build_pure_c_string ("/");
11171
11172   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11173                doc: /*
11174 String displayed in mode line when end-of-line format is not yet determined.  */);
11175   eol_mnemonic_undecided = build_pure_c_string (":");
11176
11177   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11178                doc: /*
11179 Non-nil enables character translation while encoding and decoding.  */);
11180   Venable_character_translation = Qt;
11181
11182   DEFVAR_LISP ("standard-translation-table-for-decode",
11183                Vstandard_translation_table_for_decode,
11184                doc: /* Table for translating characters while decoding.  */);
11185   Vstandard_translation_table_for_decode = Qnil;
11186
11187   DEFVAR_LISP ("standard-translation-table-for-encode",
11188                Vstandard_translation_table_for_encode,
11189                doc: /* Table for translating characters while encoding.  */);
11190   Vstandard_translation_table_for_encode = Qnil;
11191
11192   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11193                doc: /* Alist of charsets vs revision numbers.
11194 While encoding, if a charset (car part of an element) is found,
11195 designate it with the escape sequence identifying revision (cdr part
11196 of the element).  */);
11197   Vcharset_revision_table = Qnil;
11198
11199   DEFVAR_LISP ("default-process-coding-system",
11200                Vdefault_process_coding_system,
11201                doc: /* Cons of coding systems used for process I/O by default.
11202 The car part is used for decoding a process output,
11203 the cdr part is used for encoding a text to be sent to a process.  */);
11204   Vdefault_process_coding_system = Qnil;
11205
11206   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11207                doc: /*
11208 Table of extra Latin codes in the range 128..159 (inclusive).
11209 This is a vector of length 256.
11210 If Nth element is non-nil, the existence of code N in a file
11211 \(or output of subprocess) doesn't prevent it to be detected as
11212 a coding system of ISO 2022 variant which has a flag
11213 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11214 or reading output of a subprocess.
11215 Only 128th through 159th elements have a meaning.  */);
11216   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11217
11218   DEFVAR_LISP ("select-safe-coding-system-function",
11219                Vselect_safe_coding_system_function,
11220                doc: /*
11221 Function to call to select safe coding system for encoding a text.
11222
11223 If set, this function is called to force a user to select a proper
11224 coding system which can encode the text in the case that a default
11225 coding system used in each operation can't encode the text.  The
11226 function should take care that the buffer is not modified while
11227 the coding system is being selected.
11228
11229 The default value is `select-safe-coding-system' (which see).  */);
11230   Vselect_safe_coding_system_function = Qnil;
11231
11232   DEFVAR_BOOL ("coding-system-require-warning",
11233                coding_system_require_warning,
11234                doc: /* Internal use only.
11235 If non-nil, on writing a file, `select-safe-coding-system-function' is
11236 called even if `coding-system-for-write' is non-nil.  The command
11237 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11238   coding_system_require_warning = 0;
11239
11240
11241   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11242                inhibit_iso_escape_detection,
11243                doc: /*
11244 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11245
11246 When Emacs reads text, it tries to detect how the text is encoded.
11247 This code detection is sensitive to escape sequences.  If Emacs sees
11248 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11249 of the ISO2022 encodings, and decodes text by the corresponding coding
11250 system (e.g. `iso-2022-7bit').
11251
11252 However, there may be cases where you want to read escape sequences in
11253 a file as is.  In such a case, you can set this variable to non-nil.
11254 Then the code detection will ignore any escape sequences, and no text is
11255 detected as encoded in some ISO-2022 encoding.  The result is that all
11256 escape sequences become visible in a buffer.
11257
11258 The default value is nil, and it is strongly recommended not to change
11259 it.  That is because many Emacs Lisp source files that contain
11260 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11261 in Emacs's distribution, and they won't be decoded correctly on
11262 reading if you suppress escape sequence detection.
11263
11264 The other way to read escape sequences in a file without decoding is
11265 to explicitly specify some coding system that doesn't use ISO-2022
11266 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11267   inhibit_iso_escape_detection = 0;
11268
11269   DEFVAR_BOOL ("inhibit-null-byte-detection",
11270                inhibit_null_byte_detection,
11271                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11272 By default, Emacs treats text that contains null bytes as binary data,
11273 and does not attempt to decode it.  The effect is as if you specified
11274 `no-conversion' for reading that text.
11275
11276 Set this to non-nil when a regular text happens to include null bytes.
11277 Examples are Index nodes of Info files and null-byte delimited output
11278 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11279 decode text as usual.  */);
11280   inhibit_null_byte_detection = 0;
11281
11282   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11283                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11284 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11285   disable_ascii_optimization = 0;
11286
11287   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11288                doc: /* Char table for translating self-inserting characters.
11289 This is applied to the result of input methods, not their input.
11290 See also `keyboard-translate-table'.
11291
11292 Use of this variable for character code unification was rendered
11293 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11294 internal character representation.  */);
11295   Vtranslation_table_for_input = Qnil;
11296
11297   Lisp_Object args[coding_arg_undecided_max];
11298   memclear (args, sizeof args);
11299
11300   Lisp_Object plist[] =
11301     {
11302       QCname,
11303       args[coding_arg_name] = Qno_conversion,
11304       QCmnemonic,
11305       args[coding_arg_mnemonic] = make_number ('='),
11306       intern_c_string (":coding-type"),
11307       args[coding_arg_coding_type] = Qraw_text,
11308       QCascii_compatible_p,
11309       args[coding_arg_ascii_compatible_p] = Qt,
11310       QCdefault_char,
11311       args[coding_arg_default_char] = make_number (0),
11312       intern_c_string (":for-unibyte"),
11313       args[coding_arg_for_unibyte] = Qt,
11314       intern_c_string (":docstring"),
11315       (build_pure_c_string
11316        ("Do no conversion.\n"
11317         "\n"
11318         "When you visit a file with this coding, the file is read into a\n"
11319         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11320         "character.")),
11321       intern_c_string (":eol-type"),
11322       args[coding_arg_eol_type] = Qunix,
11323     };
11324   args[coding_arg_plist] = CALLMANY (Flist, plist);
11325   Fdefine_coding_system_internal (coding_arg_max, args);
11326
11327   plist[1] = args[coding_arg_name] = Qundecided;
11328   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11329   plist[5] = args[coding_arg_coding_type] = Qundecided;
11330   /* This is already set.
11331      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11332   plist[8] = intern_c_string (":charset-list");
11333   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11334   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11335   plist[13] = build_pure_c_string ("No conversion on encoding, "
11336                                    "automatic conversion on decoding.");
11337   plist[15] = args[coding_arg_eol_type] = Qnil;
11338   args[coding_arg_plist] = CALLMANY (Flist, plist);
11339   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11340   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11341   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11342
11343   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11344
11345   for (int i = 0; i < coding_category_max; i++)
11346     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11347
11348 #if defined (DOS_NT)
11349   system_eol_type = Qdos;
11350 #else
11351   system_eol_type = Qunix;
11352 #endif
11353   staticpro (&system_eol_type);
11354 }
11355 #endif /* emacs */