src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2018 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
  105   not listed above, they can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
  135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 /* Format of end-of-line decided by system.  This is Qunix on
 305    Unix and Mac, Qdos on DOS/Windows.
 306    This has an effect only for external encoding (i.e. for output to
 307    file and process), not for in-buffer or Lisp string encoding.  */
 308 static Lisp_Object system_eol_type;
 309
 310 #ifdef emacs
 311
 312 /* Coding-systems are handed between Emacs Lisp programs and C internal
 313    routines by the following three variables.  */
 314 /* Coding system to be used to encode text for terminal display when
 315    terminal coding system is nil.  */
 316 struct coding_system safe_terminal_coding;
 317
 318 #endif /* emacs */
 319
 320 /* Two special coding systems.  */
 321 static Lisp_Object Vsjis_coding_system;
 322 static Lisp_Object Vbig5_coding_system;
 323
  324 /* ISO2022 section */
  325
      /* Element REG of the array held in the coding_attr_iso_initial slot
         of the attributes of CODING (found via CODING->id), extracted
         with XINT.  */
  326 #define CODING_ISO_INITIAL(coding, reg)                 \
  327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  328                      coding_attr_iso_initial),          \
  329                reg)))
 330
  331
      /* The value of CODING->safe_charsets[CHARSET_ID], or -1 when
         CHARSET_ID is greater than CODING->max_charset_id or when the
         stored value is 255.  CHARSET_ID is evaluated more than once.
         NOTE(review): 255 looks like the "no entry" marker of
         safe_charsets -- confirm where that table is filled.  */
  332 #define CODING_ISO_REQUEST(coding, charset_id)          \
  333   (((charset_id) <= (coding)->max_charset_id            \
  334     ? ((coding)->safe_charsets[charset_id] != 255       \
  335        ? (coding)->safe_charsets[charset_id]            \
  336        : -1)                                            \
  337     : -1))
 338
  339
      /* Accessors for the ISO2022-specific members of CODING, all of which
         live in CODING->spec.iso_2022.  Each expands to an lvalue, except
         CODING_ISO_INVOKED_CHARSET (a value) and CODING_ISO_CMP_STATUS
         (the address of the cmp_status member).  */
  340 #define CODING_ISO_FLAGS(coding)        \
  341   ((coding)->spec.iso_2022.flags)
  342 #define CODING_ISO_DESIGNATION(coding, reg)     \
  343   ((coding)->spec.iso_2022.current_designation[reg])
  344 #define CODING_ISO_INVOCATION(coding, plane)    \
  345   ((coding)->spec.iso_2022.current_invocation[plane])
  346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  347   ((coding)->spec.iso_2022.single_shifting)
  348 #define CODING_ISO_BOL(coding)  \
  349   ((coding)->spec.iso_2022.bol)
      /* -1 if the invocation recorded for PLANE is negative; otherwise the
         current designation of the register invoked to PLANE.  */
  350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
  352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
  353 #define CODING_ISO_CMP_STATUS(coding)   \
  354   (&(coding)->spec.iso_2022.cmp_status)
  355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
 360 /* Control characters of ISO2022.  */
 361                         /* code */      /* function */
 362 #define ISO_CODE_SO     0x0E            /* shift-out */
 363 #define ISO_CODE_SI     0x0F            /* shift-in */
 364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 365 #define ISO_CODE_ESC    0x1B            /* escape */
 366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 369
  370 /* Every 1-byte code of ISO2022 is classified into one of the
  371    following classes.  */
  372 enum iso_code_class_type
  373   {
  374     ISO_control_0,              /* Control codes in the range
  375                                    0x00..0x1F and 0x7F, except for the
  376                                    following 4 codes.  */
  377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  381     ISO_control_1,              /* Control codes in the range
  382                                    0x80..0x9F, except for the
  383                                    following 3 codes.  */
  384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  391   };
 392
  393 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
  394     `iso-flags' attribute of an iso2022 coding system.  */
 395
 396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 397    instead of the correct short-form sequence (e.g. ESC $ A).  */
 398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 399
 400 /* If set, reset graphic planes and registers at end-of-line to the
 401    initial state.  */
 402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 403
 404 /* If set, reset graphic planes and registers before any control
 405    characters to the initial state.  */
 406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 407
 408 /* If set, encode by 7-bit environment.  */
 409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 410
 411 /* If set, use locking-shift function.  */
 412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 413
 414 /* If set, use single-shift function.  Overwrite
 415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 417
 418 /* If set, use designation escape sequence.  */
 419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 420
 421 /* If set, produce revision number sequence.  */
 422 #define CODING_ISO_FLAG_REVISION        0x0080
 423
 424 /* If set, produce ISO6429's direction specifying sequence.  */
 425 #define CODING_ISO_FLAG_DIRECTION       0x0100
 426
 427 /* If set, assume designation states are reset at beginning of line on
 428    output.  */
 429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 430
 431 /* If set, designation sequence should be placed at beginning of line
 432    on output.  */
 433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 434
 435 /* If set, do not encode unsafe characters on output.  */
 436 #define CODING_ISO_FLAG_SAFE            0x0800
 437
 438 /* If set, extra latin codes (128..159) are accepted as a valid code
 439    on input.  */
 440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 441
 442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 443
 444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 445
 446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 447
 448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 449
 450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 451
 452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 453
 454 /* A character to be produced on output if encoding of the original
 455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
 458 /* UTF-8 section */
 459 #define CODING_UTF_8_BOM(coding)        \
 460   ((coding)->spec.utf_8_bom)
 461
 462 /* UTF-16 section */
 463 #define CODING_UTF_16_BOM(coding)       \
 464   ((coding)->spec.utf_16.bom)
 465
 466 #define CODING_UTF_16_ENDIAN(coding)    \
 467   ((coding)->spec.utf_16.endian)
 468
 469 #define CODING_UTF_16_SURROGATE(coding) \
 470   ((coding)->spec.utf_16.surrogate)
 471
 472
 473 /* CCL section */
 474 #define CODING_CCL_DECODER(coding)      \
 475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 476 #define CODING_CCL_ENCODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 478 #define CODING_CCL_VALIDS(coding)                                          \
 479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
  481 /* Index for each coding category in `coding_categories'.
         Each enumerator is also the bit position of the corresponding
         CATEGORY_MASK_XXX flag (1 << coding_category_xxx), so the order
         of the enumerators fixes the layout of those masks.
         coding_category_max is the number of categories and sizes the
         `coding_priorities' and `coding_categories' arrays.  */
  482
  483 enum coding_category
  484   {
  485     coding_category_iso_7,
  486     coding_category_iso_7_tight,
  487     coding_category_iso_8_1,
  488     coding_category_iso_8_2,
  489     coding_category_iso_7_else,
  490     coding_category_iso_8_else,
  491     coding_category_utf_8_auto,
  492     coding_category_utf_8_nosig,
  493     coding_category_utf_8_sig,
  494     coding_category_utf_16_auto,
  495     coding_category_utf_16_be,
  496     coding_category_utf_16_le,
  497     coding_category_utf_16_be_nosig,
  498     coding_category_utf_16_le_nosig,
  499     coding_category_charset,
  500     coding_category_sjis,
  501     coding_category_big5,
  502     coding_category_ccl,
  503     coding_category_emacs_mule,
  504     /* All above are targets of code detection.  */
  505     coding_category_raw_text,
  506     coding_category_undecided,
  507     coding_category_max
  508   };
 509
 510 /* Definitions of flag bits used in detect_coding_XXXX.  */
 511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 531
  532 /* This value is returned if detect_coding_mask () finds nothing other
  533    than ASCII characters.  It is the union of the masks of every
         category from coding_category_iso_7 through
         coding_category_emacs_mule; CATEGORY_MASK_RAW_TEXT is not
         included.  */
  534 #define CATEGORY_MASK_ANY               \
  535   (CATEGORY_MASK_ISO_7                  \
  536    | CATEGORY_MASK_ISO_7_TIGHT          \
  537    | CATEGORY_MASK_ISO_8_1              \
  538    | CATEGORY_MASK_ISO_8_2              \
  539    | CATEGORY_MASK_ISO_7_ELSE           \
  540    | CATEGORY_MASK_ISO_8_ELSE           \
  541    | CATEGORY_MASK_UTF_8_AUTO           \
  542    | CATEGORY_MASK_UTF_8_NOSIG          \
  543    | CATEGORY_MASK_UTF_8_SIG            \
  544    | CATEGORY_MASK_UTF_16_AUTO          \
  545    | CATEGORY_MASK_UTF_16_BE            \
  546    | CATEGORY_MASK_UTF_16_LE            \
  547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  549    | CATEGORY_MASK_CHARSET              \
  550    | CATEGORY_MASK_SJIS                 \
  551    | CATEGORY_MASK_BIG5                 \
  552    | CATEGORY_MASK_CCL                  \
  553    | CATEGORY_MASK_EMACS_MULE)
 554
 555
 556 #define CATEGORY_MASK_ISO_7BIT \
 557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 558
 559 #define CATEGORY_MASK_ISO_8BIT \
 560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 561
 562 #define CATEGORY_MASK_ISO_ELSE \
 563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 564
 565 #define CATEGORY_MASK_ISO_ESCAPE        \
 566   (CATEGORY_MASK_ISO_7                  \
 567    | CATEGORY_MASK_ISO_7_TIGHT          \
 568    | CATEGORY_MASK_ISO_7_ELSE           \
 569    | CATEGORY_MASK_ISO_8_ELSE)
 570
 571 #define CATEGORY_MASK_ISO       \
 572   (  CATEGORY_MASK_ISO_7BIT     \
 573      | CATEGORY_MASK_ISO_8BIT   \
 574      | CATEGORY_MASK_ISO_ELSE)
 575
 576 #define CATEGORY_MASK_UTF_16            \
 577   (CATEGORY_MASK_UTF_16_AUTO            \
 578    | CATEGORY_MASK_UTF_16_BE            \
 579    | CATEGORY_MASK_UTF_16_LE            \
 580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 582
 583 #define CATEGORY_MASK_UTF_8     \
 584   (CATEGORY_MASK_UTF_8_AUTO     \
 585    | CATEGORY_MASK_UTF_8_NOSIG  \
 586    | CATEGORY_MASK_UTF_8_SIG)
 587
 588 /* Table of coding categories (Lisp symbols).  This variable is for
 589    internal use only.  */
 590 static Lisp_Object Vcoding_category_table;
 591
 592 /* Table of coding-categories ordered by priority.  */
 593 static enum coding_category coding_priorities[coding_category_max];
 594
 595 /* Nth element is a coding context for the coding system bound to the
 596    Nth coding category.  */
 597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return 0 < encoded_flag + var;
 614 }
  615
      /* Set ATTRS to the attributes of CODING, looked up through
         CODING->id, and set CHARSET_LIST to the charset list taken from
         those attributes.  ATTRS and CHARSET_LIST must be lvalues.  */
  616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  617   do {                                                  \
  618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  620   } while (0)
 621
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
 645
 646
  647 /* Safely get one byte from the source text pointed by SRC which ends
  648    at SRC_END, and set C to that byte.  If there are not enough bytes
  649    in the source, it jumps to 'no_more_source'; before jumping, it
  650    records CODING_RESULT_INSUFFICIENT_SRC in CODING when SRC_BASE is
  651    less than SRC.  If MULTIBYTEP and the byte has the 0x80 bit set:
  652    when the byte is 0xC0 or 0xC1, C is set to
  653    ((C & 1) << 6) | the next byte, consuming that next byte;
         otherwise the character at SRC is read with string_char, C is set
         to the negative value of the character code, and
         CODING_RESULT_INVALID_SRC is recorded in CODING.
         CONSUMED_CHARS is incremented once per successful use.
         The caller should declare and set these variables appropriately
         in advance:
              src, src_base, src_end, multibytep, consumed_chars, coding
         and must provide the label 'no_more_source'.  */
  654
  655 #define ONE_MORE_BYTE(c)                                \
  656   do {                                                  \
  657     if (src == src_end)                                 \
  658       {                                                 \
  659         if (src_base < src)                             \
  660           record_conversion_result                      \
  661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  662         goto no_more_source;                            \
  663       }                                                 \
  664     c = *src++;                                         \
  665     if (multibytep && (c & 0x80))                       \
  666       {                                                 \
  667         if ((c & 0xFE) == 0xC0)                         \
  668           c = ((c & 1) << 6) | *src++;                  \
  669         else                                            \
  670           {                                             \
  671             src--;                                      \
  672             c = - string_char (src, &src, NULL);        \
  673             record_conversion_result                    \
  674               (coding, CODING_RESULT_INVALID_SRC);      \
  675           }                                             \
  676       }                                                 \
  677     consumed_chars++;                                   \
  678   } while (0)
 679
  680 /* Safely get two bytes from the source text pointed by SRC which ends
  681    at SRC_END, and set C1 and C2 to those bytes while skipping the
  682    heading multibyte characters.  If there are not enough bytes in the
  683    source, it jumps to 'no_more_source'.  If MULTIBYTEP, a byte 0xC0
  684    or 0xC1 read for C1 or C2 is combined with the byte after it as
  685    ((byte & 1) << 6) | next byte.  Any other byte with the 0x80 bit
  686    set is handled differently for C1 and C2: for C1, SRC is advanced
  687         by BYTES_BY_CHAR_HEAD (C1) - 1 and reading of C1 starts over;
  688    for C2, C2 is set to -1 and SRC is not advanced any further.
         The caller should declare and set these variables appropriately
         in advance:
              src, src_end, multibytep
         and must provide the label 'no_more_source'.
         It is intended that this macro is used in detect_coding_utf_16.  */
  689
  690 #define TWO_MORE_BYTES(c1, c2)                          \
  691   do {                                                  \
  692     do {                                                \
  693       if (src == src_end)                               \
  694         goto no_more_source;                            \
  695       c1 = *src++;                                      \
  696       if (multibytep && (c1 & 0x80))                    \
  697         {                                               \
  698           if ((c1 & 0xFE) == 0xC0)                      \
  699             c1 = ((c1 & 1) << 6) | *src++;              \
  700           else                                          \
  701             {                                           \
  702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  703               c1 = -1;                                  \
  704             }                                           \
  705         }                                               \
  706     } while (c1 < 0);                                   \
  707     if (src == src_end)                                 \
  708       goto no_more_source;                              \
  709     c2 = *src++;                                        \
  710     if (multibytep && (c2 & 0x80))                      \
  711       {                                                 \
  712         if ((c2 & 0xFE) == 0xC0)                        \
  713           c2 = ((c2 & 1) << 6) | *src++;                \
  714         else                                            \
  715           c2 = -1;                                      \
  716       }                                                 \
  717   } while (0)
 718
 719
  720 /* Store a byte C in the place pointed by DST and increment DST to the
  721    next free point, and increment PRODUCED_CHARS.  The caller should
  722    assure that C is 0..127, and declare and set the variables `dst'
  723    and `produced_chars' appropriately in advance.
  724 */
  725
  726
  727 #define EMIT_ONE_ASCII_BYTE(c)  \
  728   do {                          \
  729     produced_chars++;           \
  730     *dst++ = (c);               \
  731   } while (0)
 732
 733
  734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.
         C1 is stored first, and PRODUCED_CHARS is incremented by 2.  */
  735
  736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  737   do {                                  \
  738     produced_chars += 2;                \
  739     *dst++ = (c1), *dst++ = (c2);       \
  740   } while (0)
 741
 742
  743 /* Store a byte C in the place pointed by DST and increment DST to the
  744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
  745    store in an appropriate multibyte form: a value of 0x80 or more is
  746    first converted with BYTE8_TO_CHAR, and the result is written with
  747    CHAR_STRING_ADVANCE.  The caller should declare and set the
         variables `dst', `multibytep' and `produced_chars' appropriately
         in advance.  */
  748
  749 #define EMIT_ONE_BYTE(c)                \
  750   do {                                  \
  751     produced_chars++;                   \
  752     if (multibytep)                     \
  753       {                                 \
  754         unsigned ch = (c);              \
  755         if (ch >= 0x80)                 \
  756           ch = BYTE8_TO_CHAR (ch);      \
  757         CHAR_STRING_ADVANCE (ch, dst);  \
  758       }                                 \
  759     else                                \
  760       *dst++ = (c);                     \
  761   } while (0)
 762
 763
  764 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  C1 is emitted
         first, each byte gets the same MULTIBYTEP treatment as in
         EMIT_ONE_BYTE, and PRODUCED_CHARS is incremented by 2.  */
  765
  766 #define EMIT_TWO_BYTES(c1, c2)          \
  767   do {                                  \
  768     produced_chars += 2;                \
  769     if (multibytep)                     \
  770       {                                 \
  771         unsigned ch;                    \
  772                                         \
  773         ch = (c1);                      \
  774         if (ch >= 0x80)                 \
  775           ch = BYTE8_TO_CHAR (ch);      \
  776         CHAR_STRING_ADVANCE (ch, dst);  \
  777         ch = (c2);                      \
  778         if (ch >= 0x80)                 \
  779           ch = BYTE8_TO_CHAR (ch);      \
  780         CHAR_STRING_ADVANCE (ch, dst);  \
  781       }                                 \
  782     else                                \
  783       {                                 \
  784         *dst++ = (c1);                  \
  785         *dst++ = (c2);                  \
  786       }                                 \
  787   } while (0)
 788
 789
 790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 791   do {                                  \
 792     EMIT_ONE_BYTE (c1);                 \
 793     EMIT_TWO_BYTES (c2, c3);            \
 794   } while (0)
 795
 796
 797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 798   do {                                          \
 799     EMIT_TWO_BYTES (c1, c2);                    \
 800     EMIT_TWO_BYTES (c3, c4);                    \
 801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* Unless more than BYTES bytes of room remain at dst (the test is
 896    dst + BYTES >= dst_end), allocate more memory for
 897    coding->destination and update dst and dst_end.  The amount
 898    requested from alloc_destination is BYTES plus the number of
        elements still pending between charbuf and charbuf_end.  We don't
        have to take care of coding->source which will be relocated.  It
        is handled by calling coding_set_source in encode_coding.  The
        caller must have `coding', `dst', `dst_end', `charbuf' and
        `charbuf_end' in scope.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* Annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
1070    The format of the following elements depends on ANNOTATION_MASK.
1071
1072    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1073    follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark `coding' (which must be in scope
   at the point of use) as annotated.  The expansion is a single
   statement without a trailing semicolon, so the macro can be used
   safely as the body of an if/else; the invocation itself must be
   followed by a semicolon.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)

/* Store a composition annotation of five elements at BUF: the header
   for NCHARS characters followed by NBYTES and METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)


/* Store a charset annotation of four elements at BUF: the header for
   NCHARS characters followed by the charset ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
1118 /* Bitmasks for coding->eol_seen.  Each end-of-line form has its own
        bit, so several forms seen in the same text can be recorded
        together by OR-ing the masks.  */
1119
1120 #define EOL_SEEN_NONE   0   /* No bit set.  */
1121 #define EOL_SEEN_LF     1   /* A '\n' was seen.  */
1122 #define EOL_SEEN_CR     2   /* A '\r' not followed by '\n' was seen.  */
1123 #define EOL_SEEN_CRLF   4   /* A '\r' immediately followed by '\n' was seen.  */
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
     /* Predicates classifying an octet C by its high-order bits.  */
1136 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)             /* below 0x80 */
1137 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)   /* 10xxxxxx */
1138 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)   /* 110xxxxx */
1139 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)   /* 1110xxxx */
1140 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)   /* 11110xxx */
1141 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)   /* 111110xx */
1142
     /* The three octets that form the UTF-8 signature (BOM).  */
1143 #define UTF_8_BOM_1 0xEF
1144 #define UTF_8_BOM_2 0xBB
1145 #define UTF_8_BOM_3 0xBF
1146
1147 /* Unlike the other detect_coding_XXX, this function counts the number
1148    of characters and checks the EOL format.

        Scan the bytes of CODING->source, starting after the first
        CODING->head_ascii bytes, and check that they are structured as
        UTF-8 sequences of one to five octets (a leading octet followed
        by the matching number of 10xxxxxx octets).  On the first
        sequence that is not so structured, mark CATEGORY_MASK_UTF_8 as
        rejected in DETECT_INFO and return false.  When the source is
        exhausted, record in DETECT_INFO which UTF-8 categories were
        found, store the scanned byte and character counts in
        CODING->detected_utf8_bytes and CODING->detected_utf8_chars, and
        return true; but if CODING_MODE_LAST_BLOCK is set and the source
        ends in the middle of a sequence, reject and return false.  */
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   ptrdiff_t nchars = coding->head_ascii;
       /* NOTE(review): eol_seen is updated in the loop below but is never
          stored back into CODING or otherwise read in this function --
          confirm whether coding->eol_seen was meant to be updated.  */
1160   int eol_seen = coding->eol_seen;
1161
1162   detect_info->checked |= CATEGORY_MASK_UTF_8;
1163   /* A coding system of this category is always ASCII compatible.  */
       /* Skip the head_ascii leading bytes; they are already counted in
          nchars, one character per byte.  */
1164   src += nchars;
1165
1166   if (src == coding->source     /* BOM should be at the head.  */
1167       && src + 3 < src_end      /* BOM is 3-byte long.  */
1168       && src[0] == UTF_8_BOM_1
1169       && src[1] == UTF_8_BOM_2
1170       && src[2] == UTF_8_BOM_3)
1171     {
           /* The three BOM octets count as a single character.  */
1172       bom_found = 1;
1173       src += 3;
1174       nchars++;
1175     }
1176
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4;
1180
           /* Remember where this sequence starts so that a sequence cut
              short by the end of the source can be recognized at
              no_more_source.  */
1181       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined outside this function;
              presumably it jumps to no_more_source when src reaches
              src_end and may yield a negative value for a non-byte
              character of a multibyte source -- confirm against the
              macro.  */
1182       ONE_MORE_BYTE (c);
1183       if (c < 0 || UTF_8_1_OCTET_P (c))
1184         {
1185           nchars++;
1186           if (c == '\r')
1187             {
1188               if (src < src_end && *src == '\n')
1189                 {
                       /* CR LF is consumed as a pair and counted as two
                          characters.  */
1190                   eol_seen |= EOL_SEEN_CRLF;
1191                   src++;
1192                   nchars++;
1193                 }
1194               else
1195                 eol_seen |= EOL_SEEN_CR;
1196             }
1197           else if (c == '\n')
1198             eol_seen |= EOL_SEEN_LF;
1199           continue;
1200         }
           /* From here on C is a leading octet: require one trailing
              octet at a time and stop as soon as the leading octet
              announces that the sequence is complete.  */
1201       ONE_MORE_BYTE (c1);
1202       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1203         break;
1204       if (UTF_8_2_OCTET_LEADING_P (c))
1205         {
1206           nchars++;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c2);
1210       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1211         break;
1212       if (UTF_8_3_OCTET_LEADING_P (c))
1213         {
1214           nchars++;
1215           continue;
1216         }
1217       ONE_MORE_BYTE (c3);
1218       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1219         break;
1220       if (UTF_8_4_OCTET_LEADING_P (c))
1221         {
1222           nchars++;
1223           continue;
1224         }
1225       ONE_MORE_BYTE (c4);
1226       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227         break;
1228       if (UTF_8_5_OCTET_LEADING_P (c)
1229           /* If we ever need to increase MAX_CHAR, the below may need
1230              to be reviewed.  */
1231           && c < MAX_MULTIBYTE_LEADING_CODE)
1232         {
1233           nchars++;
1234           continue;
1235         }
1236       break;
1237     }
       /* The loop is left by `break' only when a sequence is not
          structured as UTF-8.  */
1238   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1239   return 0;
1240
       /* Not referenced by any goto visible in this function; presumably
          the jump target of ONE_MORE_BYTE once the source is exhausted.  */
1241  no_more_source:
       /* src_base < src means the source ended after part of a sequence
          had been read; that is an error only in the last block.  */
1242   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1243     {
1244       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1245       return 0;
1246     }
1247   if (bom_found)
1248     {
1249       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1250       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1251     }
1252   else
1253     {
1254       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1255       if (nchars < src_end - coding->source)
1256         /* The found characters are less than source bytes, which
1257            means that we found a valid non-ASCII characters.  */
1258         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1259     }
       /* Only complete sequences are reported: the byte count stops at
          the start of the last (possibly partial) sequence.  */
1260   coding->detected_utf8_bytes = src_base - coding->source;
1261   coding->detected_utf8_chars = nchars;
1262   return 1;
1263 }
1264
1265
/* Decode the UTF-8 bytes of CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf
   after its first CODING->charbuf_used elements.  Decoding stops when
   charbuf is full or the source is exhausted.  A sequence that is
   malformed, overlong, encodes a surrogate, or exceeds MAX_CHAR is not
   decoded as a unit: its first byte is stored by itself (as is when
   ASCII, through BYTE8_TO_CHAR otherwise) and decoding resumes with
   the byte that follows.  On return CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used describe what was
   completely decoded.  */
1266 static void
1267 decode_coding_utf_8 (struct coding_system *coding)
1268 {
1269   const unsigned char *src = coding->source + coding->consumed;
1270   const unsigned char *src_end = coding->source + coding->src_bytes;
1271   const unsigned char *src_base;
1272   int *charbuf = coding->charbuf + coding->charbuf_used;
1273   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1274   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1275   bool multibytep = coding->src_multibyte;
1276   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1277   bool eol_dos
1278     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when eol_dos is set, or -1 if there
          is none pending.  */
1279   int byte_after_cr = -1;
1280
       /* Unless the coding system is known to have no BOM, look for the
          three-octet signature at the current position.  It is consumed
          if present; otherwise src is rewound to src_base so that the
          bytes are decoded normally.  */
1281   if (bom != utf_without_bom)
1282     {
1283       int c1, c2, c3;
1284
1285       src_base = src;
1286       ONE_MORE_BYTE (c1);
1287       if (! UTF_8_3_OCTET_LEADING_P (c1))
1288         src = src_base;
1289       else
1290         {
1291           ONE_MORE_BYTE (c2);
1292           if (! UTF_8_EXTRA_OCTET_P (c2))
1293             src = src_base;
1294           else
1295             {
1296               ONE_MORE_BYTE (c3);
1297               if (! UTF_8_EXTRA_OCTET_P (c3))
1298                 src = src_base;
1299               else
1300                 {
1301                   if ((c1 != UTF_8_BOM_1)
1302                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1303                     src = src_base;
1304                   else
1305                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1306                 }
1307             }
1308         }
1309     }
       /* Whatever was found, do not look for a BOM again on a later
          call.  (This makes the assignment in the innermost `else'
          above redundant.)  */
1310   CODING_UTF_8_BOM (coding) = utf_without_bom;
1311
1312   while (1)
1313     {
1314       int c, c1, c2, c3, c4, c5;
1315
           /* Remember where this character starts, so that it can be
              re-read on error and so that a partially read character is
              not reported as consumed.  */
1316       src_base = src;
1317       consumed_chars_base = consumed_chars;
1318
1319       if (charbuf >= charbuf_end)
1320         {
               /* A byte read ahead after CR has been taken from src but
                  not decoded yet; step back so that it is read again by
                  the next call.  */
1321           if (byte_after_cr >= 0)
1322             src_base--;
1323           break;
1324         }
1325
1326       /* In the simple case, rapidly handle ordinary characters */
           /* Bytes below 0x80 are copied to charbuf directly, four per
              iteration, as long as a margin of 6 remains both in charbuf
              and in the source.  */
1327       if (multibytep && ! eol_dos
1328           && charbuf < charbuf_end - 6 && src < src_end - 6)
1329         {
1330           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1331             {
1332               c1 = *src;
1333               if (c1 & 0x80)
1334                 break;
1335               src++;
1336               consumed_chars++;
1337               *charbuf++ = c1;
1338
1339               c1 = *src;
1340               if (c1 & 0x80)
1341                 break;
1342               src++;
1343               consumed_chars++;
1344               *charbuf++ = c1;
1345
1346               c1 = *src;
1347               if (c1 & 0x80)
1348                 break;
1349               src++;
1350               consumed_chars++;
1351               *charbuf++ = c1;
1352
1353               c1 = *src;
1354               if (c1 & 0x80)
1355                 break;
1356               src++;
1357               consumed_chars++;
1358               *charbuf++ = c1;
1359             }
1360           /* If we handled at least one character, restart the main loop.  */
1361           if (src != src_base)
1362             continue;
1363         }
1364
           /* Take the pending read-ahead byte first, if there is one.  */
1365       if (byte_after_cr >= 0)
1366         c1 = byte_after_cr, byte_after_cr = -1;
1367       else
1368         ONE_MORE_BYTE (c1);
1369       if (c1 < 0)
1370         {
               /* NOTE(review): presumably ONE_MORE_BYTE yields the negated
                  character code for a non-byte character of a multibyte
                  source -- confirm against the macro.  */
1371           c = - c1;
1372         }
1373       else if (UTF_8_1_OCTET_P (c1))
1374         {
               /* Read the byte after CR now and keep it for the next
                  iteration.  NOTE(review): nothing visible here drops the
                  CR of a CR LF pair; presumably the end-of-line
                  conversion itself happens elsewhere -- confirm.  */
1375           if (eol_dos && c1 == '\r')
1376             ONE_MORE_BYTE (byte_after_cr);
1377           c = c1;
1378         }
1379       else
1380         {
1381           ONE_MORE_BYTE (c2);
1382           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1383             goto invalid_code;
1384           if (UTF_8_2_OCTET_LEADING_P (c1))
1385             {
1386               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1387               /* Reject overlong sequences here and below.  Encoders
1388                  producing them are incorrect, they can be misleading,
1389                  and they mess up read/write invariance.  */
1390               if (c < 128)
1391                 goto invalid_code;
1392             }
1393           else
1394             {
1395               ONE_MORE_BYTE (c3);
1396               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1397                 goto invalid_code;
1398               if (UTF_8_3_OCTET_LEADING_P (c1))
1399                 {
1400                   c = (((c1 & 0xF) << 12)
1401                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1402                   if (c < 0x800
1403                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1404                     goto invalid_code;
1405                 }
1406               else
1407                 {
1408                   ONE_MORE_BYTE (c4);
1409                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1410                     goto invalid_code;
1411                   if (UTF_8_4_OCTET_LEADING_P (c1))
1412                     {
1413                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1414                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1415                     if (c < 0x10000)
1416                       goto invalid_code;
1417                     }
1418                   else
1419                     {
1420                       ONE_MORE_BYTE (c5);
1421                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1422                         goto invalid_code;
1423                       if (UTF_8_5_OCTET_LEADING_P (c1))
1424                         {
1425                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1426                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1427                                | (c5 & 0x3F));
                               /* Reject codes beyond MAX_CHAR as well as
                                  overlong five-octet forms.  */
1428                           if ((c > MAX_CHAR) || (c < 0x200000))
1429                             goto invalid_code;
1430                         }
1431                       else
1432                         goto invalid_code;
1433                     }
1434                 }
1435             }
1436         }
1437
1438       *charbuf++ = c;
1439       continue;
1440
1441     invalid_code:
           /* Go back to the start of the bad sequence and store only its
              first byte; the bytes after it are examined again by the
              following iterations.  */
1442       src = src_base;
1443       consumed_chars = consumed_chars_base;
1444       ONE_MORE_BYTE (c);
1445       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1446     }
1447
       /* Reached by the `break' above; presumably also the jump target of
          ONE_MORE_BYTE once the source is exhausted.  Only what was
          completely decoded is reported: src_base and
          consumed_chars_base exclude a partially read character.  */
1448  no_more_source:
1449   coding->consumed_char += consumed_chars_base;
1450   coding->consumed = src_base - coding->source;
1451   coding->charbuf_used = charbuf - coding->charbuf;
1452 }
1453
1454
1455 bool
1456 encode_coding_utf_8 (struct coding_system *coding)
1457 {
1458   bool multibytep = coding->dst_multibyte;
1459   int *charbuf = coding->charbuf;
1460   int *charbuf_end = charbuf + coding->charbuf_used;
1461   unsigned char *dst = coding->destination + coding->produced;
1462   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1463   ptrdiff_t produced_chars = 0;
1464   int c;
1465
1466   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1467     {
1468       ASSURE_DESTINATION (3);
1469       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1470       CODING_UTF_8_BOM (coding) = utf_without_bom;
1471     }
1472
1473   if (multibytep)
1474     {
1475       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1476
1477       while (charbuf < charbuf_end)
1478         {
1479           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1480
1481           ASSURE_DESTINATION (safe_room);
1482           c = *charbuf++;
1483           if (CHAR_BYTE8_P (c))
1484             {
1485               c = CHAR_TO_BYTE8 (c);
1486               EMIT_ONE_BYTE (c);
1487             }
1488           else
1489             {
1490               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1491               for (p = str; p < pend; p++)
1492                 EMIT_ONE_BYTE (*p);
1493             }
1494         }
1495     }
1496   else
1497     {
1498       int safe_room = MAX_MULTIBYTE_LENGTH;
1499
1500       while (charbuf < charbuf_end)
1501         {
1502           ASSURE_DESTINATION (safe_room);
1503           c = *charbuf++;
1504           if (CHAR_BYTE8_P (c))
1505             *dst++ = CHAR_TO_BYTE8 (c);
1506           else
1507             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1508         }
1509       produced_chars = dst - (coding->destination + coding->produced);
1510     }
1511   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1512   coding->produced_char += produced_chars;
1513   coding->produced = dst - coding->destination;
1514   return 0;
1515 }
1516
1517
1518 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1519    Return true if a text is encoded in one of UTF-16 based coding systems.

        The decision is based on the first two bytes of CODING->source.
        If they are a byte order mark (FF FE or FE FF), the matching
        UTF-16 categories are recorded as found in DETECT_INFO, the
        others as rejected, and the result is true.  Otherwise the
        BOM-dependent categories are rejected, the remaining bytes are
        examined only to rule out the BOM-less categories for data that
        looks binary, and the result is false.  */
1520
1521 static bool
1522 detect_coding_utf_16 (struct coding_system *coding,
1523                       struct coding_detection_info *detect_info)
1524 {
1525   const unsigned char *src = coding->source;
1526   const unsigned char *src_end = coding->source + coding->src_bytes;
1527   bool multibytep = coding->src_multibyte;
1528   int c1, c2;
1529
1530   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* A complete text with an odd src_chars count cannot consist of
          two-byte units.  */
1531   if (coding->mode & CODING_MODE_LAST_BLOCK
1532       && (coding->src_chars & 1))
1533     {
1534       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1535       return 0;
1536     }
1537
       /* NOTE(review): TWO_MORE_BYTES is defined outside this function;
          presumably it jumps to no_more_source when fewer than two bytes
          remain and may yield a negative value for a non-byte character
          of a multibyte source -- confirm against the macro.  */
1538   TWO_MORE_BYTES (c1, c2);
1539   if ((c1 == 0xFF) && (c2 == 0xFE))
1540     {
           /* FF FE: byte order mark of little-endian text.  */
1541       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1542                              | CATEGORY_MASK_UTF_16_AUTO);
1543       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1544                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1545                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1546     }
1547   else if ((c1 == 0xFE) && (c2 == 0xFF))
1548     {
           /* FE FF: byte order mark of big-endian text.  */
1549       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1550                              | CATEGORY_MASK_UTF_16_AUTO);
1551       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1552                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1553                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1554     }
1555   else if (c2 < 0)
1556     {
1557       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1558       return 0;
1559     }
1560   else
1561     {
1562       /* We check the dispersion of Eth and Oth bytes where E is even and
1563          O is odd.  If both are high, we assume binary data.*/
           /* e[] and o[] flag the byte values already seen at even and odd
              positions; e_num and o_num count the distinct values.  The
              first byte pair is entered here, hence the counts start
              at 1.  */
1564       unsigned char e[256], o[256];
1565       unsigned e_num = 1, o_num = 1;
1566
1567       memset (e, 0, 256);
1568       memset (o, 0, 256);
1569       e[c1] = 1;
1570       o[c2] = 1;
1571
           /* Without a BOM, the categories that depend on one are out.  */
1572       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1573                                 |CATEGORY_MASK_UTF_16_BE
1574                                 | CATEGORY_MASK_UTF_16_LE);
1575
           /* Keep scanning until every UTF-16 category is rejected or the
              input ends.  */
1576       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1577              != CATEGORY_MASK_UTF_16)
1578         {
1579           TWO_MORE_BYTES (c1, c2);
1580           if (c2 < 0)
1581             break;
1582           if (! e[c1])
1583             {
1584               e[c1] = 1;
1585               e_num++;
                   /* 128 or more distinct values at even positions rule
                      out big-endian text without BOM.  */
1586               if (e_num >= 128)
1587                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1588             }
1589           if (! o[c2])
1590             {
1591               o[c2] = 1;
1592               o_num++;
                   /* Likewise for odd positions and little-endian text
                      without BOM.  */
1593               if (o_num >= 128)
1594                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1595             }
1596         }
1597       return 0;
1598     }
1599
       /* Reached by falling through after a BOM was recognized;
          presumably also the jump target of TWO_MORE_BYTES once the
          source is exhausted.  */
1600  no_more_source:
1601   return 1;
1602 }
1603
1604 static void
1605 decode_coding_utf_16 (struct coding_system *coding)
1606 {
1607   const unsigned char *src = coding->source + coding->consumed;
1608   const unsigned char *src_end = coding->source + coding->src_bytes;
1609   const unsigned char *src_base;
1610   int *charbuf = coding->charbuf + coding->charbuf_used;
1611   /* We may produces at most 3 chars in one loop.  */
1612   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1613   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1614   bool multibytep = coding->src_multibyte;
1615   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1616   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1617   int surrogate = CODING_UTF_16_SURROGATE (coding);
1618   bool eol_dos
1619     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1620   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1621
1622   if (bom == utf_with_bom)
1623     {
1624       int c, c1, c2;
1625
1626       src_base = src;
1627       ONE_MORE_BYTE (c1);
1628       ONE_MORE_BYTE (c2);
1629       c = (c1 << 8) | c2;
1630
1631       if (endian == utf_16_big_endian
1632           ? c != 0xFEFF : c != 0xFFFE)
1633         {
1634           /* The first two bytes are not BOM.  Treat them as bytes
1635              for a normal character.  */
1636           src = src_base;
1637         }
1638       CODING_UTF_16_BOM (coding) = utf_without_bom;
1639     }
1640   else if (bom == utf_detect_bom)
1641     {
1642       /* We have already tried to detect BOM and failed in
1643          detect_coding.  */
1644       CODING_UTF_16_BOM (coding) = utf_without_bom;
1645     }
1646
1647   while (1)
1648     {
1649       int c, c1, c2;
1650
1651       src_base = src;
1652       consumed_chars_base = consumed_chars;
1653
1654       if (charbuf >= charbuf_end)
1655         {
1656           if (byte_after_cr1 >= 0)
1657             src_base -= 2;
1658           break;
1659         }
1660
1661       if (byte_after_cr1 >= 0)
1662         c1 = byte_after_cr1, byte_after_cr1 = -1;
1663       else
1664         ONE_MORE_BYTE (c1);
1665       if (c1 < 0)
1666         {
1667           *charbuf++ = -c1;
1668           continue;
1669         }
1670       if (byte_after_cr2 >= 0)
1671         c2 = byte_after_cr2, byte_after_cr2 = -1;
1672       else
1673         ONE_MORE_BYTE (c2);
1674       if (c2 < 0)
1675         {
1676           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1677           *charbuf++ = -c2;
1678           continue;
1679         }
1680       c = (endian == utf_16_big_endian
1681            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1682
1683       if (surrogate)
1684         {
1685           if (! UTF_16_LOW_SURROGATE_P (c))
1686             {
1687               if (endian == utf_16_big_endian)
1688                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1689               else
1690                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1691               *charbuf++ = c1;
1692               *charbuf++ = c2;
1693               if (UTF_16_HIGH_SURROGATE_P (c))
1694                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1695               else
1696                 *charbuf++ = c;
1697             }
1698           else
1699             {
1700               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1701               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1702               *charbuf++ = 0x10000 + c;
1703             }
1704         }
1705       else
1706         {
1707           if (UTF_16_HIGH_SURROGATE_P (c))
1708             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1709           else
1710             {
1711               if (eol_dos && c == '\r')
1712                 {
1713                   ONE_MORE_BYTE (byte_after_cr1);
1714                   ONE_MORE_BYTE (byte_after_cr2);
1715                 }
1716               *charbuf++ = c;
1717             }
1718         }
1719     }
1720
1721  no_more_source:
1722   coding->consumed_char += consumed_chars_base;
1723   coding->consumed = src_base - coding->source;
1724   coding->charbuf_used = charbuf - coding->charbuf;
1725 }
1726
1727 static bool
1728 encode_coding_utf_16 (struct coding_system *coding)
1729 {
1730   bool multibytep = coding->dst_multibyte;
1731   int *charbuf = coding->charbuf;
1732   int *charbuf_end = charbuf + coding->charbuf_used;
1733   unsigned char *dst = coding->destination + coding->produced;
1734   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1735   int safe_room = 8;
1736   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1737   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1738   ptrdiff_t produced_chars = 0;
1739   int c;
1740
1741   if (bom != utf_without_bom)
1742     {
1743       ASSURE_DESTINATION (safe_room);
1744       if (big_endian)
1745         EMIT_TWO_BYTES (0xFE, 0xFF);
1746       else
1747         EMIT_TWO_BYTES (0xFF, 0xFE);
1748       CODING_UTF_16_BOM (coding) = utf_without_bom;
1749     }
1750
1751   while (charbuf < charbuf_end)
1752     {
1753       ASSURE_DESTINATION (safe_room);
1754       c = *charbuf++;
1755       if (c > MAX_UNICODE_CHAR)
1756         c = coding->default_char;
1757
1758       if (c < 0x10000)
1759         {
1760           if (big_endian)
1761             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1762           else
1763             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1764         }
1765       else
1766         {
1767           int c1, c2;
1768
1769           c -= 0x10000;
1770           c1 = (c >> 10) + 0xD800;
1771           c2 = (c & 0x3FF) + 0xDC00;
1772           if (big_endian)
1773             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1774           else
1775             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1776         }
1777     }
1778   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1779   coding->produced = dst - coding->destination;
1780   coding->produced_char += produced_chars;
1781   return 0;
1782 }
1783
1784 \f
1785 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1786
1787 /* Emacs' internal format for representation of multiple character
1788    sets is a kind of multi-byte encoding, i.e. characters are
1789    represented by variable-length sequences of one-byte codes.
1790
1791    ASCII characters and control characters (e.g. `tab', `newline') are
1792    represented by one-byte sequences which are their ASCII codes, in
1793    the range 0x00 through 0x7F.
1794
1795    8-bit characters of the range 0x80..0x9F are represented by
1796    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1797    code + 0x20).
1798
1799    8-bit characters of the range 0xA0..0xFF are represented by
1800    one-byte sequences which are their 8-bit code.
1801
1802    The other characters are represented by a sequence of `base
1803    leading-code', optional `extended leading-code', and one or two
1804    `position-code's.  The length of the sequence is determined by the
1805    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1806    whereas extended leading-code and position-code take the range 0xA0
1807    through 0xFF.  See `charset.h' for more details about leading-code
1808    and position-code.
1809
1810    --- CODE RANGE of Emacs' internal format ---
1811    character set        range
1812    -------------        -----
1813    ascii                0x00..0x7F
1814    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1816    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1817    ---------------------------------------------
1818
1819    As this is the internal character representation, the format is
1820    usually not used externally (i.e. in a file or in a data sent to a
1821    process).  But, it is possible to have a text externally in this
1822    format (i.e. by encoding by the coding system `emacs-mule').
1823
1824    In that case, a sequence of one-byte codes has a slightly different
1825    form.
1826
1827    At first, all characters in eight-bit-control are represented by
1828    one-byte sequences which are their 8-bit code.
1829
1830    Next, character composition data are represented by the byte
1831    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1832    where,
1833         METHOD is 0xF2 plus one of composition method (enum
1834         composition_method),
1835
1836         BYTES is 0xA0 plus a byte length of this composition data,
1837
1838         CHARS is 0xA0 plus a number of characters composed by this
1839         data,
1840
        COMPONENTs are characters in multibyte form or composition
        rules, each rule encoded as two bytes of ASCII codes.
1843
1844    In addition, for backward compatibility, the following formats are
1845    also recognized as composition data on decoding.
1846
1847    0x80 MSEQ ...
1848    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1849
1850    Here,
1851         MSEQ is a multibyte form but in these special format:
1852           ASCII: 0xA0 ASCII_CODE+0x80,
1853           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1854         RULE is a one byte code of the range 0xA0..0xF0 that
1855         represents a composition rule.
1856   */
1857
/* Table indexed by the first byte of an emacs-mule sequence.  The
   code in this file treats the entry as the total number of bytes in
   the sequence (values 1 through 4).  The table is filled in
   elsewhere.  */
char emacs_mule_bytes[256];
1859
1860
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in 'emacs-mule'.

   More precisely: scan CODING->source, skipping the first
   CODING->head_ascii bytes.  If a byte sequence that cannot be
   emacs-mule is seen, or the source ends in the middle of a sequence
   while CODING_MODE_LAST_BLOCK is set, set CATEGORY_MASK_EMACS_MULE
   in DETECT_INFO->rejected and return 0.  Otherwise, when the source
   is exhausted, return 1; CATEGORY_MASK_EMACS_MULE is added to
   DETECT_INFO->found only if at least one multibyte or composition
   sequence was actually seen.  */

static bool
detect_coding_emacs_mule (struct coding_system *coding,
                          struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* MULTIBYTEP and CONSUMED_CHARS are not used directly below;
     presumably ONE_MORE_BYTE refers to them.  */
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int c;
  /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Leaving this loop with `break' means "rejected".  The loop
     otherwise ends through the `no_more_source' label, which
     ONE_MORE_BYTE is assumed to jump to at end of input.  */
  while (1)
    {
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0)
        continue;
      if (c == 0x80)
        {
          /* Perhaps the start of a composite character.  We simply
             skip it because analyzing it is too heavy for detecting.
             But, at least, we check that more than 4 bytes were read
             after the 0x80: the run of bytes >= 0xA0 plus the byte
             that ended the run.  */
          const unsigned char *src_start;

        repeat:
          src_start = src;
          do
            {
              ONE_MORE_BYTE (c);
            }
          while (c >= 0xA0);

          if (src - src_start <= 4)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
          /* The byte that ended the run may start another
             composition.  */
          if (c == 0x80)
            goto repeat;
        }

      if (c < 0x80)
        {
          /* ESC, SI and SO reject this category.  */
          if (c < 0x20
              && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
            break;
        }
      else
        {
          /* C is treated as a leading byte; the table gives the
             total length of the sequence.  */
          int more_bytes = emacs_mule_bytes[c] - 1;

          while (more_bytes > 0)
            {
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                {
                  src--;        /* Unread the last byte.  */
                  break;
                }
              more_bytes--;
            }
          /* The sequence was cut short by a byte below 0xA0.  */
          if (more_bytes != 0)
            break;
          found = CATEGORY_MASK_EMACS_MULE;
        }
    }
  detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  return 0;

 no_more_source:
  /* SRC_BASE < SRC means the source ended inside a sequence.  That is
     an error only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
1945
1946
/* Parse emacs-mule multibyte sequence at SRC and return the decoded
   character.  If CMP_STATUS indicates that we must expect MSEQ or
   RULE described above, decode it and return the negative value of
   the decoded character or rule.  If an invalid byte is found, return
   -1.  If SRC is too short, return -2.

   On success, store the number of bytes consumed in *NBYTES and the
   value of the local character counter in *NCHARS.  If ID is
   non-null, also store the charset ID of the character in *ID; this
   is not done on the path that returns a rule byte.  None of the
   output parameters is written when -1 or -2 is returned.  */

static int
emacs_mule_char (struct coding_system *coding, const unsigned char *src,
                 int *nbytes, int *nchars, int *id,
                 struct composition_status *cmp_status)
{
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base = src;
  /* MULTIBYTEP and CONSUMED_CHARS are presumably used by
     ONE_MORE_BYTE, which is defined elsewhere.  */
  bool multibytep = coding->src_multibyte;
  int charset_ID;
  unsigned code;
  int c;
  ptrdiff_t consumed_chars = 0;
  /* True if the sequence was an old-style MSEQ; the result is then
     returned negated.  */
  bool mseq_found = 0;

  ONE_MORE_BYTE (c);
  if (c < 0)
    {
      /* A negative value from ONE_MORE_BYTE is taken as an already
         decoded character, stored negated.  */
      c = -c;
      charset_ID = emacs_mule_charset[0];
    }
  else
    {
      if (c >= 0xA0)
        {
          /* A first byte of 0xA0 or above is valid only inside an
             old-form composition.  */
          if (cmp_status->state != COMPOSING_NO
              && cmp_status->old_form)
            {
              if (cmp_status->state == COMPOSING_CHAR)
                {
                  /* MSEQ: convert the first byte back to what the
                     switch below expects.  */
                  if (c == 0xA0)
                    {
                      /* ASCII: 0xA0 ASCII_CODE+0x80.  */
                      ONE_MORE_BYTE (c);
                      c -= 0x80;
                      if (c < 0)
                        goto invalid_code;
                    }
                  else
                    /* Other: LEADING_CODE+0x20 ...  */
                    c -= 0x20;
                  mseq_found = 1;
                }
              else
                {
                  /* A rule is expected; return the raw byte
                     negated.  */
                  *nbytes = src - src_base;
                  *nchars = consumed_chars;
                  return -c;
                }
            }
          else
            goto invalid_code;
        }

      /* Dispatch on the total byte length implied by the leading
         byte.  */
      switch (emacs_mule_bytes[c])
        {
        case 2:
          if ((charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = c & 0x7F;
          break;

        case 3:
          if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
              || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
            {
              /* Private leading code: the charset is selected by the
                 second byte, followed by one position byte.  */
              ONE_MORE_BYTE (c);
              if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = c & 0x7F;
            }
          else
            {
              /* The charset is selected by the leading byte, followed
                 by two position bytes.  */
              if ((charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = (c & 0x7F) << 8;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code |= c & 0x7F;
            }
          break;

        case 4:
          /* The charset is selected by the second byte, followed by
             two position bytes.  */
          ONE_MORE_BYTE (c);
          if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = (c & 0x7F) << 8;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code |= c & 0x7F;
          break;

        case 1:
          code = c;
          charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
          break;

        default:
          emacs_abort ();
        }
      /* Stores the decoded character in C; a negative result is
         treated as invalid.  */
      CODING_DECODE_CHAR (coding, src, src_base, src_end,
                          CHARSET_FROM_ID (charset_ID), code, c);
      if (c < 0)
        goto invalid_code;
    }
  *nbytes = src - src_base;
  *nchars = consumed_chars;
  if (id)
    *id = charset_ID;
  return (mseq_found ? -c : c);

 no_more_source:
  return -2;

 invalid_code:
  return -1;
}
2081
2082
2083 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2084
/* Handle these composition sequences ('|': the end of header elements,
   BYTES and CHARS >= 0xA0):
2087
2088    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2089    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2090    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2091
2092    and these old form:
2093
2094    (4) relative composition: 0x80 | MSEQ ... MSEQ
2095    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2096
2097    When the starter 0x80 and the following header elements are found,
2098    this annotation header is produced.
2099
2100         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2101
2102    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2103    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2104
2105    Then, upon reading the following elements, these codes are produced
2106    until the composition end is found:
2107
2108    (1) CHAR ... CHAR
2109    (2) ALT ... ALT CHAR ... CHAR
2110    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2111    (4) CHAR ... CHAR
2112    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2113
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2116
2117    (1) LENGTH: unchanged, NCHARS: unchanged
2118    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2119    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2120    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2121    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2122
2123    If an error is found while composing, the annotation header is
2124    changed to the original composition header (plus filler -1s) as
2125    below:
2126
2127    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2128    (5)          [ 0x80 0xFF -1 -1- -1 ]
2129
2130    and the sequence [ -2 DECODED-RULE ] is changed to the original
2131    byte sequence as below:
2132         o the original byte sequence is B: [ B -1 ]
2133         o the original byte sequence is B1 B2: [ B1 B2 ]
2134
   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (and thus don't
   increase the size of the compiled object).  */
2139
/* Decode a composition rule represented by C as a component of
   composition sequence of Emacs 20 style.  Set RULE to the decoded
   rule.

   C is modified: 0xA0 is subtracted from it, and the result must be
   in the range 0..80.  It is then split into GREF = C / 9 and
   NREF = C % 9, with the value 4 replaced by 10 in each.  The
   expansion site must provide the label `invalid_code', which is the
   target when C is out of range.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (c < 0 || c >= 81)                               \
      goto invalid_code;                                \
    gref = c / 9, nref = c % 9;                         \
    if (gref == 4) gref = 10;                           \
    if (nref == 4) nref = 10;                           \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2156
2157
/* Decode a composition rule represented by C and the following byte
   at SRC as a component of composition sequence of Emacs 21 style.
   Set RULE to the decoded rule.

   GREF is C - 0x20.  The next byte is then read into C with
   ONE_MORE_BYTE, and NREF is that byte minus 0x20.  Each must be in
   the range 0..80.  The expansion site must provide the label
   `invalid_code' and whatever ONE_MORE_BYTE requires.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (gref < 0 || gref >= 81)                         \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (nref < 0 || nref >= 81)                         \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2175
2176
/* Start of Emacs 21 style format.  On entry C holds the method byte
   (0xF2 plus the composition method).  The next two bytes at SRC are
   BYTES and CHARS, each offset by 0xA0, where BYTES is the byte
   length of this composition information and CHARS is the number of
   characters composed by this composition.

   The sequence is rejected (goto invalid_code) if the decoded BYTES
   is less than 3, if the method is COMPOSITION_RELATIVE and BYTES is
   not 4, or if CHARS is not in the range
   1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise *CMP_STATUS is set up
   for the new composition and the annotation header is added with
   ADD_COMPOSITION_DATA.  Uses C, CHARBUF and CMP_STATUS of the
   expansion site.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2208
2209
/* Start of Emacs 20 style format for relative composition.

   Marks *CMP_STATUS as an old-form COMPOSITION_RELATIVE composition
   in state COMPOSING_CHAR with zero characters and components so far,
   and adds an annotation header whose NCHARS and NBYTES are 0.  Uses
   CHARBUF and CMP_STATUS of the expansion site.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2221
2222
/* Start of Emacs 20 style format for rule-base composition.

   Marks *CMP_STATUS as an old-form COMPOSITION_WITH_RULE composition
   in state COMPOSING_CHAR with zero characters and components so far,
   and adds an annotation header whose NCHARS and NBYTES are 0.  Uses
   CHARBUF and CMP_STATUS of the expansion site.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2234
2235
/* Read the byte following a composition starter and begin the
   matching kind of composition:

     - a byte B for which B - 0xF2 lies between COMPOSITION_RELATIVE
       and COMPOSITION_WITH_RULE_ALTCHARS starts an Emacs 21 style
       composition;
     - otherwise a byte in 0xA0..0xBF starts an Emacs 20 style
       relative composition, and SRC is rewound so that the byte is
       read again as the first component;
     - otherwise 0xFF starts an Emacs 20 style rule-base composition;
     - anything else, including a negative value from ONE_MORE_BYTE,
       goes to `invalid_code'.

   Uses C, SRC, CHARBUF and CMP_STATUS of the expansion site.  */
#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c < 0xA0)                                  \
      goto invalid_code;                                \
    else if (c < 0xC0)                                  \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2259
/* Finish a composition that ended normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, store the number of characters collected in header
   element 2.  Otherwise, if the method is greater than
   COMPOSITION_RELATIVE, replace header element 0 with header element
   2 minus CMP_STATUS->length.  In every case reset the state to
   COMPOSING_NO.  Uses CHARBUF and CMP_STATUS of the expansion
   site.  */
#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int idx = - cmp_status->length;                             \
                                                                \
    if (cmp_status->old_form)                                   \
      charbuf[idx + 2] = cmp_status->nchars;                    \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2270
2271
2272 static int
2273 emacs_mule_finish_composition (int *charbuf,
2274                                struct composition_status *cmp_status)
2275 {
2276   int idx = - cmp_status->length;
2277   int new_chars;
2278
2279   if (cmp_status->old_form && cmp_status->nchars > 0)
2280     {
2281       charbuf[idx + 2] = cmp_status->nchars;
2282       new_chars = 0;
2283       if (cmp_status->method == COMPOSITION_WITH_RULE
2284           && cmp_status->state == COMPOSING_CHAR)
2285         {
2286           /* The last rule was invalid.  */
2287           int rule = charbuf[-1] + 0xA0;
2288
2289           charbuf[-2] = BYTE8_TO_CHAR (rule);
2290           charbuf[-1] = -1;
2291           new_chars = 1;
2292         }
2293     }
2294   else
2295     {
2296       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2297
2298       if (cmp_status->method == COMPOSITION_WITH_RULE)
2299         {
2300           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2301           charbuf[idx++] = -3;
2302           charbuf[idx++] = 0;
2303           new_chars = 1;
2304         }
2305       else
2306         {
2307           int nchars = charbuf[idx + 1] + 0xA0;
2308           int nbytes = charbuf[idx + 2] + 0xA0;
2309
2310           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2311           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2312           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2313           charbuf[idx++] = -1;
2314           new_chars = 4;
2315         }
2316     }
2317   cmp_status->state = COMPOSING_NO;
2318   return new_chars;
2319 }
2320
/* If a composition is in progress (the state is not COMPOSING_NO),
   finish it with emacs_mule_finish_composition and add the value it
   returns to CHAR_OFFSET.  Uses CHARBUF, CMP_STATUS and CHAR_OFFSET
   of the expansion site.  */
#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
2326
2327
2328 static void
2329 decode_coding_emacs_mule (struct coding_system *coding)
2330 {
2331   const unsigned char *src = coding->source + coding->consumed;
2332   const unsigned char *src_end = coding->source + coding->src_bytes;
2333   const unsigned char *src_base;
2334   int *charbuf = coding->charbuf + coding->charbuf_used;
2335   /* We may produce two annotations (charset and composition) in one
2336      loop and one more charset annotation at the end.  */
2337   int *charbuf_end
2338     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2339       /* We can produce up to 2 characters in a loop.  */
2340       - 1;
2341   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2342   bool multibytep = coding->src_multibyte;
2343   ptrdiff_t char_offset = coding->produced_char;
2344   ptrdiff_t last_offset = char_offset;
2345   int last_id = charset_ascii;
2346   bool eol_dos
2347     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2348   int byte_after_cr = -1;
2349   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2350
2351   if (cmp_status->state != COMPOSING_NO)
2352     {
2353       int i;
2354
2355       if (charbuf_end - charbuf < cmp_status->length)
2356         emacs_abort ();
2357       for (i = 0; i < cmp_status->length; i++)
2358         *charbuf++ = cmp_status->carryover[i];
2359       coding->annotated = 1;
2360     }
2361
2362   while (1)
2363     {
2364       int c;
2365       int id UNINIT;
2366
2367       src_base = src;
2368       consumed_chars_base = consumed_chars;
2369
2370       if (charbuf >= charbuf_end)
2371         {
2372           if (byte_after_cr >= 0)
2373             src_base--;
2374           break;
2375         }
2376
2377       if (byte_after_cr >= 0)
2378         c = byte_after_cr, byte_after_cr = -1;
2379       else
2380         ONE_MORE_BYTE (c);
2381
2382       if (c < 0 || c == 0x80)
2383         {
2384           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2385           if (c < 0)
2386             {
2387               *charbuf++ = -c;
2388               char_offset++;
2389             }
2390           else
2391             DECODE_EMACS_MULE_COMPOSITION_START ();
2392           continue;
2393         }
2394
2395       if (c < 0x80)
2396         {
2397           if (eol_dos && c == '\r')
2398             ONE_MORE_BYTE (byte_after_cr);
2399           id = charset_ascii;
2400           if (cmp_status->state != COMPOSING_NO)
2401             {
2402               if (cmp_status->old_form)
2403                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2404               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2405                 cmp_status->ncomps--;
2406             }
2407         }
2408       else
2409         {
2410           int nchars UNINIT, nbytes UNINIT;
2411           /* emacs_mule_char can load a charset map from a file, which
2412              allocates a large structure and might cause buffer text
2413              to be relocated as result.  Thus, we need to remember the
2414              original pointer to buffer text, and fix up all related
2415              pointers after the call.  */
2416           const unsigned char *orig = coding->source;
2417           ptrdiff_t offset;
2418
2419           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2420                                cmp_status);
2421           offset = coding->source - orig;
2422           if (offset)
2423             {
2424               src += offset;
2425               src_base += offset;
2426               src_end += offset;
2427             }
2428           if (c < 0)
2429             {
2430               if (c == -1)
2431                 goto invalid_code;
2432               if (c == -2)
2433                 break;
2434             }
2435           src = src_base + nbytes;
2436           consumed_chars = consumed_chars_base + nchars;
2437           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2438             cmp_status->ncomps -= nchars;
2439         }
2440
2441       /* Now if C >= 0, we found a normally encoded character, if C <
2442          0, we found an old-style composition component character or
2443          rule.  */
2444
2445       if (cmp_status->state == COMPOSING_NO)
2446         {
2447           if (last_id != id)
2448             {
2449               if (last_id != charset_ascii)
2450                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2451                                   last_id);
2452               last_id = id;
2453               last_offset = char_offset;
2454             }
2455           *charbuf++ = c;
2456           char_offset++;
2457         }
2458       else if (cmp_status->state == COMPOSING_CHAR)
2459         {
2460           if (cmp_status->old_form)
2461             {
2462               if (c >= 0)
2463                 {
2464                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2465                   *charbuf++ = c;
2466                   char_offset++;
2467                 }
2468               else
2469                 {
2470                   *charbuf++ = -c;
2471                   cmp_status->nchars++;
2472                   cmp_status->length++;
2473                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2474                     EMACS_MULE_COMPOSITION_END ();
2475                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2476                     cmp_status->state = COMPOSING_RULE;
2477                 }
2478             }
2479           else
2480             {
2481               *charbuf++ = c;
2482               cmp_status->length++;
2483               cmp_status->nchars--;
2484               if (cmp_status->nchars == 0)
2485                 EMACS_MULE_COMPOSITION_END ();
2486             }
2487         }
2488       else if (cmp_status->state == COMPOSING_RULE)
2489         {
2490           int rule;
2491
2492           if (c >= 0)
2493             {
2494               EMACS_MULE_COMPOSITION_END ();
2495               *charbuf++ = c;
2496               char_offset++;
2497             }
2498           else
2499             {
2500               c = -c;
2501               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2502               if (rule < 0)
2503                 goto invalid_code;
2504               *charbuf++ = -2;
2505               *charbuf++ = rule;
2506               cmp_status->length += 2;
2507               cmp_status->state = COMPOSING_CHAR;
2508             }
2509         }
2510       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2511         {
2512           *charbuf++ = c;
2513           cmp_status->length++;
2514           if (cmp_status->ncomps == 0)
2515             cmp_status->state = COMPOSING_CHAR;
2516           else if (cmp_status->ncomps > 0)
2517             {
2518               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2519                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2520             }
2521           else
2522             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2523         }
2524       else                      /* COMPOSING_COMPONENT_RULE */
2525         {
2526           int rule;
2527
2528           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2529           if (rule < 0)
2530             goto invalid_code;
2531           *charbuf++ = -2;
2532           *charbuf++ = rule;
2533           cmp_status->length += 2;
2534           cmp_status->ncomps--;
2535           if (cmp_status->ncomps > 0)
2536             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2537           else
2538             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2539         }
2540       continue;
2541
2542     invalid_code:
2543       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2544       src = src_base;
2545       consumed_chars = consumed_chars_base;
2546       ONE_MORE_BYTE (c);
2547       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2548       char_offset++;
2549     }
2550
2551  no_more_source:
2552   if (cmp_status->state != COMPOSING_NO)
2553     {
2554       if (coding->mode & CODING_MODE_LAST_BLOCK)
2555         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2556       else
2557         {
2558           int i;
2559
2560           charbuf -= cmp_status->length;
2561           for (i = 0; i < cmp_status->length; i++)
2562             cmp_status->carryover[i] = charbuf[i];
2563         }
2564     }
2565   if (last_id != charset_ascii)
2566     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2567   coding->consumed_char += consumed_chars_base;
2568   coding->consumed = src_base - coding->source;
2569   coding->charbuf_used = charbuf - coding->charbuf;
2570 }
2571
2572
/* Store in CODES[0] and CODES[1] the leading byte(s) that introduce a
   character of the charset whose emacs-mule id is ID.

   An ID below 0xA0 is itself the single leading byte, and CODES[1] is
   set to 0 so that the caller can tell no second byte is needed.
   Otherwise CODES[0] is a prefix byte selected by the range ID falls
   in (0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for 0xF0..0xF4,
   0x9D for anything larger) and CODES[1] is ID.

   The expansion is a single statement without a trailing semicolon,
   so it may be used as the body of an unbraced `if' that has an
   `else'.  Both arguments are parenthesized in the expansion, so an
   expression such as `a | b' is compared as a whole.  ID is evaluated
   more than once; do not pass an expression with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2586
2587
2588 static bool
2589 encode_coding_emacs_mule (struct coding_system *coding)
2590 {
2591   bool multibytep = coding->dst_multibyte;
2592   int *charbuf = coding->charbuf;
2593   int *charbuf_end = charbuf + coding->charbuf_used;
2594   unsigned char *dst = coding->destination + coding->produced;
2595   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2596   int safe_room = 8;
2597   ptrdiff_t produced_chars = 0;
2598   Lisp_Object attrs, charset_list;
2599   int c;
2600   int preferred_charset_id = -1;
2601
2602   CODING_GET_INFO (coding, attrs, charset_list);
2603   if (! EQ (charset_list, Vemacs_mule_charset_list))
2604     {
2605       charset_list = Vemacs_mule_charset_list;
2606       ASET (attrs, coding_attr_charset_list, charset_list);
2607     }
2608
2609   while (charbuf < charbuf_end)
2610     {
2611       ASSURE_DESTINATION (safe_room);
2612       c = *charbuf++;
2613
2614       if (c < 0)
2615         {
2616           /* Handle an annotation.  */
2617           switch (*charbuf)
2618             {
2619             case CODING_ANNOTATE_COMPOSITION_MASK:
2620               /* Not yet implemented.  */
2621               break;
2622             case CODING_ANNOTATE_CHARSET_MASK:
2623               preferred_charset_id = charbuf[3];
2624               if (preferred_charset_id >= 0
2625                   && NILP (Fmemq (make_number (preferred_charset_id),
2626                                   charset_list)))
2627                 preferred_charset_id = -1;
2628               break;
2629             default:
2630               emacs_abort ();
2631             }
2632           charbuf += -c - 1;
2633           continue;
2634         }
2635
2636       if (ASCII_CHAR_P (c))
2637         EMIT_ONE_ASCII_BYTE (c);
2638       else if (CHAR_BYTE8_P (c))
2639         {
2640           c = CHAR_TO_BYTE8 (c);
2641           EMIT_ONE_BYTE (c);
2642         }
2643       else
2644         {
2645           struct charset *charset;
2646           unsigned code;
2647           int dimension;
2648           int emacs_mule_id;
2649           unsigned char leading_codes[2];
2650
2651           if (preferred_charset_id >= 0)
2652             {
2653               bool result;
2654
2655               charset = CHARSET_FROM_ID (preferred_charset_id);
2656               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2657               if (result)
2658                 code = ENCODE_CHAR (charset, c);
2659               else
2660                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2661                                      &code, charset);
2662             }
2663           else
2664             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2665                                  &code, charset);
2666           if (! charset)
2667             {
2668               c = coding->default_char;
2669               if (ASCII_CHAR_P (c))
2670                 {
2671                   EMIT_ONE_ASCII_BYTE (c);
2672                   continue;
2673                 }
2674               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2675                                    &code, charset);
2676             }
2677           dimension = CHARSET_DIMENSION (charset);
2678           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2679           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2680           EMIT_ONE_BYTE (leading_codes[0]);
2681           if (leading_codes[1])
2682             EMIT_ONE_BYTE (leading_codes[1]);
2683           if (dimension == 1)
2684             EMIT_ONE_BYTE (code | 0x80);
2685           else
2686             {
2687               code |= 0x8080;
2688               EMIT_ONE_BYTE (code >> 8);
2689               EMIT_ONE_BYTE (code & 0xFF);
2690             }
2691         }
2692     }
2693   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2694   coding->produced_char += produced_chars;
2695   coding->produced = dst - coding->destination;
2696   return 0;
2697 }
2698
2699 \f
2700 /*** 7. ISO2022 handlers ***/
2701
2702 /* The following note describes the coding system ISO2022 briefly.
2703    Since the intention of this note is to help understand the
2704    functions in this file, some parts are NOT ACCURATE or are OVERLY
2705    SIMPLIFIED.  For thorough understanding, please refer to the
2706    original document of ISO2022.  This is equivalent to the standard
2707    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2708
2709    ISO2022 provides many mechanisms to encode several character sets
2710    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2711    is encoded using bytes less than 128.  This may make the encoded
2712    text a little bit longer, but the text passes more easily through
2713    several types of gateway, some of which strip off the MSB (Most
2714    Significant Bit).
2715
2716    There are two kinds of character sets: control character sets and
2717    graphic character sets.  The former contain control characters such
2718    as `newline' and `escape' to provide control functions (control
2719    functions are also provided by escape sequences).  The latter
2720    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2721    two control character sets and many graphic character sets.
2722
2723    Graphic character sets are classified into one of the following
2724    four classes, according to the number of bytes (DIMENSION) and
2725    number of characters in one dimension (CHARS) of the set:
2726    - DIMENSION1_CHARS94
2727    - DIMENSION1_CHARS96
2728    - DIMENSION2_CHARS94
2729    - DIMENSION2_CHARS96
2730
2731    In addition, each character set is assigned an identification tag,
2732    unique for each set, called the "final character" (denoted as <F>
2733    hereafter).  The <F> of each character set is decided by ECMA(*)
2734    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2735    (0x30..0x3F are for private use only).
2736
2737    Note (*): ECMA = European Computer Manufacturers Association
2738
2739    Here are examples of graphic character sets [NAME(<F>)]:
2740         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2741         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2742         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2743         o DIMENSION2_CHARS96 -- none for the moment
2744
2745    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2746         C0 [0x00..0x1F] -- control character plane 0
2747         GL [0x20..0x7F] -- graphic character plane 0
2748         C1 [0x80..0x9F] -- control character plane 1
2749         GR [0xA0..0xFF] -- graphic character plane 1
2750
2751    A control character set is directly designated and invoked to C0 or
2752    C1 by an escape sequence.  The most common case is that:
2753    - ISO646's  control character set is designated/invoked to C0, and
2754    - ISO6429's control character set is designated/invoked to C1,
2755    and usually these designations/invocations are omitted in encoded
2756    text.  In a 7-bit environment, only C0 can be used, and a control
2757    character for C1 is encoded by an appropriate escape sequence to
2758    fit into the environment.  All control characters for C1 are
2759    defined to have corresponding escape sequences.
2760
2761    A graphic character set is at first designated to one of four
2762    graphic registers (G0 through G3), then these graphic registers are
2763    invoked to GL or GR.  These designations and invocations can be
2764    done independently.  The most common case is that G0 is invoked to
2765    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2766    these invocations and designations are omitted in encoded text.
2767    In a 7-bit environment, only GL can be used.
2768
2769    When a graphic character set of CHARS94 is invoked to GL, codes
2770    0x20 and 0x7F of the GL area work as control characters SPACE and
2771    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2772    be used.
2773
2774    There are two ways of invocation: locking-shift and single-shift.
2775    With locking-shift, the invocation lasts until the next different
2776    invocation, whereas with single-shift, the invocation affects the
2777    following character only and doesn't affect the locking-shift
2778    state.  Invocations are done by the following control characters or
2779    escape sequences:
2780
2781    ----------------------------------------------------------------------
2782    abbrev  function                  cntrl escape seq   description
2783    ----------------------------------------------------------------------
2784    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2785    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2786    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2787    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2788    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2789    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2790    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2791    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2792    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2793    ----------------------------------------------------------------------
2794    (*) These are not used by any known coding system.
2795
2796    Control characters for these functions are defined by macros
2797    ISO_CODE_XXX in `coding.h'.
2798
2799    Designations are done by the following escape sequences:
2800    ----------------------------------------------------------------------
2801    escape sequence      description
2802    ----------------------------------------------------------------------
2803    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2804    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2805    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2806    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2807    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2808    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2809    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2810    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2811    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2812    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2813    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2814    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2815    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2816    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2817    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2818    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2819    ----------------------------------------------------------------------
2820
2821    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2822    of dimension 1, chars 94, and final character <F>, etc...
2823
2824    Note (*): Although these designations are not allowed in ISO2022,
2825    Emacs accepts them on decoding, and produces them on encoding
2826    CHARS96 character sets in a coding system which is characterized as
2827    7-bit environment, non-locking-shift, and non-single-shift.
2828
2829    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2830    '(' must be omitted.  We refer to this as "short-form" hereafter.
2831
2832    Now you may notice that there are a lot of ways of encoding the
2833    same multilingual text in ISO2022.  Actually, there exist many
2834    coding systems such as Compound Text (used in X11's inter-client
2835    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2836    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2837    localized platforms), and all of these are variants of ISO2022.
2838
2839    In addition to the above, Emacs handles two more kinds of escape
2840    sequences: ISO6429's direction specification and Emacs' private
2841    sequence for specifying character composition.
2842
2843    ISO6429's direction specification takes the following form:
2844         o CSI ']'      -- end of the current direction
2845         o CSI '0' ']'  -- end of the current direction
2846         o CSI '1' ']'  -- start of left-to-right text
2847         o CSI '2' ']'  -- start of right-to-left text
2848    The control character CSI (0x9B: control sequence introducer) is
2849    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2850
2851    Character composition specification takes the following form:
2852         o ESC '0' -- start relative composition
2853         o ESC '1' -- end composition
2854         o ESC '2' -- start rule-base composition (*)
2855         o ESC '3' -- start relative composition with alternate chars  (**)
2856         o ESC '4' -- start rule-base composition with alternate chars  (**)
2857   Since these are not standard escape sequences of any ISO standard,
2858   the use of them with these meanings is restricted to Emacs only.
2859
2860   (*) This form is used only in Emacs 20.7 and older versions,
2861   but newer versions can safely decode it.
2862   (**) This form is used only in Emacs 21.1 and newer versions,
2863   and older versions can't decode it.
2864
2865   Here's a list of example usages of these composition escape
2866   sequences (categorized by `enum composition_method').
2867
2868   COMPOSITION_RELATIVE:
2869         ESC 0 CHAR [ CHAR ] ESC 1
2870   COMPOSITION_WITH_RULE:
2871         ESC 2 CHAR [ RULE CHAR ] ESC 1
2872   COMPOSITION_WITH_ALTCHARS:
2873         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2874   COMPOSITION_WITH_RULE_ALTCHARS:
2875         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2876
/* Table of 256 `enum iso_code_class_type' values.
   NOTE(review): presumably indexed by byte value to classify each code
   for the ISO 2022 handlers -- confirm against the code that fills and
   reads it.  */
2877 static enum iso_code_class_type iso_code_class[256];
2878
/* Return true if the charset whose id is ID is safe for CODING, i.e.
   ID lies within 0..CODING->max_charset_id and the byte for ID in
   CODING->safe_charsets is not 255, the value used for charsets that
   are not supported.

   The lower-bound test matters: a caller may pass an ID taken straight
   from a designation table lookup, which is negative when no charset
   is registered for the final byte; without the test such an ID would
   index before the start of safe_charsets.  ID is evaluated more than
   once; do not pass an expression with side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2882
2883 static void
2884 setup_iso_safe_charsets (Lisp_Object attrs)
2885 {
2886   Lisp_Object charset_list, safe_charsets;
2887   Lisp_Object request;
2888   Lisp_Object reg_usage;
2889   Lisp_Object tail;
2890   EMACS_INT reg94, reg96;
2891   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2892   int max_charset_id;
2893
2894   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2895   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2896       && ! EQ (charset_list, Viso_2022_charset_list))
2897     {
2898       charset_list = Viso_2022_charset_list;
2899       ASET (attrs, coding_attr_charset_list, charset_list);
2900       ASET (attrs, coding_attr_safe_charsets, Qnil);
2901     }
2902
2903   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2904     return;
2905
2906   max_charset_id = 0;
2907   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2908     {
2909       int id = XINT (XCAR (tail));
2910       if (max_charset_id < id)
2911         max_charset_id = id;
2912     }
2913
2914   safe_charsets = make_uninit_string (max_charset_id + 1);
2915   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2916   request = AREF (attrs, coding_attr_iso_request);
2917   reg_usage = AREF (attrs, coding_attr_iso_usage);
2918   reg94 = XINT (XCAR (reg_usage));
2919   reg96 = XINT (XCDR (reg_usage));
2920
2921   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2922     {
2923       Lisp_Object id;
2924       Lisp_Object reg;
2925       struct charset *charset;
2926
2927       id = XCAR (tail);
2928       charset = CHARSET_FROM_ID (XINT (id));
2929       reg = Fcdr (Fassq (id, request));
2930       if (! NILP (reg))
2931         SSET (safe_charsets, XINT (id), XINT (reg));
2932       else if (charset->iso_chars_96)
2933         {
2934           if (reg96 < 4)
2935             SSET (safe_charsets, XINT (id), reg96);
2936         }
2937       else
2938         {
2939           if (reg94 < 4)
2940             SSET (safe_charsets, XINT (id), reg94);
2941         }
2942     }
2943   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2944 }
2945
2946
2947 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2948    Return true if a text is encoded in one of ISO-2022 based coding
2949    systems.  */
2950
2951 static bool
2952 detect_coding_iso_2022 (struct coding_system *coding,
2953                         struct coding_detection_info *detect_info)
2954 {
2955   const unsigned char *src = coding->source, *src_base = src;
2956   const unsigned char *src_end = coding->source + coding->src_bytes;
2957   bool multibytep = coding->src_multibyte;
2958   bool single_shifting = 0;
2959   int id;
2960   int c, c1;
2961   ptrdiff_t consumed_chars = 0;
2962   int i;
2963   int rejected = 0;
2964   int found = 0;
2965   int composition_count = -1;
2966
2967   detect_info->checked |= CATEGORY_MASK_ISO;
2968
2969   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2970     {
2971       struct coding_system *this = &(coding_categories[i]);
2972       Lisp_Object attrs, val;
2973
2974       if (this->id < 0)
2975         continue;
2976       attrs = CODING_ID_ATTRS (this->id);
2977       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2978           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2979         setup_iso_safe_charsets (attrs);
2980       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2981       this->max_charset_id = SCHARS (val) - 1;
2982       this->safe_charsets = SDATA (val);
2983     }
2984
2985   /* A coding system of this category is always ASCII compatible.  */
2986   src += coding->head_ascii;
2987
2988   while (rejected != CATEGORY_MASK_ISO)
2989     {
2990       src_base = src;
2991       ONE_MORE_BYTE (c);
2992       switch (c)
2993         {
2994         case ISO_CODE_ESC:
2995           if (inhibit_iso_escape_detection)
2996             break;
2997           single_shifting = 0;
2998           ONE_MORE_BYTE (c);
2999           if (c == 'N' || c == 'O')
3000             {
3001               /* ESC <Fe> for SS2 or SS3.  */
3002               single_shifting = 1;
3003               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3004             }
3005           else if (c == '1')
3006             {
3007               /* End of composition.  */
3008               if (composition_count < 0
3009                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3010                 /* Invalid */
3011                 break;
3012               composition_count = -1;
3013               found |= CATEGORY_MASK_ISO;
3014             }
3015           else if (c >= '0' && c <= '4')
3016             {
3017               /* ESC <Fp> for start/end composition.  */
3018               composition_count = 0;
3019             }
3020           else
3021             {
3022               if (c >= '(' && c <= '/')
3023                 {
3024                   /* Designation sequence for a charset of dimension 1.  */
3025                   ONE_MORE_BYTE (c1);
3026                   if (c1 < ' ' || c1 >= 0x80
3027                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3028                     {
3029                       /* Invalid designation sequence.  Just ignore.  */
3030                       if (c1 >= 0x80)
3031                         rejected |= (CATEGORY_MASK_ISO_7BIT
3032                                      | CATEGORY_MASK_ISO_7_ELSE);
3033                       break;
3034                     }
3035                 }
3036               else if (c == '$')
3037                 {
3038                   /* Designation sequence for a charset of dimension 2.  */
3039                   ONE_MORE_BYTE (c);
3040                   if (c >= '@' && c <= 'B')
3041                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3042                     id = iso_charset_table[1][0][c];
3043                   else if (c >= '(' && c <= '/')
3044                     {
3045                       ONE_MORE_BYTE (c1);
3046                       if (c1 < ' ' || c1 >= 0x80
3047                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3048                         {
3049                           /* Invalid designation sequence.  Just ignore.  */
3050                           if (c1 >= 0x80)
3051                             rejected |= (CATEGORY_MASK_ISO_7BIT
3052                                          | CATEGORY_MASK_ISO_7_ELSE);
3053                           break;
3054                         }
3055                     }
3056                   else
3057                     {
3058                       /* Invalid designation sequence.  Just ignore it.  */
3059                       if (c >= 0x80)
3060                         rejected |= (CATEGORY_MASK_ISO_7BIT
3061                                      | CATEGORY_MASK_ISO_7_ELSE);
3062                       break;
3063                     }
3064                 }
3065               else
3066                 {
3067                   /* Invalid escape sequence.  Just ignore it.  */
3068                   if (c >= 0x80)
3069                     rejected |= (CATEGORY_MASK_ISO_7BIT
3070                                  | CATEGORY_MASK_ISO_7_ELSE);
3071                   break;
3072                 }
3073
3074               /* We found a valid designation sequence for CHARSET.  */
3075               rejected |= CATEGORY_MASK_ISO_8BIT;
3076               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3077                                   id))
3078                 found |= CATEGORY_MASK_ISO_7;
3079               else
3080                 rejected |= CATEGORY_MASK_ISO_7;
3081               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3082                                   id))
3083                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3084               else
3085                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3086               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3087                                   id))
3088                 found |= CATEGORY_MASK_ISO_7_ELSE;
3089               else
3090                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3091               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3092                                   id))
3093                 found |= CATEGORY_MASK_ISO_8_ELSE;
3094               else
3095                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3096             }
3097           break;
3098
3099         case ISO_CODE_SO:
3100         case ISO_CODE_SI:
3101           /* Locking shift out/in.  */
3102           if (inhibit_iso_escape_detection)
3103             break;
3104           single_shifting = 0;
3105           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3106           break;
3107
3108         case ISO_CODE_CSI:
3109           /* Control sequence introducer.  */
3110           single_shifting = 0;
3111           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3112           found |= CATEGORY_MASK_ISO_8_ELSE;
3113           goto check_extra_latin;
3114
3115         case ISO_CODE_SS2:
3116         case ISO_CODE_SS3:
3117           /* Single shift.   */
3118           if (inhibit_iso_escape_detection)
3119             break;
3120           single_shifting = 0;
3121           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3122           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3123               & CODING_ISO_FLAG_SINGLE_SHIFT)
3124             {
3125               found |= CATEGORY_MASK_ISO_8_1;
3126               single_shifting = 1;
3127             }
3128           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3129               & CODING_ISO_FLAG_SINGLE_SHIFT)
3130             {
3131               found |= CATEGORY_MASK_ISO_8_2;
3132               single_shifting = 1;
3133             }
3134           if (single_shifting)
3135             break;
3136           goto check_extra_latin;
3137
3138         default:
3139           if (c < 0)
3140             continue;
3141           if (c < 0x80)
3142             {
3143               if (composition_count >= 0)
3144                 composition_count++;
3145               single_shifting = 0;
3146               break;
3147             }
3148           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3149           if (c >= 0xA0)
3150             {
3151               found |= CATEGORY_MASK_ISO_8_1;
3152               /* Check the length of succeeding codes of the range
3153                  0xA0..0FF.  If the byte length is even, we include
3154                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3155                  only when we are not single shifting.  */
3156               if (! single_shifting
3157                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3158                 {
3159                   ptrdiff_t len = 1;
3160                   while (src < src_end)
3161                     {
3162                       src_base = src;
3163                       ONE_MORE_BYTE (c);
3164                       if (c < 0xA0)
3165                         {
3166                           src = src_base;
3167                           break;
3168                         }
3169                       len++;
3170                     }
3171
3172                   if (len & 1 && src < src_end)
3173                     {
3174                       rejected |= CATEGORY_MASK_ISO_8_2;
3175                       if (composition_count >= 0)
3176                         composition_count += len;
3177                     }
3178                   else
3179                     {
3180                       found |= CATEGORY_MASK_ISO_8_2;
3181                       if (composition_count >= 0)
3182                         composition_count += len / 2;
3183                     }
3184                 }
3185               break;
3186             }
3187         check_extra_latin:
3188           if (! VECTORP (Vlatin_extra_code_table)
3189               || NILP (AREF (Vlatin_extra_code_table, c)))
3190             {
3191               rejected = CATEGORY_MASK_ISO;
3192               break;
3193             }
3194           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3195               & CODING_ISO_FLAG_LATIN_EXTRA)
3196             found |= CATEGORY_MASK_ISO_8_1;
3197           else
3198             rejected |= CATEGORY_MASK_ISO_8_1;
3199           rejected |= CATEGORY_MASK_ISO_8_2;
3200           break;
3201         }
3202     }
3203   detect_info->rejected |= CATEGORY_MASK_ISO;
3204   return 0;
3205
3206  no_more_source:
3207   detect_info->rejected |= rejected;
3208   detect_info->found |= (found & ~rejected);
3209   return 1;
3210 }
3211
3212
/* Record in CODING the designation, to graphic register REG, of the
   charset that ISO_CHARSET_TABLE gives for dimension DIM, CHARS_96 and
   final byte FINAL.  FINAL is evaluated exactly once.

   CHARS_96 must be an lvalue.  It is set to -1, which tells the caller
   that the escape sequence should be kept, in two cases:

     o FINAL is outside '0'..127, or it selects no charset, or the
       charset fails SAFE_CHARSET_P for CODING.  REG is then marked
       with the invalid designation -2 and nothing else is changed.
     o REG held the invalid designation -2 and is now designated
       ASCII.

   JISX0201-Roman is designated as ASCII when CODING has
   CODING_ISO_FLAG_USE_ROMAN, and JISX0208-1978 as JISX0208 when it
   has CODING_ISO_FLAG_USE_OLDJIS.  The enclosing scope must provide
   `coding'.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    /* Capture FINAL once so that any expression is a safe argument.  */ \
    int desig_final_ = (final);                                         \
    int desig_id_, desig_prev_;                                         \
                                                                        \
    if (desig_final_ < '0' || desig_final_ >= 128                       \
        || ((desig_id_                                                  \
             = ISO_CHARSET_TABLE (dim, chars_96, desig_final_)) < 0)    \
        || !SAFE_CHARSET_P (coding, desig_id_))                         \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        (chars_96) = -1;                                                \
        break;                                                          \
      }                                                                 \
    desig_prev_ = CODING_ISO_DESIGNATION (coding, reg);                 \
    if (desig_id_ == charset_jisx0201_roman)                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          desig_id_ = charset_ascii;                                    \
      }                                                                 \
    else if (desig_id_ == charset_jisx0208_1978)                        \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          desig_id_ = charset_jisx0208;                                 \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = desig_id_;                   \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (desig_prev_ == -2 && desig_id_ == charset_ascii)                \
      (chars_96) = -1;                                                  \
  } while (0)
3245
3246
/* Handle these composition sequences (ALT: alternate char):
3248
3249    (1) relative composition: ESC 0 CHAR ... ESC 1
3250    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3251    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3252    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3253
3254    When the start sequence (ESC 0/2/3/4) is found, this annotation
3255    header is produced.
3256
3257         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3258
3259    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3260    produced until the end sequence (ESC 1) is found:
3261
3262    (1) CHAR ... CHAR
3263    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3264    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3265    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3266
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3269
3270    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3271    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3272    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3273    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3274
   If an error is found while composing, the annotation header is
   changed to:

        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3279
3280    and the sequence [ -2 DECODED-RULE ] is changed to the original
3281    byte sequence as below:
3282         o the original byte sequence is B: [ B -1 ]
3283         o the original byte sequence is B1 B2: [ B1 B2 ]
3284    and the sequence [ -1 -1 ] is changed to the original byte
3285    sequence:
3286         [ ESC '0' ]
3287 */
3288
/* Turn the composition rule that starts with byte C1 into its encoded
   form and leave the result in RULE (an lvalue).

   C1 - 32 selects the format.  Values 0..80 are the old one-byte
   format (before ver.21): the quotient and the remainder of a
   division by 9 form the pair given to COMPOSITION_ENCODE_RULE, with
   the value 4 remapped to 10.  Larger values are the new two-byte
   format (after ver.21): one more byte is fetched with ONE_MORE_BYTE,
   and 0x100 is added to the result so that it can be told apart from
   an old-format rule.

   Jump to invalid_code if C1 is below 32 or if the two-byte form
   fails COMPOSITION_ENCODE_RULE_VALID.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int byte2_;                                                     \
                                                                        \
        ONE_MORE_BYTE (byte2_);                                         \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, byte2_ - 32))   \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, byte2_ - 32);        \
        /* Distinguish it from the old format.  */                      \
        rule += 0x100;                                                  \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int ref1_ = (rule) / 9 == 4 ? 10 : (rule) / 9;                  \
        int ref2_ = (rule) % 9 == 4 ? 10 : (rule) % 9;                  \
                                                                        \
        rule = COMPOSITION_ENCODE_RULE (ref1_, ref2_);                  \
      }                                                                 \
  } while (0)
3317
/* Turn the decoded composition rule RULE back into the byte form it
   had in the source, storing the result in charbuf[idx] and
   charbuf[idx + 1]:

     o RULE < 0x100 (old format): [ B -1 ], one byte; new_chars is
       incremented by 1.
     o otherwise (new format): [ B1 B2 ], two bytes; new_chars is
       incremented by 2.

   RULE is evaluated exactly once, before anything is stored, so it
   may refer to charbuf[idx + 1] itself.  The enclosing scope must
   provide `charbuf', `idx' and `new_chars'.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_ = (rule);                                         \
    int gref = (rule_ % 0x100) / 12;                            \
    int nref = (rule_ % 0x100) % 12;                            \
                                                                \
    if (rule_ < 0x100)          /* old format */                \
      {                                                         \
        /* Undo the 4 -> 10 remapping done when decoding.  */   \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                                /* new format */        \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3337
3338 /* Finish the current composition as invalid.  */
3339
3340 static int
3341 finish_composition (int *charbuf, struct composition_status *cmp_status)
3342 {
3343   int idx = - cmp_status->length;
3344   int new_chars;
3345
3346   /* Recover the original ESC sequence */
3347   charbuf[idx++] = ISO_CODE_ESC;
3348   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3349                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3350                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3351                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3352                     : '4');
3353   charbuf[idx++] = -2;
3354   charbuf[idx++] = 0;
3355   charbuf[idx++] = -1;
3356   new_chars = cmp_status->nchars;
3357   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3358     for (; idx < 0; idx++)
3359       {
3360         int elt = charbuf[idx];
3361
3362         if (elt == -2)
3363           {
3364             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3365             idx++;
3366           }
3367         else if (elt == -1)
3368           {
3369             charbuf[idx++] = ISO_CODE_ESC;
3370             charbuf[idx] = '0';
3371             new_chars += 2;
3372           }
3373       }
3374   cmp_status->state = COMPOSING_NO;
3375   return new_chars;
3376 }
3377
/* Abort any composition still in progress: when cmp_status is in a
   state other than COMPOSING_NO, call finish_composition on it and
   add the value it returns to char_offset.  Otherwise do nothing.
   The enclosing scope must provide `cmp_status', `charbuf' and
   `char_offset'.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3384
/* Handle composition start sequence ESC C1, where C1 is '0', '2',
   '3', or '4'.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   ESC 0 met in the middle of an ESC 3 composition (state
   COMPOSING_COMPONENT_CHAR) or of an ESC 4 composition (state
   COMPOSING_COMPONENT_RULE) does not start a new composition: it
   stores the marker [ -1 -1 ] and switches the state to
   COMPOSING_CHAR.

   In every other case a composition still in progress is first
   finished as invalid, and this annotation sequence is produced now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   C1 is evaluated exactly once.  The enclosing scope must provide
   `charbuf', `cmp_status', `coding' and whatever
   MAYBE_FINISH_COMPOSITION needs.
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    /* Capture C1 once so that any expression is a safe argument.  */      \
    int start_byte_ = (c1);                                                \
                                                                           \
    if (start_byte_ == '0'                                                 \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method                                                 \
          = (start_byte_ == '0' ? COMPOSITION_RELATIVE                     \
             : start_byte_ == '2' ? COMPOSITION_WITH_RULE                  \
             : start_byte_ == '3' ? COMPOSITION_WITH_ALTCHARS              \
             : COMPOSITION_WITH_RULE_ALTCHARS);                            \
        /* ESC 3 and ESC 4 begin with the alternate chars.  */             \
        cmp_status->state = (start_byte_ <= '2'                            \
                             ? COMPOSING_CHAR                              \
                             : COMPOSING_COMPONENT_CHAR);                  \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3425
3426
/* Handle composition end sequence ESC 1.

   The end is accepted only when at least one composed char has been
   stored and the state fits the method: for COMPOSITION_WITH_RULE the
   state must be anything but COMPOSING_CHAR, for every other method
   it must be COMPOSING_CHAR.  On acceptance the annotation header,
   found cmp_status->length elements before charbuf, is completed:
   its LENGTH slot grows by the number of components + 2
   (COMPOSITION_WITH_ALTCHARS) or by three times that number
   (COMPOSITION_WITH_RULE_ALTCHARS), and its NCHARS slot receives
   cmp_status->nchars.  char_offset advances by the same count and
   the state returns to COMPOSING_NO.

   Otherwise, finish the composition as invalid and jump to
   invalid_code.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    if (cmp_status->nchars != 0                                         \
        && ((cmp_status->state != COMPOSING_CHAR)                       \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        int *cmp_header_ = charbuf - cmp_status->length;                \
                                                                        \
        /* LENGTH is stored negated, hence the subtraction.  */         \
        switch (cmp_status->method)                                     \
          {                                                             \
          case COMPOSITION_WITH_ALTCHARS:                               \
            cmp_header_[0] -= cmp_status->ncomps + 2;                   \
            break;                                                      \
          case COMPOSITION_WITH_RULE_ALTCHARS:                          \
            cmp_header_[0] -= cmp_status->ncomps * 3;                   \
            break;                                                      \
          default:                                                      \
            break;                                                      \
          }                                                             \
        cmp_header_[2] = cmp_status->nchars;                            \
        char_offset += cmp_status->nchars;                              \
        cmp_status->state = COMPOSING_NO;                               \
      }                                                                 \
    else                                                                \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
  } while (0)
3446
/* Append the pair [ -2 RULE ] to charbuf and account for it in
   cmp_status: the length grows by two and the state steps back by
   one, which undoes the increment STORE_COMPOSITION_CHAR applies
   after a character that must be followed by a rule.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3456
/* Append C, a composed char or a component char, to charbuf and
   account for it in cmp_status.  The length grows by one.  C counts
   as a composed char (nchars) in state COMPOSING_CHAR and as a
   component (ncomps) in any other state.  The state then advances by
   one when the method calls for a rule after this character: always
   for COMPOSITION_WITH_RULE, and for COMPOSITION_WITH_RULE_ALTCHARS
   only in state COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    charbuf[0] = (c);                                                   \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_RULE:                                       \
        cmp_status->state++;                                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        if (cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
          cmp_status->state++;                                          \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
  } while (0)
3473
3474
3475 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3476
3477 static void
3478 decode_coding_iso_2022 (struct coding_system *coding)
3479 {
3480   const unsigned char *src = coding->source + coding->consumed;
3481   const unsigned char *src_end = coding->source + coding->src_bytes;
3482   const unsigned char *src_base;
3483   int *charbuf = coding->charbuf + coding->charbuf_used;
3484   /* We may produce two annotations (charset and composition) in one
3485      loop and one more charset annotation at the end.  */
3486   int *charbuf_end
3487     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3488   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3489   bool multibytep = coding->src_multibyte;
3490   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3491   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3492   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3493   int charset_id_2, charset_id_3;
3494   struct charset *charset;
3495   int c;
3496   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3497   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3498   ptrdiff_t char_offset = coding->produced_char;
3499   ptrdiff_t last_offset = char_offset;
3500   int last_id = charset_ascii;
3501   bool eol_dos
3502     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3503   int byte_after_cr = -1;
3504   int i;
3505
3506   setup_iso_safe_charsets (attrs);
3507   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3508
3509   if (cmp_status->state != COMPOSING_NO)
3510     {
3511       if (charbuf_end - charbuf < cmp_status->length)
3512         emacs_abort ();
3513       for (i = 0; i < cmp_status->length; i++)
3514         *charbuf++ = cmp_status->carryover[i];
3515       coding->annotated = 1;
3516     }
3517
3518   while (1)
3519     {
3520       int c1, c2, c3;
3521
3522       src_base = src;
3523       consumed_chars_base = consumed_chars;
3524
3525       if (charbuf >= charbuf_end)
3526         {
3527           if (byte_after_cr >= 0)
3528             src_base--;
3529           break;
3530         }
3531
3532       if (byte_after_cr >= 0)
3533         c1 = byte_after_cr, byte_after_cr = -1;
3534       else
3535         ONE_MORE_BYTE (c1);
3536       if (c1 < 0)
3537         goto invalid_code;
3538
3539       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3540         {
3541           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3542           char_offset++;
3543           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3544           continue;
3545         }
3546
3547       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3548         {
3549           if (c1 == ISO_CODE_ESC)
3550             {
3551               if (src + 1 >= src_end)
3552                 goto no_more_source;
3553               *charbuf++ = ISO_CODE_ESC;
3554               char_offset++;
3555               if (src[0] == '%' && src[1] == '@')
3556                 {
3557                   src += 2;
3558                   consumed_chars += 2;
3559                   char_offset += 2;
3560                   /* We are sure charbuf can contain two more chars. */
3561                   *charbuf++ = '%';
3562                   *charbuf++ = '@';
3563                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3564                 }
3565             }
3566           else
3567             {
3568               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3569               char_offset++;
3570             }
3571           continue;
3572         }
3573
3574       if ((cmp_status->state == COMPOSING_RULE
3575            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3576           && c1 != ISO_CODE_ESC)
3577         {
3578           int rule;
3579
3580           DECODE_COMPOSITION_RULE (rule);
3581           STORE_COMPOSITION_RULE (rule);
3582           continue;
3583         }
3584
3585       /* We produce at most one character.  */
3586       switch (iso_code_class [c1])
3587         {
3588         case ISO_0x20_or_0x7F:
3589           if (charset_id_0 < 0
3590               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3591             /* This is SPACE or DEL.  */
3592             charset = CHARSET_FROM_ID (charset_ascii);
3593           else
3594             charset = CHARSET_FROM_ID (charset_id_0);
3595           break;
3596
3597         case ISO_graphic_plane_0:
3598           if (charset_id_0 < 0)
3599             charset = CHARSET_FROM_ID (charset_ascii);
3600           else
3601             charset = CHARSET_FROM_ID (charset_id_0);
3602           break;
3603
3604         case ISO_0xA0_or_0xFF:
3605           if (charset_id_1 < 0
3606               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3607               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3608             goto invalid_code;
3609           /* This is a graphic character, we fall down ... */
3610           FALLTHROUGH;
3611         case ISO_graphic_plane_1:
3612           if (charset_id_1 < 0)
3613             goto invalid_code;
3614           charset = CHARSET_FROM_ID (charset_id_1);
3615           break;
3616
3617         case ISO_control_0:
3618           if (eol_dos && c1 == '\r')
3619             ONE_MORE_BYTE (byte_after_cr);
3620           MAYBE_FINISH_COMPOSITION ();
3621           charset = CHARSET_FROM_ID (charset_ascii);
3622           break;
3623
3624         case ISO_control_1:
3625           goto invalid_code;
3626
3627         case ISO_shift_out:
3628           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3629               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3630             goto invalid_code;
3631           CODING_ISO_INVOCATION (coding, 0) = 1;
3632           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633           continue;
3634
3635         case ISO_shift_in:
3636           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3637             goto invalid_code;
3638           CODING_ISO_INVOCATION (coding, 0) = 0;
3639           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3640           continue;
3641
3642         case ISO_single_shift_2_7:
3643           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3644             goto invalid_code;
3645           FALLTHROUGH;
3646         case ISO_single_shift_2:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3648             goto invalid_code;
3649           /* SS2 is handled as an escape sequence of ESC 'N' */
3650           c1 = 'N';
3651           goto label_escape_sequence;
3652
3653         case ISO_single_shift_3:
3654           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3655             goto invalid_code;
          /* SS3 is handled as an escape sequence of ESC 'O' */
3657           c1 = 'O';
3658           goto label_escape_sequence;
3659
3660         case ISO_control_sequence_introducer:
3661           /* CSI is handled as an escape sequence of ESC '[' ...  */
3662           c1 = '[';
3663           goto label_escape_sequence;
3664
3665         case ISO_escape:
3666           ONE_MORE_BYTE (c1);
3667         label_escape_sequence:
3668           /* Escape sequences handled here are invocation,
3669              designation, direction specification, and character
3670              composition specification.  */
3671           switch (c1)
3672             {
3673             case '&':           /* revision of following character set */
3674               ONE_MORE_BYTE (c1);
3675               if (!(c1 >= '@' && c1 <= '~'))
3676                 goto invalid_code;
3677               ONE_MORE_BYTE (c1);
3678               if (c1 != ISO_CODE_ESC)
3679                 goto invalid_code;
3680               ONE_MORE_BYTE (c1);
3681               goto label_escape_sequence;
3682
3683             case '$':           /* designation of 2-byte character set */
3684               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3685                 goto invalid_code;
3686               {
3687                 int reg, chars96;
3688
3689                 ONE_MORE_BYTE (c1);
3690                 if (c1 >= '@' && c1 <= 'B')
3691                   {     /* designation of JISX0208.1978, GB2312.1980,
3692                            or JISX0208.1980 */
3693                     reg = 0, chars96 = 0;
3694                   }
3695                 else if (c1 >= 0x28 && c1 <= 0x2B)
3696                   { /* designation of DIMENSION2_CHARS94 character set */
3697                     reg = c1 - 0x28, chars96 = 0;
3698                     ONE_MORE_BYTE (c1);
3699                   }
3700                 else if (c1 >= 0x2C && c1 <= 0x2F)
3701                   { /* designation of DIMENSION2_CHARS96 character set */
3702                     reg = c1 - 0x2C, chars96 = 1;
3703                     ONE_MORE_BYTE (c1);
3704                   }
3705                 else
3706                   goto invalid_code;
3707                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3708                 /* We must update these variables now.  */
3709                 if (reg == 0)
3710                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3711                 else if (reg == 1)
3712                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3713                 if (chars96 < 0)
3714                   goto invalid_code;
3715               }
3716               continue;
3717
3718             case 'n':           /* invocation of locking-shift-2 */
3719               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3720                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3721                 goto invalid_code;
3722               CODING_ISO_INVOCATION (coding, 0) = 2;
3723               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3724               continue;
3725
3726             case 'o':           /* invocation of locking-shift-3 */
3727               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3728                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3729                 goto invalid_code;
3730               CODING_ISO_INVOCATION (coding, 0) = 3;
3731               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3732               continue;
3733
3734             case 'N':           /* invocation of single-shift-2 */
3735               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3736                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3737                 goto invalid_code;
3738               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3739               if (charset_id_2 < 0)
3740                 charset = CHARSET_FROM_ID (charset_ascii);
3741               else
3742                 charset = CHARSET_FROM_ID (charset_id_2);
3743               ONE_MORE_BYTE (c1);
3744               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3745                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3746                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3747                           ? c1 >= 0x80 : c1 < 0x80)))
3748                 goto invalid_code;
3749               break;
3750
3751             case 'O':           /* invocation of single-shift-3 */
3752               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3753                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3754                 goto invalid_code;
3755               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3756               if (charset_id_3 < 0)
3757                 charset = CHARSET_FROM_ID (charset_ascii);
3758               else
3759                 charset = CHARSET_FROM_ID (charset_id_3);
3760               ONE_MORE_BYTE (c1);
3761               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3762                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3763                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3764                           ? c1 >= 0x80 : c1 < 0x80)))
3765                 goto invalid_code;
3766               break;
3767
3768             case '0': case '2': case '3': case '4': /* start composition */
3769               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3770                 goto invalid_code;
3771               if (last_id != charset_ascii)
3772                 {
3773                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3774                   last_id = charset_ascii;
3775                   last_offset = char_offset;
3776                 }
3777               DECODE_COMPOSITION_START (c1);
3778               continue;
3779
3780             case '1':           /* end composition */
3781               if (cmp_status->state == COMPOSING_NO)
3782                 goto invalid_code;
3783               DECODE_COMPOSITION_END ();
3784               continue;
3785
3786             case '[':           /* specification of direction */
3787               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3788                 goto invalid_code;
3789               /* For the moment, nested direction is not supported.
3790                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3791                  left-to-right, and nonzero means right-to-left.  */
3792               ONE_MORE_BYTE (c1);
3793               switch (c1)
3794                 {
3795                 case ']':       /* end of the current direction */
3796                   coding->mode &= ~CODING_MODE_DIRECTION;
3797                   break;
3798
3799                 case '0':       /* end of the current direction */
3800                 case '1':       /* start of left-to-right direction */
3801                   ONE_MORE_BYTE (c1);
3802                   if (c1 == ']')
3803                     coding->mode &= ~CODING_MODE_DIRECTION;
3804                   else
3805                     goto invalid_code;
3806                   break;
3807
3808                 case '2':       /* start of right-to-left direction */
3809                   ONE_MORE_BYTE (c1);
3810                   if (c1 == ']')
3811                     coding->mode |= CODING_MODE_DIRECTION;
3812                   else
3813                     goto invalid_code;
3814                   break;
3815
3816                 default:
3817                   goto invalid_code;
3818                 }
3819               continue;
3820
3821             case '%':
3822               ONE_MORE_BYTE (c1);
3823               if (c1 == '/')
3824                 {
3825                   /* CTEXT extended segment:
3826                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3827                      We keep these bytes as is for the moment.
3828                      They may be decoded by post-read-conversion.  */
3829                   int dim, M, L;
3830                   int size;
3831
3832                   ONE_MORE_BYTE (dim);
3833                   if (dim < '0' || dim > '4')
3834                     goto invalid_code;
3835                   ONE_MORE_BYTE (M);
3836                   if (M < 128)
3837                     goto invalid_code;
3838                   ONE_MORE_BYTE (L);
3839                   if (L < 128)
3840                     goto invalid_code;
3841                   size = ((M - 128) * 128) + (L - 128);
3842                   if (charbuf + 6 > charbuf_end)
3843                     goto break_loop;
3844                   *charbuf++ = ISO_CODE_ESC;
3845                   *charbuf++ = '%';
3846                   *charbuf++ = '/';
3847                   *charbuf++ = dim;
3848                   *charbuf++ = BYTE8_TO_CHAR (M);
3849                   *charbuf++ = BYTE8_TO_CHAR (L);
3850                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3851                 }
3852               else if (c1 == 'G')
3853                 {
3854                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3855                      ESC % G --UTF-8-BYTES-- ESC % @
3856                      We keep these bytes as is for the moment.
3857                      They may be decoded by post-read-conversion.  */
3858                   if (charbuf + 3 > charbuf_end)
3859                     goto break_loop;
3860                   *charbuf++ = ISO_CODE_ESC;
3861                   *charbuf++ = '%';
3862                   *charbuf++ = 'G';
3863                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3864                 }
3865               else
3866                 goto invalid_code;
3867               continue;
3868               break;
3869
3870             default:
3871               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3872                 goto invalid_code;
3873               {
3874                 int reg, chars96;
3875
3876                 if (c1 >= 0x28 && c1 <= 0x2B)
3877                   { /* designation of DIMENSION1_CHARS94 character set */
3878                     reg = c1 - 0x28, chars96 = 0;
3879                     ONE_MORE_BYTE (c1);
3880                   }
3881                 else if (c1 >= 0x2C && c1 <= 0x2F)
3882                   { /* designation of DIMENSION1_CHARS96 character set */
3883                     reg = c1 - 0x2C, chars96 = 1;
3884                     ONE_MORE_BYTE (c1);
3885                   }
3886                 else
3887                   goto invalid_code;
3888                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3889                 /* We must update these variables now.  */
3890                 if (reg == 0)
3891                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3892                 else if (reg == 1)
3893                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3894                 if (chars96 < 0)
3895                   goto invalid_code;
3896               }
3897               continue;
3898             }
3899           break;
3900
3901         default:
3902           emacs_abort ();
3903         }
3904
3905       if (cmp_status->state == COMPOSING_NO
3906           && charset->id != charset_ascii
3907           && last_id != charset->id)
3908         {
3909           if (last_id != charset_ascii)
3910             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3911           last_id = charset->id;
3912           last_offset = char_offset;
3913         }
3914
3915       /* Now we know CHARSET and 1st position code C1 of a character.
3916          Produce a decoded character while getting 2nd and 3rd
3917          position codes C2, C3 if necessary.  */
3918       if (CHARSET_DIMENSION (charset) > 1)
3919         {
3920           ONE_MORE_BYTE (c2);
3921           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3922               || ((c1 & 0x80) != (c2 & 0x80)))
3923             /* C2 is not in a valid range.  */
3924             goto invalid_code;
3925           if (CHARSET_DIMENSION (charset) == 2)
3926             c1 = (c1 << 8) | c2;
3927           else
3928             {
3929               ONE_MORE_BYTE (c3);
3930               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3931                   || ((c1 & 0x80) != (c3 & 0x80)))
3932                 /* C3 is not in a valid range.  */
3933                 goto invalid_code;
3934               c1 = (c1 << 16) | (c2 << 8) | c2;
3935             }
3936         }
3937       c1 &= 0x7F7F7F;
3938       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3939       if (c < 0)
3940         {
3941           MAYBE_FINISH_COMPOSITION ();
3942           for (; src_base < src; src_base++, char_offset++)
3943             {
3944               if (ASCII_CHAR_P (*src_base))
3945                 *charbuf++ = *src_base;
3946               else
3947                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3948             }
3949         }
3950       else if (cmp_status->state == COMPOSING_NO)
3951         {
3952           *charbuf++ = c;
3953           char_offset++;
3954         }
3955       else if ((cmp_status->state == COMPOSING_CHAR
3956                 ? cmp_status->nchars
3957                 : cmp_status->ncomps)
3958                >= MAX_COMPOSITION_COMPONENTS)
3959         {
3960           /* Too long composition.  */
3961           MAYBE_FINISH_COMPOSITION ();
3962           *charbuf++ = c;
3963           char_offset++;
3964         }
3965       else
3966         STORE_COMPOSITION_CHAR (c);
3967       continue;
3968
3969     invalid_code:
3970       MAYBE_FINISH_COMPOSITION ();
3971       src = src_base;
3972       consumed_chars = consumed_chars_base;
3973       ONE_MORE_BYTE (c);
3974       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3975       char_offset++;
3976       /* Reset the invocation and designation status to the safest
3977          one; i.e. designate ASCII to the graphic register 0, and
3978          invoke that register to the graphic plane 0.  This typically
3979          helps the case that a designation sequence for ASCII "ESC (
3980          B" is somehow broken (e.g. broken by a newline).  */
3981       CODING_ISO_INVOCATION (coding, 0) = 0;
3982       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3983       charset_id_0 = charset_ascii;
3984       continue;
3985
3986     break_loop:
3987       break;
3988     }
3989
3990  no_more_source:
3991   if (cmp_status->state != COMPOSING_NO)
3992     {
3993       if (coding->mode & CODING_MODE_LAST_BLOCK)
3994         MAYBE_FINISH_COMPOSITION ();
3995       else
3996         {
3997           charbuf -= cmp_status->length;
3998           for (i = 0; i < cmp_status->length; i++)
3999             cmp_status->carryover[i] = charbuf[i];
4000         }
4001     }
4002   else if (last_id != charset_ascii)
4003     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4004   coding->consumed_char += consumed_chars_base;
4005   coding->consumed = src_base - coding->source;
4006   coding->charbuf_used = charbuf - coding->charbuf;
4007 }
4008
4009
4010 /* ISO2022 encoding stuff.  */
4011
4012 /*
4013    It is not enough to say just "ISO2022" on encoding, we have to
4014    specify more details.  In Emacs, each coding system of ISO2022
4015    variant has the following specifications:
4016         1. Initial designation to G0 thru G3.
4017         2. Allows short-form designation?
4018         3. ASCII should be designated to G0 before control characters?
4019         4. ASCII should be designated to G0 at end of line?
4020         5. 7-bit environment or 8-bit environment?
4021         6. Use locking-shift?
4022         7. Use Single-shift?
4023    And the following two are only for Japanese:
4024         8. Use ASCII in place of JIS0201-1976-Roman?
4025         9. Use JISX0208-1983 in place of JISX0208-1978?
4026    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4027    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4028    details.
4029 */
4030
/* Emit at DST (advancing it) the escape sequence that designates
   CHARSET to graphic register REG, then record that designation in
   CODING.

   If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision number, `ESC & <'@' + revision>' is emitted
   first.  For a multi-byte 94-charset designated to register 0 whose
   <final-char> is '@', 'A', or 'B', the intermediate character is
   omitted (short form) unless CODING_ISO_FLAG_LONG_FORM is set.

   This macro expands in place and relies on the EMIT_* macros.
   CHARSET, REG and CODING are evaluated more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_revision                                                  \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
    const char *desig_inter;                                            \
                                                                        \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        /* Announce the revision of the charset.  */                    \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    /* Intermediate characters, indexed by graphic register.  */        \
    desig_inter = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";     \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only the short form drops the intermediate character.  */    \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || (reg) != 0                                               \
            || desig_final < '@' || desig_final > 'B')                  \
          EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                       \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                           \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4078
4079
/* Producers for the ISO2022 single-shift functions.  Each emits
   either a two-byte escape sequence (when CODING_ISO_FLAG_SEVEN_BITS
   is set for the variable `coding' at the expansion site) or a single
   control byte, and then marks a single shift as pending in `coding'.  */

/* Single-shift-2: `ESC N' or ISO_CODE_SS2.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


/* Single-shift-3: `ESC O' or ISO_CODE_SS3.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4102
4103
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one records
   in `coding' (a variable at the expansion site) which graphic
   register is now invoked to graphic plane 0.  They rely on the
   EMIT_* macros being usable where they are expanded.  */

/* Shift-in: emit ISO_CODE_SI and record register 0 on plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out: emit ISO_CODE_SO and record register 1 on plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2: emit `ESC n' and record register 2 on plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3: emit `ESC o' and record register 3 on plane 0.
   ISO 2022 (ECMA-35) assigns `ESC n' to LS2 and `ESC o' to LS3, so
   the final byte here must be 'o'; emitting 'n' would make a reader
   invoke register 2 while this encoder believes register 3 is
   invoked.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4134
4135
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_ROMAN is set
   and CHARSET is ASCII, it is replaced by JISX0201-Roman.  The macro
   expands in place and uses the variables `coding', `dst' and
   `produced_chars' of the expansion site.  C1 is parenthesized at
   every use so that an expression argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift is pending; it covers this character only.  */ \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4178
4179
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing any designation and
   invocation codes that are still needed.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_OLDJIS is set
   and CHARSET is JISX0208, it is replaced by JISX0208-1978.  The
   macro expands in place and uses the variables `coding', `dst' and
   `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding)                           \
        && id != CODING_ISO_INVOKED_CHARSET (coding, 0)                 \
        && id != CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        /* CHARSET is not reachable through either graphic plane        \
           yet: invoke it (designating it first if needed), then go     \
           round the loop again to produce the character itself.  */    \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A pending single shift covers this character only.  */       \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    break;                                                              \
  } while (1)
4222
4223
/* Encode character C, which belongs to CHARSET: obtain its code in
   CHARSET with CODING_ENCODE_CHAR, then hand that code to the
   one-byte producer if CHARSET has dimension 1, or otherwise split it
   into two bytes for the two-byte producer.  CHARSET must be an
   lvalue because those producers may replace it.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,             \
                                         code & 0xFF);                     \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4234
4235
4236 /* Produce designation and invocation codes at a place pointed by DST
4237    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4238    Return new DST.  */
4239
4240 static unsigned char *
4241 encode_invocation_designation (struct charset *charset,
4242                                struct coding_system *coding,
4243                                unsigned char *dst, ptrdiff_t *p_nchars)
4244 {
4245   bool multibytep = coding->dst_multibyte;
4246   ptrdiff_t produced_chars = *p_nchars;
4247   int reg;                      /* graphic register number */
4248   int id = CHARSET_ID (charset);
4249
4250   /* At first, check designations.  */
4251   for (reg = 0; reg < 4; reg++)
4252     if (id == CODING_ISO_DESIGNATION (coding, reg))
4253       break;
4254
4255   if (reg >= 4)
4256     {
4257       /* CHARSET is not yet designated to any graphic registers.  */
4258       /* At first check the requested designation.  */
4259       reg = CODING_ISO_REQUEST (coding, id);
4260       if (reg < 0)
4261         /* Since CHARSET requests no special designation, designate it
4262            to graphic register 0.  */
4263         reg = 0;
4264
4265       ENCODE_DESIGNATION (charset, reg, coding);
4266     }
4267
4268   if (CODING_ISO_INVOCATION (coding, 0) != reg
4269       && CODING_ISO_INVOCATION (coding, 1) != reg)
4270     {
4271       /* Since the graphic register REG is not invoked to any graphic
4272          planes, invoke it to graphic plane 0.  */
4273       switch (reg)
4274         {
4275         case 0:                 /* graphic register 0 */
4276           ENCODE_SHIFT_IN;
4277           break;
4278
4279         case 1:                 /* graphic register 1 */
4280           ENCODE_SHIFT_OUT;
4281           break;
4282
4283         case 2:                 /* graphic register 2 */
4284           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4285             ENCODE_SINGLE_SHIFT_2;
4286           else
4287             ENCODE_LOCKING_SHIFT_2;
4288           break;
4289
4290         case 3:                 /* graphic register 3 */
4291           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4292             ENCODE_SINGLE_SHIFT_3;
4293           else
4294             ENCODE_LOCKING_SHIFT_3;
4295           break;
4296
4297         default:
4298           break;
4299         }
4300     }
4301
4302   *p_nchars = produced_chars;
4303   return dst;
4304 }
4305
4306
/* Bring the graphic planes and registers back to their initial
   state: emit a shift-in if graphic plane 0 does not currently hold
   register 0, then re-designate every register that has an initial
   charset (a non-negative CODING_ISO_INITIAL) different from its
   current designation.  Uses the variable `coding' of the expansion
   site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *charset;                                            \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        /* No initial charset for this register.  */                    \
        if (CODING_ISO_INITIAL (coding, reg) < 0)                       \
          continue;                                                     \
        /* Already in its initial state.  */                            \
        if (CODING_ISO_DESIGNATION (coding, reg)                        \
            == CODING_ISO_INITIAL (coding, reg))                        \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4325
4326
4327 /* Produce designation sequences of charsets in the line started from
4328    CHARBUF to a place pointed by DST, and return the number of
4329    produced bytes.  DST should not directly point a buffer text area
4330    which may be relocated by char_charset call.
4331
4332    If the current block ends before any end-of-line, we may fail to
4333    find all the necessary designations.  */
4334
4335 static ptrdiff_t
4336 encode_designation_at_bol (struct coding_system *coding,
4337                            int *charbuf, int *charbuf_end,
4338                            unsigned char *dst)
4339 {
4340   unsigned char *orig = dst;
4341   struct charset *charset;
4342   /* Table of charsets to be designated to each graphic register.  */
4343   int r[4];
4344   int c, found = 0, reg;
4345   ptrdiff_t produced_chars = 0;
4346   bool multibytep = coding->dst_multibyte;
4347   Lisp_Object attrs;
4348   Lisp_Object charset_list;
4349
4350   attrs = CODING_ID_ATTRS (coding->id);
4351   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4352   if (EQ (charset_list, Qiso_2022))
4353     charset_list = Viso_2022_charset_list;
4354
4355   for (reg = 0; reg < 4; reg++)
4356     r[reg] = -1;
4357
4358   while (charbuf < charbuf_end && found < 4)
4359     {
4360       int id;
4361
4362       c = *charbuf++;
4363       if (c == '\n')
4364         break;
4365       charset = char_charset (c, charset_list, NULL);
4366       id = CHARSET_ID (charset);
4367       reg = CODING_ISO_REQUEST (coding, id);
4368       if (reg >= 0 && r[reg] < 0)
4369         {
4370           found++;
4371           r[reg] = id;
4372         }
4373     }
4374
4375   if (found)
4376     {
4377       for (reg = 0; reg < 4; reg++)
4378         if (r[reg] >= 0
4379             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4380           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4381     }
4382
4383   return dst - orig;
4384 }
4385
4386 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
     /* Encode the CODING->charbuf_used elements of CODING->charbuf as
        ISO 2022 bytes, storing them in CODING->destination starting at
        offset CODING->produced.  On return CODING->produced and
        CODING->produced_char are updated, the beginning-of-line
        designation state is saved with CODING_ISO_BOL, and the result
        is recorded as CODING_RESULT_SUCCESS.  Always return 0.  */
4387
4388 static bool
4389 encode_coding_iso_2022 (struct coding_system *coding)
4390 {
       /* MULTIBYTEP is never named below; presumably the EMIT_* macros
          refer to it.  */
4391   bool multibytep = coding->dst_multibyte;
4392   int *charbuf = coding->charbuf;
4393   int *charbuf_end = charbuf + coding->charbuf_used;
4394   unsigned char *dst = coding->destination + coding->produced;
4395   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room passed to ASSURE_DESTINATION before each character; the
          same size as DESIG_BUF below.  */
4396   int safe_room = 16;
       /* True if designation sequences are to be produced before the
          next character is encoded.  */
4397   bool bol_designation
4398     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4399        && CODING_ISO_BOL (coding));
4400   ptrdiff_t produced_chars = 0;
4401   Lisp_Object attrs, eol_type, charset_list;
4402   bool ascii_compatible;
4403   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
4404   int preferred_charset_id = -1;
4405
4406   CODING_GET_INFO (coding, attrs, charset_list);
4407   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector-valued EOL type is treated as Qunix.  */
4408   if (VECTORP (eol_type))
4409     eol_type = Qunix;
4410
4411   setup_iso_safe_charsets (attrs);
4412   /* Charset list may have been changed.  */
4413   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4414   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4415
       /* ASCII characters are emitted directly only if the coding
          system is marked ASCII compatible and has neither
          CODING_ISO_FLAG_DESIGNATION nor CODING_ISO_FLAG_LOCKING_SHIFT
          set.  */
4416   ascii_compatible
4417     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4418        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4419                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4420
4421   while (charbuf < charbuf_end)
4422     {
4423       ASSURE_DESTINATION (safe_room);
4424
4425       if (bol_designation)
4426         {
4427           /* We have to produce designation sequences if any now.  */
4428           unsigned char desig_buf[16];
4429           ptrdiff_t nbytes;
4430           ptrdiff_t offset;
4431
               /* The sequences are built in a local buffer and copied
                  afterwards.  If a charset map was loaded meanwhile,
                  coding_change_destination reports by how much DST and
                  DST_END must be adjusted.  */
4432           charset_map_loaded = 0;
4433           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4434                                               desig_buf);
4435           if (charset_map_loaded
4436               && (offset = coding_change_destination (coding)))
4437             {
4438               dst += offset;
4439               dst_end += offset;
4440             }
4441           memcpy (dst, desig_buf, nbytes);
4442           dst += nbytes;
4443           /* We are sure that designation sequences are all ASCII bytes.  */
4444           produced_chars += nbytes;
4445           bol_designation = 0;
4446           ASSURE_DESTINATION (safe_room);
4447         }
4448
4449       c = *charbuf++;
4450
4451       if (c < 0)
4452         {
4453           /* Handle an annotation.  */
               /* A negative element starts an annotation that occupies
                  -C elements of CHARBUF, C itself included.  The element
                  after C tells the kind of annotation.  */
4454           switch (*charbuf)
4455             {
4456             case CODING_ANNOTATE_COMPOSITION_MASK:
4457               /* Not yet implemented.  */
4458               break;
4459             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Remember the annotated charset, but only if it is a
                      member of CHARSET_LIST.  */
4460               preferred_charset_id = charbuf[2];
4461               if (preferred_charset_id >= 0
4462                   && NILP (Fmemq (make_number (preferred_charset_id),
4463                                   charset_list)))
4464                 preferred_charset_id = -1;
4465               break;
4466             default:
4467               emacs_abort ();
4468             }
               /* Skip the rest of the annotation.  */
4469           charbuf += -c - 1;
4470           continue;
4471         }
4472
4473       /* Now encode the character C.  */
4474       if (c < 0x20 || c == 0x7F)
4475         {
               /* C is a control character.  */
4476           if (c == '\n'
4477               || (c == '\r' && EQ (eol_type, Qmac)))
4478             {
                   /* End of line: depending on the flags, emit the codes
                      that reset planes and registers, reset the recorded
                      designations to the initial ones, and arrange for
                      designation at the beginning of the next line.  */
4479               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4480                 ENCODE_RESET_PLANE_AND_REGISTER ();
4481               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4482                 {
4483                   int i;
4484
4485                   for (i = 0; i < 4; i++)
4486                     CODING_ISO_DESIGNATION (coding, i)
4487                       = CODING_ISO_INITIAL (coding, i);
4488                 }
4489               bol_designation = ((CODING_ISO_FLAGS (coding)
4490                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4491                                  != 0);
4492             }
4493           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4494             ENCODE_RESET_PLANE_AND_REGISTER ();
4495           EMIT_ONE_ASCII_BYTE (c);
4496         }
4497       else if (ASCII_CHAR_P (c))
4498         {
4499           if (ascii_compatible)
4500             EMIT_ONE_ASCII_BYTE (c);
4501           else
4502             {
4503               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4504               ENCODE_ISO_CHARACTER (charset, c);
4505             }
4506         }
4507       else if (CHAR_BYTE8_P (c))
4508         {
               /* Convert C with CHAR_TO_BYTE8 and emit that single byte.  */
4509           c = CHAR_TO_BYTE8 (c);
4510           EMIT_ONE_BYTE (c);
4511         }
4512       else
4513         {
4514           struct charset *charset;
4515
               /* Try the charset named by the latest charset annotation
                  first; otherwise, or if the check on it fails, search
                  CHARSET_LIST.  */
4516           if (preferred_charset_id >= 0)
4517             {
4518               bool result;
4519
4520               charset = CHARSET_FROM_ID (preferred_charset_id);
4521               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4522               if (! result)
4523                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4524                                      NULL, charset);
4525             }
4526           else
4527             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4528                                  NULL, charset);
4529           if (!charset)
4530             {
                   /* No charset was found for C.  Substitute
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION in ASCII when
                      CODING_MODE_SAFE_ENCODING is set, or else the coding
                      system's default character.  */
4531               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4532                 {
4533                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4534                   charset = CHARSET_FROM_ID (charset_ascii);
4535                 }
4536               else
4537                 {
4538                   c = coding->default_char;
4539                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4540                                        charset_list, NULL, charset);
4541                 }
4542             }
4543           ENCODE_ISO_CHARACTER (charset, c);
4544         }
4545     }
4546
       /* On the last block, a coding system that resets at end of line
          also gets the reset codes at the very end of the output.  */
4547   if (coding->mode & CODING_MODE_LAST_BLOCK
4548       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4549     {
4550       ASSURE_DESTINATION (safe_room);
4551       ENCODE_RESET_PLANE_AND_REGISTER ();
4552     }
4553   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4554   CODING_ISO_BOL (coding) = bol_designation;
4555   coding->produced_char += produced_chars;
4556   coding->produced = dst - coding->destination;
4557   return 0;
4558 }
4559
4560 \f
/*** 8. SJIS and BIG5 handlers ***/
4562
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  In the future, they may be supported only by CCL.  */
4566
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
4573
4574    --- CODE RANGE of SJIS ---
4575    (character set)      (range)
4576    ASCII                0x00 .. 0x7F
4577    KATAKANA-JISX0201    0xA0 .. 0xDF
4578    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4579             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4580    -------------------------------
4581
4582 */
4583
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.
4587
4588    --- CODE RANGE of BIG5 ---
4589    (character set)      (range)
4590    ASCII                0x00 .. 0x7F
4591    Big5 (1st byte)      0xA1 .. 0xFE
4592         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4593    --------------------------
4594
4595   */
4596
4597 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4598    Return true if a text is encoded in SJIS.  */
4599
4600 static bool
4601 detect_coding_sjis (struct coding_system *coding,
4602                     struct coding_detection_info *detect_info)
4603 {
4604   const unsigned char *src = coding->source, *src_base;
4605   const unsigned char *src_end = coding->source + coding->src_bytes;
4606   bool multibytep = coding->src_multibyte;
4607   ptrdiff_t consumed_chars = 0;
4608   int found = 0;
4609   int c;
4610   Lisp_Object attrs, charset_list;
4611   int max_first_byte_of_2_byte_code;
4612
4613   CODING_GET_INFO (coding, attrs, charset_list);
4614   max_first_byte_of_2_byte_code
4615     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4616
4617   detect_info->checked |= CATEGORY_MASK_SJIS;
4618   /* A coding system of this category is always ASCII compatible.  */
4619   src += coding->head_ascii;
4620
4621   while (1)
4622     {
4623       src_base = src;
4624       ONE_MORE_BYTE (c);
4625       if (c < 0x80)
4626         continue;
4627       if ((c >= 0x81 && c <= 0x9F)
4628           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4629         {
4630           ONE_MORE_BYTE (c);
4631           if (c < 0x40 || c == 0x7F || c > 0xFC)
4632             break;
4633           found = CATEGORY_MASK_SJIS;
4634         }
4635       else if (c >= 0xA0 && c < 0xE0)
4636         found = CATEGORY_MASK_SJIS;
4637       else
4638         break;
4639     }
4640   detect_info->rejected |= CATEGORY_MASK_SJIS;
4641   return 0;
4642
4643  no_more_source:
4644   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4645     {
4646       detect_info->rejected |= CATEGORY_MASK_SJIS;
4647       return 0;
4648     }
4649   detect_info->found |= found;
4650   return 1;
4651 }
4652
4653 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4654    Return true if a text is encoded in BIG5.  */
4655
4656 static bool
4657 detect_coding_big5 (struct coding_system *coding,
4658                     struct coding_detection_info *detect_info)
4659 {
4660   const unsigned char *src = coding->source, *src_base;
4661   const unsigned char *src_end = coding->source + coding->src_bytes;
4662   bool multibytep = coding->src_multibyte;
4663   ptrdiff_t consumed_chars = 0;
4664   int found = 0;
4665   int c;
4666
4667   detect_info->checked |= CATEGORY_MASK_BIG5;
4668   /* A coding system of this category is always ASCII compatible.  */
4669   src += coding->head_ascii;
4670
4671   while (1)
4672     {
4673       src_base = src;
4674       ONE_MORE_BYTE (c);
4675       if (c < 0x80)
4676         continue;
4677       if (c >= 0xA1)
4678         {
4679           ONE_MORE_BYTE (c);
4680           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4681             return 0;
4682           found = CATEGORY_MASK_BIG5;
4683         }
4684       else
4685         break;
4686     }
4687   detect_info->rejected |= CATEGORY_MASK_BIG5;
4688   return 0;
4689
4690  no_more_source:
4691   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4692     {
4693       detect_info->rejected |= CATEGORY_MASK_BIG5;
4694       return 0;
4695     }
4696   detect_info->found |= found;
4697   return 1;
4698 }
4699
4700 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4701
4702 static void
4703 decode_coding_sjis (struct coding_system *coding)
4704 {
4705   const unsigned char *src = coding->source + coding->consumed;
4706   const unsigned char *src_end = coding->source + coding->src_bytes;
4707   const unsigned char *src_base;
4708   int *charbuf = coding->charbuf + coding->charbuf_used;
4709   /* We may produce one charset annotation in one loop and one more at
4710      the end.  */
4711   int *charbuf_end
4712     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4713   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4714   bool multibytep = coding->src_multibyte;
4715   struct charset *charset_roman, *charset_kanji, *charset_kana;
4716   struct charset *charset_kanji2;
4717   Lisp_Object attrs, charset_list, val;
4718   ptrdiff_t char_offset = coding->produced_char;
4719   ptrdiff_t last_offset = char_offset;
4720   int last_id = charset_ascii;
4721   bool eol_dos
4722     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4723   int byte_after_cr = -1;
4724
4725   CODING_GET_INFO (coding, attrs, charset_list);
4726
4727   val = charset_list;
4728   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4729   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4730   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4731   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4732
4733   while (1)
4734     {
4735       int c, c1;
4736       struct charset *charset;
4737
4738       src_base = src;
4739       consumed_chars_base = consumed_chars;
4740
4741       if (charbuf >= charbuf_end)
4742         {
4743           if (byte_after_cr >= 0)
4744             src_base--;
4745           break;
4746         }
4747
4748       if (byte_after_cr >= 0)
4749         c = byte_after_cr, byte_after_cr = -1;
4750       else
4751         ONE_MORE_BYTE (c);
4752       if (c < 0)
4753         goto invalid_code;
4754       if (c < 0x80)
4755         {
4756           if (eol_dos && c == '\r')
4757             ONE_MORE_BYTE (byte_after_cr);
4758           charset = charset_roman;
4759         }
4760       else if (c == 0x80 || c == 0xA0)
4761         goto invalid_code;
4762       else if (c >= 0xA1 && c <= 0xDF)
4763         {
4764           /* SJIS -> JISX0201-Kana */
4765           c &= 0x7F;
4766           charset = charset_kana;
4767         }
4768       else if (c <= 0xEF)
4769         {
4770           /* SJIS -> JISX0208 */
4771           ONE_MORE_BYTE (c1);
4772           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4773             goto invalid_code;
4774           c = (c << 8) | c1;
4775           SJIS_TO_JIS (c);
4776           charset = charset_kanji;
4777         }
4778       else if (c <= 0xFC && charset_kanji2)
4779         {
4780           /* SJIS -> JISX0213-2 */
4781           ONE_MORE_BYTE (c1);
4782           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4783             goto invalid_code;
4784           c = (c << 8) | c1;
4785           SJIS_TO_JIS2 (c);
4786           charset = charset_kanji2;
4787         }
4788       else
4789         goto invalid_code;
4790       if (charset->id != charset_ascii
4791           && last_id != charset->id)
4792         {
4793           if (last_id != charset_ascii)
4794             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4795           last_id = charset->id;
4796           last_offset = char_offset;
4797         }
4798       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4799       *charbuf++ = c;
4800       char_offset++;
4801       continue;
4802
4803     invalid_code:
4804       src = src_base;
4805       consumed_chars = consumed_chars_base;
4806       ONE_MORE_BYTE (c);
4807       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4808       char_offset++;
4809     }
4810
4811  no_more_source:
4812   if (last_id != charset_ascii)
4813     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4814   coding->consumed_char += consumed_chars_base;
4815   coding->consumed = src_base - coding->source;
4816   coding->charbuf_used = charbuf - coding->charbuf;
4817 }
4818
4819 static void
4820 decode_coding_big5 (struct coding_system *coding)
4821 {
4822   const unsigned char *src = coding->source + coding->consumed;
4823   const unsigned char *src_end = coding->source + coding->src_bytes;
4824   const unsigned char *src_base;
4825   int *charbuf = coding->charbuf + coding->charbuf_used;
4826   /* We may produce one charset annotation in one loop and one more at
4827      the end.  */
4828   int *charbuf_end
4829     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4830   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4831   bool multibytep = coding->src_multibyte;
4832   struct charset *charset_roman, *charset_big5;
4833   Lisp_Object attrs, charset_list, val;
4834   ptrdiff_t char_offset = coding->produced_char;
4835   ptrdiff_t last_offset = char_offset;
4836   int last_id = charset_ascii;
4837   bool eol_dos
4838     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4839   int byte_after_cr = -1;
4840
4841   CODING_GET_INFO (coding, attrs, charset_list);
4842   val = charset_list;
4843   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4844   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4845
4846   while (1)
4847     {
4848       int c, c1;
4849       struct charset *charset;
4850
4851       src_base = src;
4852       consumed_chars_base = consumed_chars;
4853
4854       if (charbuf >= charbuf_end)
4855         {
4856           if (byte_after_cr >= 0)
4857             src_base--;
4858           break;
4859         }
4860
4861       if (byte_after_cr >= 0)
4862         c = byte_after_cr, byte_after_cr = -1;
4863       else
4864         ONE_MORE_BYTE (c);
4865
4866       if (c < 0)
4867         goto invalid_code;
4868       if (c < 0x80)
4869         {
4870           if (eol_dos && c == '\r')
4871             ONE_MORE_BYTE (byte_after_cr);
4872           charset = charset_roman;
4873         }
4874       else
4875         {
4876           /* BIG5 -> Big5 */
4877           if (c < 0xA1 || c > 0xFE)
4878             goto invalid_code;
4879           ONE_MORE_BYTE (c1);
4880           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4881             goto invalid_code;
4882           c = c << 8 | c1;
4883           charset = charset_big5;
4884         }
4885       if (charset->id != charset_ascii
4886           && last_id != charset->id)
4887         {
4888           if (last_id != charset_ascii)
4889             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4890           last_id = charset->id;
4891           last_offset = char_offset;
4892         }
4893       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4894       *charbuf++ = c;
4895       char_offset++;
4896       continue;
4897
4898     invalid_code:
4899       src = src_base;
4900       consumed_chars = consumed_chars_base;
4901       ONE_MORE_BYTE (c);
4902       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4903       char_offset++;
4904     }
4905
4906  no_more_source:
4907   if (last_id != charset_ascii)
4908     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4909   coding->consumed_char += consumed_chars_base;
4910   coding->consumed = src_base - coding->source;
4911   coding->charbuf_used = charbuf - coding->charbuf;
4912 }
4913
4914 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4915    This function can encode charsets `ascii', `katakana-jisx0201',
4916    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4917    are sure that all these charsets are registered as official charset
4918    (i.e. do not have extended leading-codes).  Characters of other
4919    charsets are produced without any encoding.  */
4920
4921 static bool
4922 encode_coding_sjis (struct coding_system *coding)
4923 {
4924   bool multibytep = coding->dst_multibyte;
4925   int *charbuf = coding->charbuf;
4926   int *charbuf_end = charbuf + coding->charbuf_used;
4927   unsigned char *dst = coding->destination + coding->produced;
4928   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4929   int safe_room = 4;
4930   ptrdiff_t produced_chars = 0;
4931   Lisp_Object attrs, charset_list, val;
4932   bool ascii_compatible;
4933   struct charset *charset_kanji, *charset_kana;
4934   struct charset *charset_kanji2;
4935   int c;
4936
4937   CODING_GET_INFO (coding, attrs, charset_list);
4938   val = XCDR (charset_list);
4939   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4940   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4941   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4942
4943   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4944
4945   while (charbuf < charbuf_end)
4946     {
4947       ASSURE_DESTINATION (safe_room);
4948       c = *charbuf++;
4949       /* Now encode the character C.  */
4950       if (ASCII_CHAR_P (c) && ascii_compatible)
4951         EMIT_ONE_ASCII_BYTE (c);
4952       else if (CHAR_BYTE8_P (c))
4953         {
4954           c = CHAR_TO_BYTE8 (c);
4955           EMIT_ONE_BYTE (c);
4956         }
4957       else
4958         {
4959           unsigned code;
4960           struct charset *charset;
4961           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4962                                &code, charset);
4963
4964           if (!charset)
4965             {
4966               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4967                 {
4968                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4969                   charset = CHARSET_FROM_ID (charset_ascii);
4970                 }
4971               else
4972                 {
4973                   c = coding->default_char;
4974                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4975                                        charset_list, &code, charset);
4976                 }
4977             }
4978           if (code == CHARSET_INVALID_CODE (charset))
4979             emacs_abort ();
4980           if (charset == charset_kanji)
4981             {
4982               int c1, c2;
4983               JIS_TO_SJIS (code);
4984               c1 = code >> 8, c2 = code & 0xFF;
4985               EMIT_TWO_BYTES (c1, c2);
4986             }
4987           else if (charset == charset_kana)
4988             EMIT_ONE_BYTE (code | 0x80);
4989           else if (charset_kanji2 && charset == charset_kanji2)
4990             {
4991               int c1, c2;
4992
4993               c1 = code >> 8;
4994               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4995                   || c1 == 0x28
4996                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4997                 {
4998                   JIS_TO_SJIS2 (code);
4999                   c1 = code >> 8, c2 = code & 0xFF;
5000                   EMIT_TWO_BYTES (c1, c2);
5001                 }
5002               else
5003                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5004             }
5005           else
5006             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5007         }
5008     }
5009   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5010   coding->produced_char += produced_chars;
5011   coding->produced = dst - coding->destination;
5012   return 0;
5013 }
5014
5015 static bool
5016 encode_coding_big5 (struct coding_system *coding)
5017 {
5018   bool multibytep = coding->dst_multibyte;
5019   int *charbuf = coding->charbuf;
5020   int *charbuf_end = charbuf + coding->charbuf_used;
5021   unsigned char *dst = coding->destination + coding->produced;
5022   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5023   int safe_room = 4;
5024   ptrdiff_t produced_chars = 0;
5025   Lisp_Object attrs, charset_list, val;
5026   bool ascii_compatible;
5027   struct charset *charset_big5;
5028   int c;
5029
5030   CODING_GET_INFO (coding, attrs, charset_list);
5031   val = XCDR (charset_list);
5032   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5033   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5034
5035   while (charbuf < charbuf_end)
5036     {
5037       ASSURE_DESTINATION (safe_room);
5038       c = *charbuf++;
5039       /* Now encode the character C.  */
5040       if (ASCII_CHAR_P (c) && ascii_compatible)
5041         EMIT_ONE_ASCII_BYTE (c);
5042       else if (CHAR_BYTE8_P (c))
5043         {
5044           c = CHAR_TO_BYTE8 (c);
5045           EMIT_ONE_BYTE (c);
5046         }
5047       else
5048         {
5049           unsigned code;
5050           struct charset *charset;
5051           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5052                                &code, charset);
5053
5054           if (! charset)
5055             {
5056               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5057                 {
5058                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5059                   charset = CHARSET_FROM_ID (charset_ascii);
5060                 }
5061               else
5062                 {
5063                   c = coding->default_char;
5064                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5065                                        charset_list, &code, charset);
5066                 }
5067             }
5068           if (code == CHARSET_INVALID_CODE (charset))
5069             emacs_abort ();
5070           if (charset == charset_big5)
5071             {
5072               int c1, c2;
5073
5074               c1 = code >> 8, c2 = code & 0xFF;
5075               EMIT_TWO_BYTES (c1, c2);
5076             }
5077           else
5078             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5079         }
5080     }
5081   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5082   coding->produced_char += produced_chars;
5083   coding->produced = dst - coding->destination;
5084   return 0;
5085 }
5086
5087 \f
/*** 9. CCL handlers ***/
5089
5090 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5091    Return true if a text is encoded in a coding system of which
5092    encoder/decoder are written in CCL program.  */
5093
5094 static bool
5095 detect_coding_ccl (struct coding_system *coding,
5096                    struct coding_detection_info *detect_info)
5097 {
5098   const unsigned char *src = coding->source, *src_base;
5099   const unsigned char *src_end = coding->source + coding->src_bytes;
5100   bool multibytep = coding->src_multibyte;
5101   ptrdiff_t consumed_chars = 0;
5102   int found = 0;
5103   unsigned char *valids;
5104   ptrdiff_t head_ascii = coding->head_ascii;
5105   Lisp_Object attrs;
5106
5107   detect_info->checked |= CATEGORY_MASK_CCL;
5108
5109   coding = &coding_categories[coding_category_ccl];
5110   valids = CODING_CCL_VALIDS (coding);
5111   attrs = CODING_ID_ATTRS (coding->id);
5112   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5113     src += head_ascii;
5114
5115   while (1)
5116     {
5117       int c;
5118
5119       src_base = src;
5120       ONE_MORE_BYTE (c);
5121       if (c < 0 || ! valids[c])
5122         break;
5123       if ((valids[c] > 1))
5124         found = CATEGORY_MASK_CCL;
5125     }
5126   detect_info->rejected |= CATEGORY_MASK_CCL;
5127   return 0;
5128
5129  no_more_source:
5130   detect_info->found |= found;
5131   return 1;
5132 }
5133
5134 static void
5135 decode_coding_ccl (struct coding_system *coding)
5136 {
5137   const unsigned char *src = coding->source + coding->consumed;
5138   const unsigned char *src_end = coding->source + coding->src_bytes;
5139   int *charbuf = coding->charbuf + coding->charbuf_used;
5140   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5141   ptrdiff_t consumed_chars = 0;
5142   bool multibytep = coding->src_multibyte;
5143   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5144   int source_charbuf[1024];
5145   int source_byteidx[1025];
5146   Lisp_Object attrs, charset_list;
5147
5148   CODING_GET_INFO (coding, attrs, charset_list);
5149
5150   while (1)
5151     {
5152       const unsigned char *p = src;
5153       ptrdiff_t offset;
5154       int i = 0;
5155
5156       if (multibytep)
5157         {
5158           while (i < 1024 && p < src_end)
5159             {
5160               source_byteidx[i] = p - src;
5161               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5162             }
5163           source_byteidx[i] = p - src;
5164         }
5165       else
5166         while (i < 1024 && p < src_end)
5167           source_charbuf[i++] = *p++;
5168
5169       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5170         ccl->last_block = true;
5171       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5172       charset_map_loaded = 0;
5173       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5174                   charset_list);
5175       if (charset_map_loaded
5176           && (offset = coding_change_source (coding)))
5177         {
5178           p += offset;
5179           src += offset;
5180           src_end += offset;
5181         }
5182       charbuf += ccl->produced;
5183       if (multibytep)
5184         src += source_byteidx[ccl->consumed];
5185       else
5186         src += ccl->consumed;
5187       consumed_chars += ccl->consumed;
5188       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5189         break;
5190     }
5191
5192   switch (ccl->status)
5193     {
5194     case CCL_STAT_SUSPEND_BY_SRC:
5195       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5196       break;
5197     case CCL_STAT_SUSPEND_BY_DST:
5198       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5199       break;
5200     case CCL_STAT_QUIT:
5201     case CCL_STAT_INVALID_CMD:
5202       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5203       break;
5204     default:
5205       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5206       break;
5207     }
5208   coding->consumed_char += consumed_chars;
5209   coding->consumed = src - coding->source;
5210   coding->charbuf_used = charbuf - coding->charbuf;
5211 }
5212
5213 static bool
5214 encode_coding_ccl (struct coding_system *coding)
5215 {
5216   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5217   bool multibytep = coding->dst_multibyte;
5218   int *charbuf = coding->charbuf;
5219   int *charbuf_end = charbuf + coding->charbuf_used;
5220   unsigned char *dst = coding->destination + coding->produced;
5221   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5222   int destination_charbuf[1024];
5223   ptrdiff_t produced_chars = 0;
5224   int i;
5225   Lisp_Object attrs, charset_list;
5226
5227   CODING_GET_INFO (coding, attrs, charset_list);
5228   if (coding->consumed_char == coding->src_chars
5229       && coding->mode & CODING_MODE_LAST_BLOCK)
5230     ccl->last_block = true;
5231
5232   do
5233     {
5234       ptrdiff_t offset;
5235
5236       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5237       charset_map_loaded = 0;
5238       ccl_driver (ccl, charbuf, destination_charbuf,
5239                   charbuf_end - charbuf, 1024, charset_list);
5240       if (charset_map_loaded
5241           && (offset = coding_change_destination (coding)))
5242         dst += offset;
5243       if (multibytep)
5244         {
5245           ASSURE_DESTINATION (ccl->produced * 2);
5246           for (i = 0; i < ccl->produced; i++)
5247             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5248         }
5249       else
5250         {
5251           ASSURE_DESTINATION (ccl->produced);
5252           for (i = 0; i < ccl->produced; i++)
5253             *dst++ = destination_charbuf[i] & 0xFF;
5254           produced_chars += ccl->produced;
5255         }
5256       charbuf += ccl->consumed;
5257       if (ccl->status == CCL_STAT_QUIT
5258           || ccl->status == CCL_STAT_INVALID_CMD)
5259         break;
5260     }
5261   while (charbuf < charbuf_end);
5262
5263   switch (ccl->status)
5264     {
5265     case CCL_STAT_SUSPEND_BY_SRC:
5266       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5267       break;
5268     case CCL_STAT_SUSPEND_BY_DST:
5269       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5270       break;
5271     case CCL_STAT_QUIT:
5272     case CCL_STAT_INVALID_CMD:
5273       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5274       break;
5275     default:
5276       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5277       break;
5278     }
5279
5280   coding->produced_char += produced_chars;
5281   coding->produced = dst - coding->destination;
5282   return 0;
5283 }
5284
5285 \f
5286 /*** 10, 11. no-conversion handlers ***/
5287
5288 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5289
5290 static void
5291 decode_coding_raw_text (struct coding_system *coding)
5292 {
5293   bool eol_dos
5294     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5295
5296   coding->chars_at_source = 1;
5297   coding->consumed_char = coding->src_chars;
5298   coding->consumed = coding->src_bytes;
5299   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5300     {
5301       coding->consumed_char--;
5302       coding->consumed--;
5303       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5304     }
5305   else
5306     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5307 }
5308
5309 static bool
5310 encode_coding_raw_text (struct coding_system *coding)
5311 {
5312   bool multibytep = coding->dst_multibyte;
5313   int *charbuf = coding->charbuf;
5314   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5315   unsigned char *dst = coding->destination + coding->produced;
5316   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5317   ptrdiff_t produced_chars = 0;
5318   int c;
5319
5320   if (multibytep)
5321     {
5322       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5323
5324       if (coding->src_multibyte)
5325         while (charbuf < charbuf_end)
5326           {
5327             ASSURE_DESTINATION (safe_room);
5328             c = *charbuf++;
5329             if (ASCII_CHAR_P (c))
5330               EMIT_ONE_ASCII_BYTE (c);
5331             else if (CHAR_BYTE8_P (c))
5332               {
5333                 c = CHAR_TO_BYTE8 (c);
5334                 EMIT_ONE_BYTE (c);
5335               }
5336             else
5337               {
5338                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5339
5340                 CHAR_STRING_ADVANCE (c, p1);
5341                 do
5342                   {
5343                     EMIT_ONE_BYTE (*p0);
5344                     p0++;
5345                   }
5346                 while (p0 < p1);
5347               }
5348           }
5349       else
5350         while (charbuf < charbuf_end)
5351           {
5352             ASSURE_DESTINATION (safe_room);
5353             c = *charbuf++;
5354             EMIT_ONE_BYTE (c);
5355           }
5356     }
5357   else
5358     {
5359       if (coding->src_multibyte)
5360         {
5361           int safe_room = MAX_MULTIBYTE_LENGTH;
5362
5363           while (charbuf < charbuf_end)
5364             {
5365               ASSURE_DESTINATION (safe_room);
5366               c = *charbuf++;
5367               if (ASCII_CHAR_P (c))
5368                 *dst++ = c;
5369               else if (CHAR_BYTE8_P (c))
5370                 *dst++ = CHAR_TO_BYTE8 (c);
5371               else
5372                 CHAR_STRING_ADVANCE (c, dst);
5373             }
5374         }
5375       else
5376         {
5377           ASSURE_DESTINATION (charbuf_end - charbuf);
5378           while (charbuf < charbuf_end && dst < dst_end)
5379             *dst++ = *charbuf++;
5380         }
5381       produced_chars = dst - (coding->destination + coding->produced);
5382     }
5383   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5384   coding->produced_char += produced_chars;
5385   coding->produced = dst - coding->destination;
5386   return 0;
5387 }
5388
5389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5390    Return true if a text is encoded in a charset-based coding system.  */
5391
5392 static bool
5393 detect_coding_charset (struct coding_system *coding,
5394                        struct coding_detection_info *detect_info)
5395 {
5396   const unsigned char *src = coding->source, *src_base;
5397   const unsigned char *src_end = coding->source + coding->src_bytes;
5398   bool multibytep = coding->src_multibyte;
5399   ptrdiff_t consumed_chars = 0;
5400   Lisp_Object attrs, valids, name;
5401   int found = 0;
5402   ptrdiff_t head_ascii = coding->head_ascii;
5403   bool check_latin_extra = 0;
5404
5405   detect_info->checked |= CATEGORY_MASK_CHARSET;
5406
5407   coding = &coding_categories[coding_category_charset];
5408   attrs = CODING_ID_ATTRS (coding->id);
5409   valids = AREF (attrs, coding_attr_charset_valids);
5410   name = CODING_ID_NAME (coding->id);
5411   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5412                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5413       || strncmp (SSDATA (SYMBOL_NAME (name)),
5414                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5415     check_latin_extra = 1;
5416
5417   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5418     src += head_ascii;
5419
5420   while (1)
5421     {
5422       int c;
5423       Lisp_Object val;
5424       struct charset *charset;
5425       int dim, idx;
5426
5427       src_base = src;
5428       ONE_MORE_BYTE (c);
5429       if (c < 0)
5430         continue;
5431       val = AREF (valids, c);
5432       if (NILP (val))
5433         break;
5434       if (c >= 0x80)
5435         {
5436           if (c < 0xA0
5437               && check_latin_extra
5438               && (!VECTORP (Vlatin_extra_code_table)
5439                   || NILP (AREF (Vlatin_extra_code_table, c))))
5440             break;
5441           found = CATEGORY_MASK_CHARSET;
5442         }
5443       if (INTEGERP (val))
5444         {
5445           charset = CHARSET_FROM_ID (XFASTINT (val));
5446           dim = CHARSET_DIMENSION (charset);
5447           for (idx = 1; idx < dim; idx++)
5448             {
5449               if (src == src_end)
5450                 goto too_short;
5451               ONE_MORE_BYTE (c);
5452               if (c < charset->code_space[(dim - 1 - idx) * 4]
5453                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5454                 break;
5455             }
5456           if (idx < dim)
5457             break;
5458         }
5459       else
5460         {
5461           idx = 1;
5462           for (; CONSP (val); val = XCDR (val))
5463             {
5464               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5465               dim = CHARSET_DIMENSION (charset);
5466               while (idx < dim)
5467                 {
5468                   if (src == src_end)
5469                     goto too_short;
5470                   ONE_MORE_BYTE (c);
5471                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5472                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5473                     break;
5474                   idx++;
5475                 }
5476               if (idx == dim)
5477                 {
5478                   val = Qnil;
5479                   break;
5480                 }
5481             }
5482           if (CONSP (val))
5483             break;
5484         }
5485     }
5486  too_short:
5487   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5488   return 0;
5489
5490  no_more_source:
5491   detect_info->found |= found;
5492   return 1;
5493 }
5494
/* Decode the bytes of CODING's source, starting at offset
   coding->consumed, and append the resulting characters to
   coding->charbuf.

   The first byte of each code is looked up in the vector stored in
   the coding_attr_charset_valids attribute.  The entry is either one
   charset ID (an integer) or a list of candidate charset IDs; any
   other entry makes the byte invalid.  The remaining bytes of the
   code, as many as the dimension of the charset requires, are then
   read and the code is converted with CODING_DECODE_CHAR.

   On exit, coding->consumed, coding->consumed_char and
   coding->charbuf_used are updated.  The consumed counts are taken
   from src_base and consumed_chars_base, i.e. from the position
   recorded at the top of the last loop iteration.  */
5495 static void
5496 decode_coding_charset (struct coding_system *coding)
5497 {
5498   const unsigned char *src = coding->source + coding->consumed;
5499   const unsigned char *src_end = coding->source + coding->src_bytes;
5500   const unsigned char *src_base;
5501   int *charbuf = coding->charbuf + coding->charbuf_used;
5502   /* We may produce one charset annotation in one loop and one more at
5503      the end.  */
5504   int *charbuf_end
5505     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5506   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5507   bool multibytep = coding->src_multibyte;
5508   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5509   Lisp_Object valids;
5510   ptrdiff_t char_offset = coding->produced_char;
       /* LAST_ID and LAST_OFFSET describe the current run of characters
          decoded with the same non-ASCII charset: its charset ID and the
          character offset where it started.  charset_ascii means that no
          such run is open.  */
5511   ptrdiff_t last_offset = char_offset;
5512   int last_id = charset_ascii;
5513   bool eol_dos
5514     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 if no such
          byte is pending.  */
5515   int byte_after_cr = -1;
5516
5517   valids = AREF (attrs, coding_attr_charset_valids);
5518
5519   while (1)
5520     {
5521       int c;
5522       Lisp_Object val;
5523       struct charset *charset;
5524       int dim;
5525       int len = 1;
5526       unsigned code;
5527
           /* Remember where this code starts so that we can rewind to it
              and report only completely processed codes as consumed.  */
5528       src_base = src;
5529       consumed_chars_base = consumed_chars;
5530
5531       if (charbuf >= charbuf_end)
5532         {
               /* No room left in charbuf.  A pending byte_after_cr has
                  already been fetched from the source, so step src_base
                  back by one to leave that byte unconsumed.  */
5533           if (byte_after_cr >= 0)
5534             src_base--;
5535           break;
5536         }
5537
5538       if (byte_after_cr >= 0)
5539         {
               /* Use the byte that was read ahead after a CR in the
                  previous iteration.  */
5540           c = byte_after_cr;
5541           byte_after_cr = -1;
5542         }
5543       else
5544         {
5545           ONE_MORE_BYTE (c);
               /* With DOS-style EOL, fetch the byte following a CR right
                  away; it is processed in the next iteration.  */
5546           if (eol_dos && c == '\r')
5547             ONE_MORE_BYTE (byte_after_cr);
5548         }
           /* NOTE(review): a negative C presumably marks a source element
              that is not a plain byte -- confirm against ONE_MORE_BYTE.  */
5549       if (c < 0)
5550         goto invalid_code;
5551       code = c;
5552
           /* Only an integer (one charset ID) or a cons (a list of
              charset IDs) marks C as a valid leading byte.  */
5553       val = AREF (valids, c);
5554       if (! INTEGERP (val) && ! CONSP (val))
5555         goto invalid_code;
5556       if (INTEGERP (val))
5557         {
5558           charset = CHARSET_FROM_ID (XFASTINT (val));
5559           dim = CHARSET_DIMENSION (charset);
               /* Accumulate the remaining bytes of the code, most
                  significant byte first.  */
5560           while (len < dim)
5561             {
5562               ONE_MORE_BYTE (c);
5563               code = (code << 8) | c;
5564               len++;
5565             }
5566           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5567                               charset, code, c);
5568         }
5569       else
5570         {
5571           /* VAL is a list of charset IDs.  It is assured that the
5572              list is sorted by charset dimensions (smaller one
5573              comes first).  */
5574           while (CONSP (val))
5575             {
5576               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5577               dim = CHARSET_DIMENSION (charset);
                   /* LEN and CODE are kept across candidates, so only the
                      bytes not read yet are fetched for a charset of a
                      larger dimension.  */
5578               while (len < dim)
5579                 {
5580                   ONE_MORE_BYTE (c);
5581                   code = (code << 8) | c;
5582                   len++;
5583                 }
5584               CODING_DECODE_CHAR (coding, src, src_base,
5585                                   src_end, charset, code, c);
                   /* A non-negative C means this candidate produced a
                      character; stop trying further charsets.  */
5586               if (c >= 0)
5587                 break;
5588               val = XCDR (val);
5589             }
5590         }
5591       if (c < 0)
5592         goto invalid_code;
           /* The charset changed to a different non-ASCII one: close the
              run of the previous non-ASCII charset (if any) and start a
              new run at the current character offset.  */
5593       if (charset->id != charset_ascii
5594           && last_id != charset->id)
5595         {
5596           if (last_id != charset_ascii)
5597             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5598           last_id = charset->id;
5599           last_offset = char_offset;
5600         }
5601
5602       *charbuf++ = c;
5603       char_offset++;
5604       continue;
5605
5606     invalid_code:
           /* Rewind to the first byte of the rejected code and store just
              that one element: a negative C is negated, an ASCII byte is
              kept as is, and any other byte is converted with
              BYTE8_TO_CHAR.  */
5607       src = src_base;
5608       consumed_chars = consumed_chars_base;
5609       ONE_MORE_BYTE (c);
5610       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5611       char_offset++;
5612     }
5613
       /* NOTE(review): no goto visible in this function targets this
          label; presumably ONE_MORE_BYTE jumps here when the source is
          exhausted -- confirm against the macro.  The loop above also
          falls through to here when charbuf is full.  */
5614  no_more_source:
5615   if (last_id != charset_ascii)
5616     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5617   coding->consumed_char += consumed_chars_base;
5618   coding->consumed = src_base - coding->source;
5619   coding->charbuf_used = charbuf - coding->charbuf;
5620 }
5621
/* Encode the characters stored in coding->charbuf (the first
   coding->charbuf_used elements) and append the resulting bytes to
   coding->destination, starting at offset coding->produced.

   An ASCII character is emitted as is when the coding system is
   ASCII compatible, and an eight-bit-byte character is emitted as
   the byte it stands for.  For any other character a charset is
   looked up with CODING_CHAR_CHARSET and the code is emitted with as
   many bytes as the dimension of that charset.  A character for
   which no charset is found is replaced by one substitute byte.

   Record CODING_RESULT_SUCCESS, update coding->produced_char and
   coding->produced, and return 0.  */
5622 static bool
5623 encode_coding_charset (struct coding_system *coding)
5624 {
5625   bool multibytep = coding->dst_multibyte;
5626   int *charbuf = coding->charbuf;
5627   int *charbuf_end = charbuf + coding->charbuf_used;
5628   unsigned char *dst = coding->destination + coding->produced;
5629   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5630   int safe_room = MAX_MULTIBYTE_LENGTH;
5631   ptrdiff_t produced_chars = 0;
5632   Lisp_Object attrs, charset_list;
5633   bool ascii_compatible;
5634   int c;
5635
5636   CODING_GET_INFO (coding, attrs, charset_list);
5637   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5638
5639   while (charbuf < charbuf_end)
5640     {
5641       struct charset *charset;
5642       unsigned code;
5643
           /* NOTE(review): presumably guarantees that at least SAFE_ROOM
              bytes can be written at DST -- confirm against the macro.  */
5644       ASSURE_DESTINATION (safe_room);
5645       c = *charbuf++;
5646       if (ascii_compatible && ASCII_CHAR_P (c))
5647         EMIT_ONE_ASCII_BYTE (c);
5648       else if (CHAR_BYTE8_P (c))
5649         {
               /* C is an eight-bit-byte character: emit the byte obtained
                  with CHAR_TO_BYTE8.  */
5650           c = CHAR_TO_BYTE8 (c);
5651           EMIT_ONE_BYTE (c);
5652         }
5653       else
5654         {
               /* NOTE(review): judging from the uses below, this sets
                  CHARSET to a charset of CHARSET_LIST that contains C (or
                  to a null pointer) and CODE to the code of C in it --
                  confirm against the macro.  */
5655           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5656                                &code, charset);
5657
5658           if (charset)
5659             {
                   /* Emit CODE with one byte per charset dimension, most
                      significant byte first.  */
5660               if (CHARSET_DIMENSION (charset) == 1)
5661                 EMIT_ONE_BYTE (code);
5662               else if (CHARSET_DIMENSION (charset) == 2)
5663                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5664               else if (CHARSET_DIMENSION (charset) == 3)
5665                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5666               else
5667                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5668                                  (code >> 8) & 0xFF, code & 0xFF);
5669             }
5670           else
5671             {
                   /* No charset for C: emit one substitute byte, which is
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe
                      encoding mode and coding->default_char otherwise.  */
5672               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5673                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5674               else
5675                 c = coding->default_char;
5676               EMIT_ONE_BYTE (c);
5677             }
5678         }
5679     }
5680
5681   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5682   coding->produced_char += produced_chars;
5683   coding->produced = dst - coding->destination;
5684   return 0;
5685 }
5686
5687 \f
5688 /*** 10. C library functions ***/
5689
5690 /* Setup coding context CODING from information about CODING_SYSTEM.
5691    If CODING_SYSTEM is nil, `undecided' is assumed.  If
5692    CODING_SYSTEM is invalid, signal an error.

        Besides coding->id, this initializes coding->mode,
        coding->common_flags, coding->max_charset_id,
        coding->safe_charsets, coding->default_char,
        coding->carryover_bytes and coding->raw_destination, installs
        the detector, decoder and encoder functions that match the type
        of the coding system, and resets the type-specific state kept
        in CODING.  */
5693
5694 void
5695 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5696 {
5697   Lisp_Object attrs;
5698   Lisp_Object eol_type;
5699   Lisp_Object coding_type;
5700   Lisp_Object val;
5701
5702   if (NILP (coding_system))
5703     coding_system = Qundecided;
5704
5705   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5706
5707   attrs = CODING_ID_ATTRS (coding->id);
       /* When EOL conversion is inhibited, proceed as if the EOL type
          were unix.  */
5708   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5709
5710   coding->mode = 0;
       /* A vector EOL type means that the EOL format is not decided yet
          (see coding_inherit_eol_type), so decoding needs detection.  A
          decided EOL type other than unix needs conversion both when
          decoding and when encoding.  */
5711   if (VECTORP (eol_type))
5712     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5713                             | CODING_REQUIRE_DETECTION_MASK);
5714   else if (! EQ (eol_type, Qunix))
5715     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5716                             | CODING_REQUIRE_ENCODING_MASK);
5717   else
5718     coding->common_flags = 0;
       /* A non-nil post-read attribute requires decoding, and a non-nil
          pre-write attribute requires encoding.  */
5719   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5720     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5721   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5722     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5723   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5724     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5725
       /* The safe-charsets attribute is a string; keep a pointer to its
          data together with its highest valid index.  */
5726   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5727   coding->max_charset_id = SCHARS (val) - 1;
5728   coding->safe_charsets = SDATA (val);
5729   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5730   coding->carryover_bytes = 0;
5731   coding->raw_destination = 0;
5732
       /* Install the handlers and the type-specific state according to
          the type of the coding system.  */
5733   coding_type = CODING_ATTR_TYPE (attrs);
5734   if (EQ (coding_type, Qundecided))
5735     {
           /* There is no detector function here; detection is requested
              through the common flag and the raw-text handlers are
              installed.  */
5736       coding->detector = NULL;
5737       coding->decoder = decode_coding_raw_text;
5738       coding->encoder = encode_coding_raw_text;
5739       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5740       coding->spec.undecided.inhibit_nbd
5741         = (encode_inhibit_flag
5742            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5743       coding->spec.undecided.inhibit_ied
5744         = (encode_inhibit_flag
5745            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5746       coding->spec.undecided.prefer_utf_8
5747         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5748     }
5749   else if (EQ (coding_type, Qiso_2022))
5750     {
5751       int i;
5752       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5753
5754       /* Invoke graphic register 0 to plane 0.  */
5755       CODING_ISO_INVOCATION (coding, 0) = 0;
5756       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5757       CODING_ISO_INVOCATION (coding, 1)
5758         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5759       /* Setup the initial status of designation.  */
5760       for (i = 0; i < 4; i++)
5761         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5762       /* Not single shifting initially.  */
5763       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5764       /* Beginning of buffer should also be regarded as bol. */
5765       CODING_ISO_BOL (coding) = 1;
5766       coding->detector = detect_coding_iso_2022;
5767       coding->decoder = decode_coding_iso_2022;
5768       coding->encoder = encode_coding_iso_2022;
5769       if (flags & CODING_ISO_FLAG_SAFE)
5770         coding->mode |= CODING_MODE_SAFE_ENCODING;
5771       coding->common_flags
5772         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5773             | CODING_REQUIRE_FLUSHING_MASK);
5774       if (flags & CODING_ISO_FLAG_COMPOSITION)
5775         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5776       if (flags & CODING_ISO_FLAG_DESIGNATION)
5777         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5778       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5779         {
               /* NOTE(review): setup_iso_safe_charsets presumably updates
                  the safe-charsets attribute, which is why it is fetched
                  again here -- confirm.  */
5780           setup_iso_safe_charsets (attrs);
5781           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5782           coding->max_charset_id = SCHARS (val) - 1;
5783           coding->safe_charsets = SDATA (val);
5784         }
5785       CODING_ISO_FLAGS (coding) = flags;
5786       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5787       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5788       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5789       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5790     }
5791   else if (EQ (coding_type, Qcharset))
5792     {
5793       coding->detector = detect_coding_charset;
5794       coding->decoder = decode_coding_charset;
5795       coding->encoder = encode_coding_charset;
5796       coding->common_flags
5797         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5798     }
5799   else if (EQ (coding_type, Qutf_8))
5800     {
           /* The BOM attribute selects the BOM handling: a cons means
              detect, t means with BOM, anything else means without.  */
5801       val = AREF (attrs, coding_attr_utf_bom);
5802       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5803                                    : EQ (val, Qt) ? utf_with_bom
5804                                    : utf_without_bom);
5805       coding->detector = detect_coding_utf_8;
5806       coding->decoder = decode_coding_utf_8;
5807       coding->encoder = encode_coding_utf_8;
5808       coding->common_flags
5809         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5810       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5811         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5812     }
5813   else if (EQ (coding_type, Qutf_16))
5814     {
5815       val = AREF (attrs, coding_attr_utf_bom);
5816       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5817                                     : EQ (val, Qt) ? utf_with_bom
5818                                     : utf_without_bom);
           /* Any endian attribute other than `big' selects little
              endian.  */
5819       val = AREF (attrs, coding_attr_utf_16_endian);
5820       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5821                                        : utf_16_little_endian);
5822       CODING_UTF_16_SURROGATE (coding) = 0;
5823       coding->detector = detect_coding_utf_16;
5824       coding->decoder = decode_coding_utf_16;
5825       coding->encoder = encode_coding_utf_16;
5826       coding->common_flags
5827         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5828       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5829         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5830     }
5831   else if (EQ (coding_type, Qccl))
5832     {
5833       coding->detector = detect_coding_ccl;
5834       coding->decoder = decode_coding_ccl;
5835       coding->encoder = encode_coding_ccl;
5836       coding->common_flags
5837         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5838             | CODING_REQUIRE_FLUSHING_MASK);
5839     }
5840   else if (EQ (coding_type, Qemacs_mule))
5841     {
5842       coding->detector = detect_coding_emacs_mule;
5843       coding->decoder = decode_coding_emacs_mule;
5844       coding->encoder = encode_coding_emacs_mule;
5845       coding->common_flags
5846         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5847       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5848           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5849         {
               /* Build a safe-charsets table that covers every charset in
                  Vemacs_mule_charset_list: all entries start as 255 and
                  the entry of each listed charset is set to 0.
                  NOTE(review): coding->safe_charsets ends up pointing
                  into the data of a Lisp string referenced only by the
                  local SAFE_CHARSETS -- confirm that this pointer stays
                  valid for as long as CODING is used.  */
5850           Lisp_Object tail, safe_charsets;
5851           int max_charset_id = 0;
5852
5853           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5854                tail = XCDR (tail))
5855             if (max_charset_id < XFASTINT (XCAR (tail)))
5856               max_charset_id = XFASTINT (XCAR (tail));
5857           safe_charsets = make_uninit_string (max_charset_id + 1);
5858           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5859           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5860                tail = XCDR (tail))
5861             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5862           coding->max_charset_id = max_charset_id;
5863           coding->safe_charsets = SDATA (safe_charsets);
5864         }
5865       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5866       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5867     }
5868   else if (EQ (coding_type, Qshift_jis))
5869     {
5870       coding->detector = detect_coding_sjis;
5871       coding->decoder = decode_coding_sjis;
5872       coding->encoder = encode_coding_sjis;
5873       coding->common_flags
5874         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5875     }
5876   else if (EQ (coding_type, Qbig5))
5877     {
5878       coding->detector = detect_coding_big5;
5879       coding->decoder = decode_coding_big5;
5880       coding->encoder = encode_coding_big5;
5881       coding->common_flags
5882         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5883     }
5884   else                          /* EQ (coding_type, Qraw_text) */
5885     {
5886       coding->detector = NULL;
5887       coding->decoder = decode_coding_raw_text;
5888       coding->encoder = encode_coding_raw_text;
           /* Raw text needs conversion only for EOL handling: decoding
              for any EOL type other than unix, and encoding only when
              that EOL type is decided (not a vector).  */
5889       if (! EQ (eol_type, Qunix))
5890         {
5891           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5892           if (! VECTORP (eol_type))
5893             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5894         }
5895
5896     }
5897
5898   return;
5899 }
5900
5901 /* Return a list of charsets supported by CODING.  */
5902
5903 Lisp_Object
5904 coding_charset_list (struct coding_system *coding)
5905 {
5906   Lisp_Object attrs, charset_list;
5907
5908   CODING_GET_INFO (coding, attrs, charset_list);
5909   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5910     {
5911       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5912
5913       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5914         charset_list = Viso_2022_charset_list;
5915     }
5916   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5917     {
5918       charset_list = Vemacs_mule_charset_list;
5919     }
5920   return charset_list;
5921 }
5922
5923
5924 /* Return a list of charsets supported by CODING-SYSTEM.  */
5925
5926 Lisp_Object
5927 coding_system_charset_list (Lisp_Object coding_system)
5928 {
5929   ptrdiff_t id;
5930   Lisp_Object attrs, charset_list;
5931
5932   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5933   attrs = CODING_ID_ATTRS (id);
5934
5935   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5936     {
5937       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5938
5939       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5940         charset_list = Viso_2022_charset_list;
5941       else
5942         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5943     }
5944   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5945     {
5946       charset_list = Vemacs_mule_charset_list;
5947     }
5948   else
5949     {
5950       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5951     }
5952   return charset_list;
5953 }
5954
5955
5956 /* Return raw-text or one of its subsidiaries that has the same
5957    eol_type as CODING-SYSTEM.  */
5958
5959 Lisp_Object
5960 raw_text_coding_system (Lisp_Object coding_system)
5961 {
5962   Lisp_Object spec, attrs;
5963   Lisp_Object eol_type, raw_text_eol_type;
5964
5965   if (NILP (coding_system))
5966     return Qraw_text;
5967   spec = CODING_SYSTEM_SPEC (coding_system);
5968   attrs = AREF (spec, 0);
5969
5970   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5971     return coding_system;
5972
5973   eol_type = AREF (spec, 2);
5974   if (VECTORP (eol_type))
5975     return Qraw_text;
5976   spec = CODING_SYSTEM_SPEC (Qraw_text);
5977   raw_text_eol_type = AREF (spec, 2);
5978   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5979           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5980           : AREF (raw_text_eol_type, 2));
5981 }
5982
5983 /* Return true if CODING corresponds to raw-text coding-system.  */
5984
5985 bool
5986 raw_text_coding_system_p (struct coding_system *coding)
5987 {
5988   return (coding->decoder == decode_coding_raw_text
5989           && coding->encoder == encode_coding_raw_text) ? true : false;
5990 }
5991
5992
5993 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5994    the subsidiary that has the same eol-spec as PARENT (if it is not
5995    nil and specifies end-of-line format) or the system's setting
5996    (system_eol_type).  */
5997
5998 Lisp_Object
5999 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6000 {
6001   Lisp_Object spec, eol_type;
6002
6003   if (NILP (coding_system))
6004     coding_system = Qraw_text;
6005   else
6006     CHECK_CODING_SYSTEM (coding_system);
6007   spec = CODING_SYSTEM_SPEC (coding_system);
6008   eol_type = AREF (spec, 2);
6009   if (VECTORP (eol_type))
6010     {
6011       Lisp_Object parent_eol_type;
6012
6013       if (! NILP (parent))
6014         {
6015           Lisp_Object parent_spec;
6016
6017           CHECK_CODING_SYSTEM (parent);
6018           parent_spec = CODING_SYSTEM_SPEC (parent);
6019           parent_eol_type = AREF (parent_spec, 2);
6020           if (VECTORP (parent_eol_type))
6021             parent_eol_type = system_eol_type;
6022         }
6023       else
6024         parent_eol_type = system_eol_type;
6025       if (EQ (parent_eol_type, Qunix))
6026         coding_system = AREF (eol_type, 0);
6027       else if (EQ (parent_eol_type, Qdos))
6028         coding_system = AREF (eol_type, 1);
6029       else if (EQ (parent_eol_type, Qmac))
6030         coding_system = AREF (eol_type, 2);
6031     }
6032   return coding_system;
6033 }
6034
6035
6036 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6037    decided for writing to a process.  If not, complement them, and
6038    return a new coding system.  */
6039
6040 Lisp_Object
6041 complement_process_encoding_system (Lisp_Object coding_system)
6042 {
6043   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6044   Lisp_Object spec, attrs;
6045   int i;
6046
6047   for (i = 0; i < 3; i++)
6048     {
6049       if (i == 1)
6050         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6051       else if (i == 2)
6052         coding_system = preferred_coding_system ();
6053       spec = CODING_SYSTEM_SPEC (coding_system);
6054       if (NILP (spec))
6055         continue;
6056       attrs = AREF (spec, 0);
6057       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6058         coding_base = CODING_ATTR_BASE_NAME (attrs);
6059       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6060         eol_base = coding_system;
6061       if (! NILP (coding_base) && ! NILP (eol_base))
6062         break;
6063     }
6064
6065   if (i > 0)
6066     /* The original CODING_SYSTEM didn't specify text-conversion or
6067        eol-conversion.  Be sure that we return a fully complemented
6068        coding system.  */
6069     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6070   return coding_system;
6071 }
6072
6073
6074 /* Emacs has a mechanism to automatically detect a coding system if it
6075    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6076    it's impossible to distinguish some coding systems accurately
6077    because they use the same range of codes.  So, at first, coding
6078    systems are grouped into the following categories:
6079
6080    o coding-category-emacs-mule
6081
6082         The category for a coding system which has the same code range
6083         as Emacs' internal format.  Assigned the coding-system (Lisp
6084         symbol) `emacs-mule' by default.
6085
6086    o coding-category-sjis
6087
6088         The category for a coding system which has the same code range
6089         as SJIS.  Assigned the coding-system (Lisp
6090         symbol) `japanese-shift-jis' by default.
6091
6092    o coding-category-iso-7
6093
6094         The category for a coding system which has the same code range
6095         as ISO2022 of 7-bit environment.  This doesn't use any locking
6096         shift and single shift functions.  This can encode/decode all
6097         charsets.  Assigned the coding-system (Lisp symbol)
6098         `iso-2022-7bit' by default.
6099
6100    o coding-category-iso-7-tight
6101
6102         Same as coding-category-iso-7 except that this can
6103         encode/decode only the specified charsets.
6104
6105    o coding-category-iso-8-1
6106
6107         The category for a coding system which has the same code range
6108         as ISO2022 of 8-bit environment and graphic plane 1 used only
6109         for DIMENSION1 charset.  This doesn't use any locking shift
6110         and single shift functions.  Assigned the coding-system (Lisp
6111         symbol) `iso-latin-1' by default.
6112
6113    o coding-category-iso-8-2
6114
6115         The category for a coding system which has the same code range
6116         as ISO2022 of 8-bit environment and graphic plane 1 used only
6117         for DIMENSION2 charset.  This doesn't use any locking shift
6118         and single shift functions.  Assigned the coding-system (Lisp
6119         symbol) `japanese-iso-8bit' by default.
6120
6121    o coding-category-iso-7-else
6122
6123         The category for a coding system which has the same code range
6124         as ISO2022 of 7-bit environment but uses locking shift or
6125         single shift functions.  Assigned the coding-system (Lisp
6126         symbol) `iso-2022-7bit-lock' by default.
6127
6128    o coding-category-iso-8-else
6129
6130         The category for a coding system which has the same code range
6131         as ISO2022 of 8-bit environment but uses locking shift or
6132         single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `iso-2022-8bit-ss2' by default.
6134
6135    o coding-category-big5
6136
6137         The category for a coding system which has the same code range
6138         as BIG5.  Assigned the coding-system (Lisp symbol)
6139         `cn-big5' by default.
6140
6141    o coding-category-utf-8
6142
6143         The category for a coding system which has the same code range
6144         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6145         symbol) `utf-8' by default.
6146
6147    o coding-category-utf-16-be
6148
6149         The category for a coding system in which a text has an
6150         Unicode signature (cf. Unicode Standard) in the order of BIG
6151         endian at the head.  Assigned the coding-system (Lisp symbol)
6152         `utf-16-be' by default.
6153
6154    o coding-category-utf-16-le
6155
6156         The category for a coding system in which a text has an
6157         Unicode signature (cf. Unicode Standard) in the order of
6158         LITTLE endian at the head.  Assigned the coding-system (Lisp
6159         symbol) `utf-16-le' by default.
6160
6161    o coding-category-ccl
6162
6163         The category for a coding system of which encoder/decoder is
6164         written in CCL programs.  The default value is nil, i.e., no
6165         coding system is assigned.
6166
6167    o coding-category-binary
6168
6169         The category for a coding system not categorized in any of the
6170         above.  Assigned the coding-system (Lisp symbol)
6171         `no-conversion' by default.
6172
6173    Each of them is a Lisp symbol whose value is an actual
6174    `coding-system' (also a Lisp symbol) assigned by a user.
6175    What Emacs does actually is to detect a category of coding system.
6176    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6177    decide only one possible category, it selects a category of the
6178    highest priority.  Priorities of categories are also specified by a
6179    user in a Lisp variable `coding-category-list'.
6180
6181 */
6182
/* Forward declaration; being static, adjust_coding_eol_type is
   defined in this file.  */
6183 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6184                                            int eol_seen);
6185
6186
6187 /* Return the number of ASCII characters at the head of the source.
6188    By side effects, set coding->head_ascii and update
6189    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6190    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6191    reliable only when all the source bytes are ASCII.  */
6192
6193 static ptrdiff_t
6194 check_ascii (struct coding_system *coding)
6195 {
6196   const unsigned char *src, *end;
6197   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6198   int eol_seen = coding->eol_seen;
6199
6200   coding_set_source (coding);
6201   src = coding->source;
6202   end = src + coding->src_bytes;
6203
6204   if (inhibit_eol_conversion
6205       || SYMBOLP (eol_type))
6206     {
6207       /* We don't have to check EOL format.  */
6208       while (src < end && !( *src & 0x80))
6209         {
6210           if (*src++ == '\n')
6211             eol_seen |= EOL_SEEN_LF;
6212         }
6213     }
6214   else
6215     {
6216       end--;                /* We look ahead one byte for "CR LF".  */
6217       while (src < end)
6218         {
6219           int c = *src;
6220
6221           if (c & 0x80)
6222             break;
6223           src++;
6224           if (c == '\r')
6225             {
6226               if (*src == '\n')
6227                 {
6228                   eol_seen |= EOL_SEEN_CRLF;
6229                   src++;
6230                 }
6231               else
6232                 eol_seen |= EOL_SEEN_CR;
6233             }
6234           else if (c == '\n')
6235             eol_seen |= EOL_SEEN_LF;
6236         }
6237       if (src == end)
6238         {
6239           int c = *src;
6240
6241           /* All bytes but the last one C are ASCII.  */
6242           if (! (c & 0x80))
6243             {
6244               if (c == '\r')
6245                 eol_seen |= EOL_SEEN_CR;
6246               else if (c  == '\n')
6247                 eol_seen |= EOL_SEEN_LF;
6248               src++;
6249             }
6250         }
6251     }
6252   coding->head_ascii = src - coding->source;
6253   coding->eol_seen = eol_seen;
6254   return (coding->head_ascii);
6255 }
6256
6257
6258 /* Return the number of characters at the source if all the bytes are
6259    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6260    effects, update coding->eol_seen.  The value of coding->eol_seen is
6261    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6262    the value is reliable only when all the source bytes are valid
6263    UTF-8.  */
6264
6265 static ptrdiff_t
6266 check_utf_8 (struct coding_system *coding)
6267 {
6268   const unsigned char *src, *end;
6269   int eol_seen;
6270   ptrdiff_t nchars = coding->head_ascii;
6271
6272   if (coding->head_ascii < 0)
6273     check_ascii (coding);
6274   else
6275     coding_set_source (coding);
6276   src = coding->source + coding->head_ascii;
6277   /* We look ahead one byte for CR LF.  */
6278   end = coding->source + coding->src_bytes - 1;
6279   eol_seen = coding->eol_seen;
6280   while (src < end)
6281     {
6282       int c = *src;
6283
6284       if (UTF_8_1_OCTET_P (*src))
6285         {
6286           src++;
6287           if (c < 0x20)
6288             {
6289               if (c == '\r')
6290                 {
6291                   if (*src == '\n')
6292                     {
6293                       eol_seen |= EOL_SEEN_CRLF;
6294                       src++;
6295                       nchars++;
6296                     }
6297                   else
6298                     eol_seen |= EOL_SEEN_CR;
6299                 }
6300               else if (c == '\n')
6301                 eol_seen |= EOL_SEEN_LF;
6302             }
6303         }
6304       else if (UTF_8_2_OCTET_LEADING_P (c))
6305         {
6306           if (c < 0xC2          /* overlong sequence */
6307               || src + 1 >= end
6308               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6309             return -1;
6310           src += 2;
6311         }
6312       else if (UTF_8_3_OCTET_LEADING_P (c))
6313         {
6314           if (src + 2 >= end
6315               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6316                     && UTF_8_EXTRA_OCTET_P (src[2])))
6317             return -1;
6318           c = (((c & 0xF) << 12)
6319                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6320           if (c < 0x800                       /* overlong sequence */
6321               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6322             return -1;
6323           src += 3;
6324         }
6325       else if (UTF_8_4_OCTET_LEADING_P (c))
6326         {
6327           if (src + 3 >= end
6328               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6329                     && UTF_8_EXTRA_OCTET_P (src[2])
6330                     && UTF_8_EXTRA_OCTET_P (src[3])))
6331             return -1;
6332           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6333                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6334           if (c < 0x10000       /* overlong sequence */
6335               || c >= 0x110000) /* non-Unicode character  */
6336             return -1;
6337           src += 4;
6338         }
6339       else
6340         return -1;
6341       nchars++;
6342     }
6343
6344   if (src == end)
6345     {
6346       if (! UTF_8_1_OCTET_P (*src))
6347         return -1;
6348       nchars++;
6349       if (*src == '\r')
6350         eol_seen |= EOL_SEEN_CR;
6351       else if (*src  == '\n')
6352         eol_seen |= EOL_SEEN_LF;
6353     }
6354   coding->eol_seen = eol_seen;
6355   return nchars;
6356 }
6357
6358
6359 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6360    SOURCE is encoded.  If CATEGORY is one of
6361    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6362    two-byte, else they are encoded by one-byte.
6363
6364    Return one of EOL_SEEN_XXX.  */
6365
6366 #define MAX_EOL_CHECK_COUNT 3
6367
6368 static int
6369 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6370             enum coding_category category)
6371 {
6372   const unsigned char *src = source, *src_end = src + src_bytes;
6373   unsigned char c;
6374   int total  = 0;
6375   int eol_seen = EOL_SEEN_NONE;
6376
6377   if ((1 << category) & CATEGORY_MASK_UTF_16)
6378     {
6379       bool msb = category == (coding_category_utf_16_le
6380                               | coding_category_utf_16_le_nosig);
6381       bool lsb = !msb;
6382
6383       while (src + 1 < src_end)
6384         {
6385           c = src[lsb];
6386           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6387             {
6388               int this_eol;
6389
6390               if (c == '\n')
6391                 this_eol = EOL_SEEN_LF;
6392               else if (src + 3 >= src_end
6393                        || src[msb + 2] != 0
6394                        || src[lsb + 2] != '\n')
6395                 this_eol = EOL_SEEN_CR;
6396               else
6397                 {
6398                   this_eol = EOL_SEEN_CRLF;
6399                   src += 2;
6400                 }
6401
6402               if (eol_seen == EOL_SEEN_NONE)
6403                 /* This is the first end-of-line.  */
6404                 eol_seen = this_eol;
6405               else if (eol_seen != this_eol)
6406                 {
6407                   /* The found type is different from what found before.
6408                      Allow for stray ^M characters in DOS EOL files.  */
6409                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6410                       || (eol_seen == EOL_SEEN_CRLF
6411                           && this_eol == EOL_SEEN_CR))
6412                     eol_seen = EOL_SEEN_CRLF;
6413                   else
6414                     {
6415                       eol_seen = EOL_SEEN_LF;
6416                       break;
6417                     }
6418                 }
6419               if (++total == MAX_EOL_CHECK_COUNT)
6420                 break;
6421             }
6422           src += 2;
6423         }
6424     }
6425   else
6426     while (src < src_end)
6427       {
6428         c = *src++;
6429         if (c == '\n' || c == '\r')
6430           {
6431             int this_eol;
6432
6433             if (c == '\n')
6434               this_eol = EOL_SEEN_LF;
6435             else if (src >= src_end || *src != '\n')
6436               this_eol = EOL_SEEN_CR;
6437             else
6438               this_eol = EOL_SEEN_CRLF, src++;
6439
6440             if (eol_seen == EOL_SEEN_NONE)
6441               /* This is the first end-of-line.  */
6442               eol_seen = this_eol;
6443             else if (eol_seen != this_eol)
6444               {
6445                 /* The found type is different from what found before.
6446                    Allow for stray ^M characters in DOS EOL files.  */
6447                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6448                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6449                   eol_seen = EOL_SEEN_CRLF;
6450                 else
6451                   {
6452                     eol_seen = EOL_SEEN_LF;
6453                     break;
6454                   }
6455               }
6456             if (++total == MAX_EOL_CHECK_COUNT)
6457               break;
6458           }
6459       }
6460   return eol_seen;
6461 }
6462
6463
6464 static Lisp_Object
6465 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6466 {
6467   Lisp_Object eol_type;
6468
6469   eol_type = CODING_ID_EOL_TYPE (coding->id);
6470   if (! VECTORP (eol_type))
6471     /* Already adjusted.  */
6472     return eol_type;
6473   if (eol_seen & EOL_SEEN_LF)
6474     {
6475       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6476       eol_type = Qunix;
6477     }
6478   else if (eol_seen & EOL_SEEN_CRLF)
6479     {
6480       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6481       eol_type = Qdos;
6482     }
6483   else if (eol_seen & EOL_SEEN_CR)
6484     {
6485       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6486       eol_type = Qmac;
6487     }
6488   return eol_type;
6489 }
6490
6491 /* Detect how a text specified in CODING is encoded.  If a coding
6492    system is detected, update fields of CODING by the detected coding
6493    system.  */
6494
6495 static void
6496 detect_coding (struct coding_system *coding)
6497 {
6498   const unsigned char *src, *src_end;
6499   unsigned int saved_mode = coding->mode;
6500   Lisp_Object found = Qnil;
6501   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6502
6503   coding->consumed = coding->consumed_char = 0;
6504   coding->produced = coding->produced_char = 0;
6505   coding_set_source (coding);
6506
6507   src_end = coding->source + coding->src_bytes;
6508
6509   coding->eol_seen = EOL_SEEN_NONE;
6510   /* If we have not yet decided the text encoding type, detect it
6511      now.  */
6512   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6513     {
6514       int c, i;
6515       struct coding_detection_info detect_info;
6516       bool null_byte_found = 0, eight_bit_found = 0;
6517       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6518                                        inhibit_null_byte_detection);
6519       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6520                                        inhibit_iso_escape_detection);
6521       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6522
6523       coding->head_ascii = 0;
6524       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6525       for (src = coding->source; src < src_end; src++)
6526         {
6527           c = *src;
6528           if (c & 0x80)
6529             {
6530               eight_bit_found = 1;
6531               if (null_byte_found)
6532                 break;
6533             }
6534           else if (c < 0x20)
6535             {
6536               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6537                   && ! inhibit_ied
6538                   && ! detect_info.checked)
6539                 {
6540                   if (detect_coding_iso_2022 (coding, &detect_info))
6541                     {
6542                       /* We have scanned the whole data.  */
6543                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6544                         {
6545                           /* We didn't find an 8-bit code.  We may
6546                              have found a null-byte, but it's very
6547                              rare that a binary file conforms to
6548                              ISO-2022.  */
6549                           src = src_end;
6550                           coding->head_ascii = src - coding->source;
6551                         }
6552                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6553                       break;
6554                     }
6555                 }
6556               else if (! c && !inhibit_nbd)
6557                 {
6558                   null_byte_found = 1;
6559                   if (eight_bit_found)
6560                     break;
6561                 }
6562               else if (! disable_ascii_optimization
6563                        && ! inhibit_eol_conversion)
6564                 {
6565                   if (c == '\r')
6566                     {
6567                       if (src < src_end && src[1] == '\n')
6568                         {
6569                           coding->eol_seen |= EOL_SEEN_CRLF;
6570                           src++;
6571                           if (! eight_bit_found)
6572                             coding->head_ascii++;
6573                         }
6574                       else
6575                         coding->eol_seen |= EOL_SEEN_CR;
6576                     }
6577                   else if (c == '\n')
6578                     {
6579                       coding->eol_seen |= EOL_SEEN_LF;
6580                     }
6581                 }
6582
6583               if (! eight_bit_found)
6584                 coding->head_ascii++;
6585             }
6586           else if (! eight_bit_found)
6587             coding->head_ascii++;
6588         }
6589
6590       if (null_byte_found || eight_bit_found
6591           || coding->head_ascii < coding->src_bytes
6592           || detect_info.found)
6593         {
6594           enum coding_category category;
6595           struct coding_system *this;
6596
6597           if (coding->head_ascii == coding->src_bytes)
6598             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6599             for (i = 0; i < coding_category_raw_text; i++)
6600               {
6601                 category = coding_priorities[i];
6602                 this = coding_categories + category;
6603                 if (detect_info.found & (1 << category))
6604                   break;
6605               }
6606           else
6607             {
6608               if (null_byte_found)
6609                 {
6610                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6611                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6612                 }
6613               else if (prefer_utf_8
6614                        && detect_coding_utf_8 (coding, &detect_info))
6615                 {
6616                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6617                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6618                 }
6619               for (i = 0; i < coding_category_raw_text; i++)
6620                 {
6621                   category = coding_priorities[i];
6622                   this = coding_categories + category;
6623                   /* Some of this->detector (e.g. detect_coding_sjis)
6624                      require this information.  */
6625                   coding->id = this->id;
6626                   if (this->id < 0)
6627                     {
6628                       /* No coding system of this category is defined.  */
6629                       detect_info.rejected |= (1 << category);
6630                     }
6631                   else if (category >= coding_category_raw_text)
6632                     continue;
6633                   else if (detect_info.checked & (1 << category))
6634                     {
6635                       if (detect_info.found & (1 << category))
6636                         break;
6637                     }
6638                   else if ((*(this->detector)) (coding, &detect_info)
6639                            && detect_info.found & (1 << category))
6640                     break;
6641                 }
6642             }
6643
6644           if (i < coding_category_raw_text)
6645             {
6646               if (category == coding_category_utf_8_auto)
6647                 {
6648                   Lisp_Object coding_systems;
6649
6650                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6651                                          coding_attr_utf_bom);
6652                   if (CONSP (coding_systems))
6653                     {
6654                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6655                         found = XCAR (coding_systems);
6656                       else
6657                         found = XCDR (coding_systems);
6658                     }
6659                   else
6660                     found = CODING_ID_NAME (this->id);
6661                 }
6662               else if (category == coding_category_utf_16_auto)
6663                 {
6664                   Lisp_Object coding_systems;
6665
6666                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6667                                          coding_attr_utf_bom);
6668                   if (CONSP (coding_systems))
6669                     {
6670                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6671                         found = XCAR (coding_systems);
6672                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6673                         found = XCDR (coding_systems);
6674                     }
6675                   else
6676                     found = CODING_ID_NAME (this->id);
6677                 }
6678               else
6679                 found = CODING_ID_NAME (this->id);
6680             }
6681           else if (null_byte_found)
6682             found = Qno_conversion;
6683           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6684                    == CATEGORY_MASK_ANY)
6685             found = Qraw_text;
6686           else if (detect_info.rejected)
6687             for (i = 0; i < coding_category_raw_text; i++)
6688               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6689                 {
6690                   this = coding_categories + coding_priorities[i];
6691                   found = CODING_ID_NAME (this->id);
6692                   break;
6693                 }
6694         }
6695     }
6696   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6697            == coding_category_utf_8_auto)
6698     {
6699       Lisp_Object coding_systems;
6700       struct coding_detection_info detect_info;
6701
6702       coding_systems
6703         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6704       detect_info.found = detect_info.rejected = 0;
6705       if (check_ascii (coding) == coding->src_bytes)
6706         {
6707           if (CONSP (coding_systems))
6708             found = XCDR (coding_systems);
6709         }
6710       else
6711         {
6712           if (CONSP (coding_systems)
6713               && detect_coding_utf_8 (coding, &detect_info))
6714             {
6715               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6716                 found = XCAR (coding_systems);
6717               else
6718                 found = XCDR (coding_systems);
6719             }
6720         }
6721     }
6722   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6723            == coding_category_utf_16_auto)
6724     {
6725       Lisp_Object coding_systems;
6726       struct coding_detection_info detect_info;
6727
6728       coding_systems
6729         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6730       detect_info.found = detect_info.rejected = 0;
6731       coding->head_ascii = 0;
6732       if (CONSP (coding_systems)
6733           && detect_coding_utf_16 (coding, &detect_info))
6734         {
6735           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6736             found = XCAR (coding_systems);
6737           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6738             found = XCDR (coding_systems);
6739         }
6740     }
6741
6742   if (! NILP (found))
6743     {
6744       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6745                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6746                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6747                            : EOL_SEEN_LF);
6748
6749       setup_coding_system (found, coding);
6750       if (specified_eol != EOL_SEEN_NONE)
6751         adjust_coding_eol_type (coding, specified_eol);
6752     }
6753
6754   coding->mode = saved_mode;
6755 }
6756
6757
6758 static void
6759 decode_eol (struct coding_system *coding)
6760 {
6761   Lisp_Object eol_type;
6762   unsigned char *p, *pbeg, *pend;
6763
6764   eol_type = CODING_ID_EOL_TYPE (coding->id);
6765   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6766     return;
6767
6768   if (NILP (coding->dst_object))
6769     pbeg = coding->destination;
6770   else
6771     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6772   pend = pbeg + coding->produced;
6773
6774   if (VECTORP (eol_type))
6775     {
6776       int eol_seen = EOL_SEEN_NONE;
6777
6778       for (p = pbeg; p < pend; p++)
6779         {
6780           if (*p == '\n')
6781             eol_seen |= EOL_SEEN_LF;
6782           else if (*p == '\r')
6783             {
6784               if (p + 1 < pend && *(p + 1) == '\n')
6785                 {
6786                   eol_seen |= EOL_SEEN_CRLF;
6787                   p++;
6788                 }
6789               else
6790                 eol_seen |= EOL_SEEN_CR;
6791             }
6792         }
6793       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6794       if ((eol_seen & EOL_SEEN_CRLF) != 0
6795           && (eol_seen & EOL_SEEN_CR) != 0
6796           && (eol_seen & EOL_SEEN_LF) == 0)
6797         eol_seen = EOL_SEEN_CRLF;
6798       else if (eol_seen != EOL_SEEN_NONE
6799           && eol_seen != EOL_SEEN_LF
6800           && eol_seen != EOL_SEEN_CRLF
6801           && eol_seen != EOL_SEEN_CR)
6802         eol_seen = EOL_SEEN_LF;
6803       if (eol_seen != EOL_SEEN_NONE)
6804         eol_type = adjust_coding_eol_type (coding, eol_seen);
6805     }
6806
6807   if (EQ (eol_type, Qmac))
6808     {
6809       for (p = pbeg; p < pend; p++)
6810         if (*p == '\r')
6811           *p = '\n';
6812     }
6813   else if (EQ (eol_type, Qdos))
6814     {
6815       ptrdiff_t n = 0;
6816       ptrdiff_t pos = coding->dst_pos;
6817       ptrdiff_t pos_byte = coding->dst_pos_byte;
6818       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6819
6820       /* This assertion is here instead of code, now deleted, that
6821          handled the NILP case, which no longer happens with the
6822          current codebase.  */
6823       eassert (!NILP (coding->dst_object));
6824
6825       while (pos_byte < pos_end)
6826         {
6827           int incr;
6828
6829           p = BYTE_POS_ADDR (pos_byte);
6830           if (coding->dst_multibyte)
6831             incr = BYTES_BY_CHAR_HEAD (*p);
6832           else
6833             incr = 1;
6834
6835           if (*p == '\r' && p[1] == '\n')
6836             {
6837               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6838               n++;
6839               pos_end--;
6840             }
6841           pos++;
6842           pos_byte += incr;
6843         }
6844       coding->produced -= n;
6845       coding->produced_char -= n;
6846     }
6847 }
6848
6849
/* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
   exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
   alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).
   get_translation_table clamps the value it stores through its
   MAX_LOOKUP argument to this limit.  */
enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6854
6855 /* Return a translation table (or list of them) from coding system
6856    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6857    not ENCODEP). */
6858
6859 static Lisp_Object
6860 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6861 {
6862   Lisp_Object standard, translation_table;
6863   Lisp_Object val;
6864
6865   if (NILP (Venable_character_translation))
6866     {
6867       if (max_lookup)
6868         *max_lookup = 0;
6869       return Qnil;
6870     }
6871   if (encodep)
6872     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6873       standard = Vstandard_translation_table_for_encode;
6874   else
6875     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6876       standard = Vstandard_translation_table_for_decode;
6877   if (NILP (translation_table))
6878     translation_table = standard;
6879   else
6880     {
6881       if (SYMBOLP (translation_table))
6882         translation_table = Fget (translation_table, Qtranslation_table);
6883       else if (CONSP (translation_table))
6884         {
6885           translation_table = Fcopy_sequence (translation_table);
6886           for (val = translation_table; CONSP (val); val = XCDR (val))
6887             if (SYMBOLP (XCAR (val)))
6888               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6889         }
6890       if (CHAR_TABLE_P (standard))
6891         {
6892           if (CONSP (translation_table))
6893             translation_table = nconc2 (translation_table, list1 (standard));
6894           else
6895             translation_table = list2 (translation_table, standard);
6896         }
6897     }
6898
6899   if (max_lookup)
6900     {
6901       *max_lookup = 1;
6902       if (CHAR_TABLE_P (translation_table)
6903           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6904         {
6905           val = XCHAR_TABLE (translation_table)->extras[1];
6906           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6907             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6908         }
6909       else if (CONSP (translation_table))
6910         {
6911           Lisp_Object tail;
6912
6913           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6914             if (CHAR_TABLE_P (XCAR (tail))
6915                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6916               {
6917                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6918                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6919                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6920               }
6921         }
6922     }
6923   return translation_table;
6924 }
6925
/* Look up character C in TABLE, a char-table or a list whose
   char-table elements are consulted in order (other elements are
   skipped).

   When a table maps C to a character, C is replaced by that
   character and TRANS is reset to Qnil; with a list, the lookup then
   continues in the following tables using the new C.  When a table
   maps C to some other non-nil value, TRANS is set to that value and,
   with a list, the lookup stops there.  If nothing of that kind is
   found, TRANS is Qnil.

   C and TRANS are assigned to, so they must be lvalues; all three
   arguments are evaluated more than once.  The list case declares a
   local variable named `tail'.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6950
6951
6952 /* Return a translation of character(s) at BUF according to TRANS.
6953    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
6954    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
6955    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
6956    found, or Qt if BUF is too short to lookup characters in FROM.  As
6957    a side effect, if a translation is found, *NCHARS is set to the
6958    number of characters being translated.  */
6959
6960 static Lisp_Object
6961 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6962 {
6963   if (INTEGERP (trans) || VECTORP (trans))
6964     {
6965       *nchars = 1;
6966       return trans;
6967     }
6968   for (; CONSP (trans); trans = XCDR (trans))
6969     {
6970       Lisp_Object val = XCAR (trans);
6971       Lisp_Object from = XCAR (val);
6972       ptrdiff_t len = ASIZE (from);
6973       ptrdiff_t i;
6974
6975       for (i = 0; i < len; i++)
6976         {
6977           if (buf + i == buf_end)
6978             return Qt;
6979           if (XINT (AREF (from, i)) != buf[i])
6980             break;
6981         }
6982       if (i == len)
6983         {
6984           *nchars = len;
6985           return XCDR (val);
6986         }
6987     }
6988   return Qnil;
6989 }
6990
6991
6992 static int
6993 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6994                bool last_block)
6995 {
6996   unsigned char *dst = coding->destination + coding->produced;
6997   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6998   ptrdiff_t produced;
6999   ptrdiff_t produced_chars = 0;
7000   int carryover = 0;
7001
7002   if (! coding->chars_at_source)
7003     {
7004       /* Source characters are in coding->charbuf.  */
7005       int *buf = coding->charbuf;
7006       int *buf_end = buf + coding->charbuf_used;
7007
7008       if (EQ (coding->src_object, coding->dst_object)
7009           && ! NILP (coding->dst_object))
7010         {
7011           eassert (growable_destination (coding));
7012           coding_set_source (coding);
7013           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7014         }
7015
7016       while (buf < buf_end)
7017         {
7018           int c = *buf;
7019           ptrdiff_t i;
7020
7021           if (c >= 0)
7022             {
7023               ptrdiff_t from_nchars = 1, to_nchars = 1;
7024               Lisp_Object trans = Qnil;
7025
7026               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7027               if (! NILP (trans))
7028                 {
7029                   trans = get_translation (trans, buf, buf_end, &from_nchars);
7030                   if (INTEGERP (trans))
7031                     c = XINT (trans);
7032                   else if (VECTORP (trans))
7033                     {
7034                       to_nchars = ASIZE (trans);
7035                       c = XINT (AREF (trans, 0));
7036                     }
7037                   else if (EQ (trans, Qt) && ! last_block)
7038                     break;
7039                 }
7040
7041               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7042                 {
7043                   eassert (growable_destination (coding));
7044                   ptrdiff_t dst_size;
7045                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7046                                           &dst_size)
7047                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7048                     memory_full (SIZE_MAX);
7049                   dst = alloc_destination (coding, dst_size, dst);
7050                   if (EQ (coding->src_object, coding->dst_object))
7051                     {
7052                       coding_set_source (coding);
7053                       dst_end = (((unsigned char *) coding->source)
7054                                  + coding->consumed);
7055                     }
7056                   else
7057                     dst_end = coding->destination + coding->dst_bytes;
7058                 }
7059
7060               for (i = 0; i < to_nchars; i++)
7061                 {
7062                   if (i > 0)
7063                     c = XINT (AREF (trans, i));
7064                   if (coding->dst_multibyte
7065                       || ! CHAR_BYTE8_P (c))
7066                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7067                   else
7068                     *dst++ = CHAR_TO_BYTE8 (c);
7069                 }
7070               produced_chars += to_nchars;
7071               buf += from_nchars;
7072             }
7073           else
7074             /* This is an annotation datum.  (-C) is the length.  */
7075             buf += -c;
7076         }
7077       carryover = buf_end - buf;
7078     }
7079   else
7080     {
7081       /* Source characters are at coding->source.  */
7082       const unsigned char *src = coding->source;
7083       const unsigned char *src_end = src + coding->consumed;
7084
7085       if (EQ (coding->dst_object, coding->src_object))
7086         {
7087           eassert (growable_destination (coding));
7088           dst_end = (unsigned char *) src;
7089         }
7090       if (coding->src_multibyte != coding->dst_multibyte)
7091         {
7092           if (coding->src_multibyte)
7093             {
7094               bool multibytep = 1;
7095               ptrdiff_t consumed_chars = 0;
7096
7097               while (1)
7098                 {
7099                   const unsigned char *src_base = src;
7100                   int c;
7101
7102                   ONE_MORE_BYTE (c);
7103                   if (dst == dst_end)
7104                     {
7105                       eassert (growable_destination (coding));
7106                       if (EQ (coding->src_object, coding->dst_object))
7107                         dst_end = (unsigned char *) src;
7108                       if (dst == dst_end)
7109                         {
7110                           ptrdiff_t offset = src - coding->source;
7111
7112                           dst = alloc_destination (coding, src_end - src + 1,
7113                                                    dst);
7114                           dst_end = coding->destination + coding->dst_bytes;
7115                           coding_set_source (coding);
7116                           src = coding->source + offset;
7117                           src_end = coding->source + coding->consumed;
7118                           if (EQ (coding->src_object, coding->dst_object))
7119                             dst_end = (unsigned char *) src;
7120                         }
7121                     }
7122                   *dst++ = c;
7123                   produced_chars++;
7124                 }
7125             no_more_source:
7126               ;
7127             }
7128           else
7129             while (src < src_end)
7130               {
7131                 bool multibytep = 1;
7132                 int c = *src++;
7133
7134                 if (dst >= dst_end - 1)
7135                   {
7136                     eassert (growable_destination (coding));
7137                     if (EQ (coding->src_object, coding->dst_object))
7138                       dst_end = (unsigned char *) src;
7139                     if (dst >= dst_end - 1)
7140                       {
7141                         ptrdiff_t offset = src - coding->source;
7142                         ptrdiff_t more_bytes;
7143
7144                         if (EQ (coding->src_object, coding->dst_object))
7145                           more_bytes = ((src_end - src) / 2) + 2;
7146                         else
7147                           more_bytes = src_end - src + 2;
7148                         dst = alloc_destination (coding, more_bytes, dst);
7149                         dst_end = coding->destination + coding->dst_bytes;
7150                         coding_set_source (coding);
7151                         src = coding->source + offset;
7152                         src_end = coding->source + coding->consumed;
7153                         if (EQ (coding->src_object, coding->dst_object))
7154                           dst_end = (unsigned char *) src;
7155                       }
7156                   }
7157                 EMIT_ONE_BYTE (c);
7158               }
7159         }
7160       else
7161         {
7162           if (!EQ (coding->src_object, coding->dst_object))
7163             {
7164               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7165
7166               if (require > 0)
7167                 {
7168                   ptrdiff_t offset = src - coding->source;
7169
7170                   dst = alloc_destination (coding, require, dst);
7171                   coding_set_source (coding);
7172                   src = coding->source + offset;
7173                   src_end = coding->source + coding->consumed;
7174                 }
7175             }
7176           produced_chars = coding->consumed_char;
7177           while (src < src_end)
7178             *dst++ = *src++;
7179         }
7180     }
7181
7182   produced = dst - (coding->destination + coding->produced);
7183   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7184     insert_from_gap (produced_chars, produced, 0);
7185   coding->produced += produced;
7186   coding->produced_char += produced_chars;
7187   return carryover;
7188 }
7189
7190 /* Compose text in CODING->object according to the annotation data at
7191    CHARBUF.  CHARBUF is an array:
7192      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7193  */
7194
7195 static void
7196 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7197 {
7198   int len;
7199   ptrdiff_t to;
7200   enum composition_method method;
7201   Lisp_Object components;
7202
7203   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7204   to = pos + charbuf[2];
7205   method = (enum composition_method) (charbuf[4]);
7206
7207   if (method == COMPOSITION_RELATIVE)
7208     components = Qnil;
7209   else
7210     {
7211       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7212       int i, j;
7213
7214       if (method == COMPOSITION_WITH_RULE)
7215         len = charbuf[2] * 3 - 2;
7216       charbuf += MAX_ANNOTATION_LENGTH;
7217       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7218       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7219         {
7220           if (charbuf[i] >= 0)
7221             args[j] = make_number (charbuf[i]);
7222           else
7223             {
7224               i++;
7225               args[j] = make_number (charbuf[i] % 0x100);
7226             }
7227         }
7228       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7229     }
7230   compose_text (pos, to, components, Qnil, coding->dst_object);
7231 }
7232
7233
7234 /* Put `charset' property on text in CODING->object according to
7235    the annotation data at CHARBUF.  CHARBUF is an array:
7236      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7237  */
7238
7239 static void
7240 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7241 {
7242   ptrdiff_t from = pos - charbuf[2];
7243   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7244
7245   Fput_text_property (make_number (from), make_number (pos),
7246                       Qcharset, CHARSET_NAME (charset),
7247                       coding->dst_object);
7248 }
7249
/* Upper bound on the number of units allocated for coding->charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING: set CODING->charbuf to
   an array of int obtained from SAFE_ALLOCA and CODING->charbuf_size
   to its length, which is SIZE plus MAX_CHARBUF_EXTRA_SIZE, capped at
   MAX_CHARBUF_SIZE.  Because this expands to SAFE_ALLOCA, the calling
   function is expected to have USE_SAFE_ALLOCA in effect and to call
   SAFE_FREE when done, as decode_coding and encode_coding do.
   CODING is parenthesized in the expansion so that any pointer-valued
   expression may be passed, not only a plain identifier.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7263
7264 static void
7265 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7266 {
7267   int *charbuf = coding->charbuf;
7268   int *charbuf_end = charbuf + coding->charbuf_used;
7269
7270   if (NILP (coding->dst_object))
7271     return;
7272
7273   while (charbuf < charbuf_end)
7274     {
7275       if (*charbuf >= 0)
7276         pos++, charbuf++;
7277       else
7278         {
7279           int len = -*charbuf;
7280
7281           if (len > 2)
7282             switch (charbuf[1])
7283               {
7284               case CODING_ANNOTATE_COMPOSITION_MASK:
7285                 produce_composition (coding, charbuf, pos);
7286                 break;
7287               case CODING_ANNOTATE_CHARSET_MASK:
7288                 produce_charset (coding, charbuf, pos);
7289                 break;
7290               default:
7291                 break;
7292               }
7293           charbuf += len;
7294         }
7295     }
7296 }
7297
7298 /* Decode the data at CODING->src_object into CODING->dst_object.
7299    CODING->src_object is a buffer, a string, or nil.
7300    CODING->dst_object is a buffer.
7301
7302    If CODING->src_object is a buffer, it must be the current buffer.
7303    In this case, if CODING->src_pos is positive, it is a position of
7304    the source text in the buffer, otherwise, the source text is in the
7305    gap area of the buffer, and CODING->src_pos specifies the offset of
7306    the text from GPT (which must be the same as PT).  If this is the
7307    same buffer as CODING->dst_object, CODING->src_pos must be
7308    negative.
7309
7310    If CODING->src_object is a string, CODING->src_pos is an index to
7311    that string.
7312
7313    If CODING->src_object is nil, CODING->source must already point to
7314    the non-relocatable memory area.  In this case, CODING->src_pos is
7315    an offset from CODING->source.
7316
7317    The decoded data is inserted at the current point of the buffer
7318    CODING->dst_object.
7319 */
7320
7321 static void
7322 decode_coding (struct coding_system *coding)
7323 {
7324   Lisp_Object attrs;
7325   Lisp_Object undo_list;
7326   Lisp_Object translation_table;
7327   struct ccl_spec cclspec;
7328   int carryover;
7329   int i;
7330
7331   USE_SAFE_ALLOCA;
7332
7333   if (BUFFERP (coding->src_object)
7334       && coding->src_pos > 0
7335       && coding->src_pos < GPT
7336       && coding->src_pos + coding->src_chars > GPT)
7337     move_gap_both (coding->src_pos, coding->src_pos_byte);
7338
7339   undo_list = Qt;
7340   if (BUFFERP (coding->dst_object))
7341     {
7342       set_buffer_internal (XBUFFER (coding->dst_object));
7343       if (GPT != PT)
7344         move_gap_both (PT, PT_BYTE);
7345
7346       /* We must disable undo_list in order to record the whole insert
7347          transaction via record_insert at the end.  But doing so also
7348          disables the recording of the first change to the undo_list.
7349          Therefore we check for first change here and record it via
7350          record_first_change if needed.  */
7351       if (MODIFF <= SAVE_MODIFF)
7352         record_first_change ();
7353
7354       undo_list = BVAR (current_buffer, undo_list);
7355       bset_undo_list (current_buffer, Qt);
7356     }
7357
7358   coding->consumed = coding->consumed_char = 0;
7359   coding->produced = coding->produced_char = 0;
7360   coding->chars_at_source = 0;
7361   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7362
7363   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7364
7365   attrs = CODING_ID_ATTRS (coding->id);
7366   translation_table = get_translation_table (attrs, 0, NULL);
7367
7368   carryover = 0;
7369   if (coding->decoder == decode_coding_ccl)
7370     {
7371       coding->spec.ccl = &cclspec;
7372       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7373     }
7374   do
7375     {
7376       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7377
7378       coding_set_source (coding);
7379       coding->annotated = 0;
7380       coding->charbuf_used = carryover;
7381       (*(coding->decoder)) (coding);
7382       coding_set_destination (coding);
7383       carryover = produce_chars (coding, translation_table, 0);
7384       if (coding->annotated)
7385         produce_annotation (coding, pos);
7386       for (i = 0; i < carryover; i++)
7387         coding->charbuf[i]
7388           = coding->charbuf[coding->charbuf_used - carryover + i];
7389     }
7390   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7391          || (coding->consumed < coding->src_bytes
7392              && (coding->result == CODING_RESULT_SUCCESS
7393                  || coding->result == CODING_RESULT_INVALID_SRC)));
7394
7395   if (carryover > 0)
7396     {
7397       coding_set_destination (coding);
7398       coding->charbuf_used = carryover;
7399       produce_chars (coding, translation_table, 1);
7400     }
7401
7402   coding->carryover_bytes = 0;
7403   if (coding->consumed < coding->src_bytes)
7404     {
7405       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7406       const unsigned char *src;
7407
7408       coding_set_source (coding);
7409       coding_set_destination (coding);
7410       src = coding->source + coding->consumed;
7411
7412       if (coding->mode & CODING_MODE_LAST_BLOCK)
7413         {
7414           /* Flush out unprocessed data as binary chars.  We are sure
7415              that the number of data is less than the size of
7416              coding->charbuf.  */
7417           coding->charbuf_used = 0;
7418           coding->chars_at_source = 0;
7419
7420           while (nbytes-- > 0)
7421             {
7422               int c;
7423
7424               /* Copy raw bytes in their 2-byte forms from multibyte
7425                  text as single characters.  */
7426               if (coding->src_multibyte
7427                   && CHAR_BYTE8_HEAD_P (*src) && nbytes > 0)
7428                 {
7429                   c = STRING_CHAR_ADVANCE (src);
7430                   nbytes--;
7431                 }
7432               else
7433                 {
7434                   c = *src++;
7435
7436                   if (c & 0x80)
7437                     c = BYTE8_TO_CHAR (c);
7438                 }
7439               coding->charbuf[coding->charbuf_used++] = c;
7440             }
7441           produce_chars (coding, Qnil, 1);
7442         }
7443       else
7444         {
7445           /* Record unprocessed bytes in coding->carryover.  We are
7446              sure that the number of data is less than the size of
7447              coding->carryover.  */
7448           unsigned char *p = coding->carryover;
7449
7450           if (nbytes > sizeof coding->carryover)
7451             nbytes = sizeof coding->carryover;
7452           coding->carryover_bytes = nbytes;
7453           while (nbytes-- > 0)
7454             *p++ = *src++;
7455         }
7456       coding->consumed = coding->src_bytes;
7457     }
7458
7459   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7460       && !inhibit_eol_conversion)
7461     decode_eol (coding);
7462   if (BUFFERP (coding->dst_object))
7463     {
7464       bset_undo_list (current_buffer, undo_list);
7465       record_insert (coding->dst_pos, coding->produced_char);
7466     }
7467
7468   SAFE_FREE ();
7469 }
7470
7471
7472 /* Extract an annotation datum from a composition starting at POS and
7473    ending before LIMIT of CODING->src_object (buffer or string), store
7474    the data in BUF, set *STOP to a starting position of the next
7475    composition (if any) or to LIMIT, and return the address of the
7476    next element of BUF.
7477
7478    If such an annotation is not found, set *STOP to a starting
7479    position of a composition after POS (if any) or to LIMIT, and
7480    return BUF.  */
7481
7482 static int *
7483 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7484                                struct coding_system *coding, int *buf,
7485                                ptrdiff_t *stop)
7486 {
7487   ptrdiff_t start, end;
7488   Lisp_Object prop;
7489
7490   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7491       || end > limit)
7492     *stop = limit;
7493   else if (start > pos)
7494     *stop = start;
7495   else
7496     {
7497       if (start == pos)
7498         {
7499           /* We found a composition.  Store the corresponding
7500              annotation data in BUF.  */
7501           int *head = buf;
7502           enum composition_method method = composition_method (prop);
7503           int nchars = COMPOSITION_LENGTH (prop);
7504
7505           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7506           if (method != COMPOSITION_RELATIVE)
7507             {
7508               Lisp_Object components;
7509               ptrdiff_t i, len, i_byte;
7510
7511               components = COMPOSITION_COMPONENTS (prop);
7512               if (VECTORP (components))
7513                 {
7514                   len = ASIZE (components);
7515                   for (i = 0; i < len; i++)
7516                     *buf++ = XINT (AREF (components, i));
7517                 }
7518               else if (STRINGP (components))
7519                 {
7520                   len = SCHARS (components);
7521                   i = i_byte = 0;
7522                   while (i < len)
7523                     {
7524                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7525                       buf++;
7526                     }
7527                 }
7528               else if (INTEGERP (components))
7529                 {
7530                   len = 1;
7531                   *buf++ = XINT (components);
7532                 }
7533               else if (CONSP (components))
7534                 {
7535                   for (len = 0; CONSP (components);
7536                        len++, components = XCDR (components))
7537                     *buf++ = XINT (XCAR (components));
7538                 }
7539               else
7540                 emacs_abort ();
7541               *head -= len;
7542             }
7543         }
7544
7545       if (find_composition (end, limit, &start, &end, &prop,
7546                             coding->src_object)
7547           && end <= limit)
7548         *stop = start;
7549       else
7550         *stop = limit;
7551     }
7552   return buf;
7553 }
7554
7555
7556 /* Extract an annotation datum from a text property `charset' at POS of
7557    CODING->src_object (buffer of string), store the data in BUF, set
7558    *STOP to the position where the value of `charset' property changes
7559    (limiting by LIMIT), and return the address of the next element of
7560    BUF.
7561
7562    If the property value is nil, set *STOP to the position where the
7563    property value is non-nil (limiting by LIMIT), and return BUF.  */
7564
7565 static int *
7566 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7567                            struct coding_system *coding, int *buf,
7568                            ptrdiff_t *stop)
7569 {
7570   Lisp_Object val, next;
7571   int id;
7572
7573   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7574   if (! NILP (val) && CHARSETP (val))
7575     id = XINT (CHARSET_SYMBOL_ID (val));
7576   else
7577     id = -1;
7578   ADD_CHARSET_DATA (buf, 0, id);
7579   next = Fnext_single_property_change (make_number (pos), Qcharset,
7580                                        coding->src_object,
7581                                        make_number (limit));
7582   *stop = XINT (next);
7583   return buf;
7584 }
7585
7586
7587 static void
7588 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7589                int max_lookup)
7590 {
7591   int *buf = coding->charbuf;
7592   int *buf_end = coding->charbuf + coding->charbuf_size;
7593   const unsigned char *src = coding->source + coding->consumed;
7594   const unsigned char *src_end = coding->source + coding->src_bytes;
7595   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7596   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7597   bool multibytep = coding->src_multibyte;
7598   Lisp_Object eol_type;
7599   int c;
7600   ptrdiff_t stop, stop_composition, stop_charset;
7601   int *lookup_buf = NULL;
7602
7603   if (! NILP (translation_table))
7604     lookup_buf = alloca (sizeof (int) * max_lookup);
7605
7606   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7607   if (VECTORP (eol_type))
7608     eol_type = Qunix;
7609
7610   /* Note: composition handling is not yet implemented.  */
7611   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7612
7613   if (NILP (coding->src_object))
7614     stop = stop_composition = stop_charset = end_pos;
7615   else
7616     {
7617       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7618         stop = stop_composition = pos;
7619       else
7620         stop = stop_composition = end_pos;
7621       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7622         stop = stop_charset = pos;
7623       else
7624         stop_charset = end_pos;
7625     }
7626
7627   /* Compensate for CRLF and conversion.  */
7628   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7629   while (buf < buf_end)
7630     {
7631       Lisp_Object trans;
7632
7633       if (pos == stop)
7634         {
7635           if (pos == end_pos)
7636             break;
7637           if (pos == stop_composition)
7638             buf = handle_composition_annotation (pos, end_pos, coding,
7639                                                  buf, &stop_composition);
7640           if (pos == stop_charset)
7641             buf = handle_charset_annotation (pos, end_pos, coding,
7642                                              buf, &stop_charset);
7643           stop = (stop_composition < stop_charset
7644                   ? stop_composition : stop_charset);
7645         }
7646
7647       if (! multibytep)
7648         {
7649           int bytes;
7650
7651           if (coding->encoder == encode_coding_raw_text
7652               || coding->encoder == encode_coding_ccl)
7653             c = *src++, pos++;
7654           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7655             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7656           else
7657             c = BYTE8_TO_CHAR (*src), src++, pos++;
7658         }
7659       else
7660         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7661       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7662         c = '\n';
7663       if (! EQ (eol_type, Qunix))
7664         {
7665           if (c == '\n')
7666             {
7667               if (EQ (eol_type, Qdos))
7668                 *buf++ = '\r';
7669               else
7670                 c = '\r';
7671             }
7672         }
7673
7674       trans = Qnil;
7675       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7676       if (NILP (trans))
7677         *buf++ = c;
7678       else
7679         {
7680           ptrdiff_t from_nchars = 1, to_nchars = 1;
7681           int *lookup_buf_end;
7682           const unsigned char *p = src;
7683           int i;
7684
7685           lookup_buf[0] = c;
7686           for (i = 1; i < max_lookup && p < src_end; i++)
7687             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7688           lookup_buf_end = lookup_buf + i;
7689           trans = get_translation (trans, lookup_buf, lookup_buf_end,
7690                                    &from_nchars);
7691           if (INTEGERP (trans))
7692             c = XINT (trans);
7693           else if (VECTORP (trans))
7694             {
7695               to_nchars = ASIZE (trans);
7696               if (buf_end - buf < to_nchars)
7697                 break;
7698               c = XINT (AREF (trans, 0));
7699             }
7700           else
7701             break;
7702           *buf++ = c;
7703           for (i = 1; i < to_nchars; i++)
7704             *buf++ = XINT (AREF (trans, i));
7705           for (i = 1; i < from_nchars; i++, pos++)
7706             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7707         }
7708     }
7709
7710   coding->consumed = src - coding->source;
7711   coding->consumed_char = pos - coding->src_pos;
7712   coding->charbuf_used = buf - coding->charbuf;
7713   coding->chars_at_source = 0;
7714 }
7715
7716
7717 /* Encode the text at CODING->src_object into CODING->dst_object.
7718    CODING->src_object is a buffer or a string.
7719    CODING->dst_object is a buffer or nil.
7720
7721    If CODING->src_object is a buffer, it must be the current buffer.
7722    In this case, if CODING->src_pos is positive, it is a position of
7723    the source text in the buffer, otherwise. the source text is in the
7724    gap area of the buffer, and coding->src_pos specifies the offset of
7725    the text from GPT (which must be the same as PT).  If this is the
7726    same buffer as CODING->dst_object, CODING->src_pos must be
7727    negative and CODING should not have `pre-write-conversion'.
7728
7729    If CODING->src_object is a string, CODING should not have
7730    `pre-write-conversion'.
7731
7732    If CODING->dst_object is a buffer, the encoded data is inserted at
7733    the current point of that buffer.
7734
7735    If CODING->dst_object is nil, the encoded data is placed at the
7736    memory area specified by CODING->destination.  */
7737
7738 static void
7739 encode_coding (struct coding_system *coding)
7740 {
7741   Lisp_Object attrs;
7742   Lisp_Object translation_table;
7743   int max_lookup;
7744   struct ccl_spec cclspec;
7745
7746   USE_SAFE_ALLOCA;
7747
7748   attrs = CODING_ID_ATTRS (coding->id);
7749   if (coding->encoder == encode_coding_raw_text)
7750     translation_table = Qnil, max_lookup = 0;
7751   else
7752     translation_table = get_translation_table (attrs, 1, &max_lookup);
7753
7754   if (BUFFERP (coding->dst_object))
7755     {
7756       set_buffer_internal (XBUFFER (coding->dst_object));
7757       coding->dst_multibyte
7758         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7759     }
7760
7761   coding->consumed = coding->consumed_char = 0;
7762   coding->produced = coding->produced_char = 0;
7763   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7764
7765   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7766
7767   if (coding->encoder == encode_coding_ccl)
7768     {
7769       coding->spec.ccl = &cclspec;
7770       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7771     }
7772   do {
7773     coding_set_source (coding);
7774     consume_chars (coding, translation_table, max_lookup);
7775     coding_set_destination (coding);
7776     (*(coding->encoder)) (coding);
7777   } while (coding->consumed_char < coding->src_chars);
7778
7779   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7780     insert_from_gap (coding->produced_char, coding->produced, 0);
7781
7782   SAFE_FREE ();
7783 }
7784
7785
7786 /* Name (or base name) of work buffer for code conversion.  */
7787 static Lisp_Object Vcode_conversion_workbuf_name;
7788
7789 /* A working buffer used by the top level conversion.  Once it is
7790    created, it is never destroyed.  It has the name
7791    Vcode_conversion_workbuf_name.  The other working buffers are
7792    destroyed after the use is finished, and their names are modified
7793    versions of Vcode_conversion_workbuf_name.  */
7794 static Lisp_Object Vcode_conversion_reused_workbuf;
7795
7796 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7797 static bool reused_workbuf_in_use;
7798
7799
7800 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7801    multibyteness of returning buffer.  */
7802
7803 static Lisp_Object
7804 make_conversion_work_buffer (bool multibyte)
7805 {
7806   Lisp_Object name, workbuf;
7807   struct buffer *current;
7808
7809   if (reused_workbuf_in_use)
7810     {
7811       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7812       workbuf = Fget_buffer_create (name);
7813     }
7814   else
7815     {
7816       reused_workbuf_in_use = 1;
7817       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7818         Vcode_conversion_reused_workbuf
7819           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7820       workbuf = Vcode_conversion_reused_workbuf;
7821     }
7822   current = current_buffer;
7823   set_buffer_internal (XBUFFER (workbuf));
7824   /* We can't allow modification hooks to run in the work buffer.  For
7825      instance, directory_files_internal assumes that file decoding
7826      doesn't compile new regexps.  */
7827   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7828   Ferase_buffer ();
7829   bset_undo_list (current_buffer, Qt);
7830   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7831   set_buffer_internal (current);
7832   return workbuf;
7833 }
7834
7835
7836 static void
7837 code_conversion_restore (Lisp_Object arg)
7838 {
7839   Lisp_Object current, workbuf;
7840
7841   current = XCAR (arg);
7842   workbuf = XCDR (arg);
7843   if (! NILP (workbuf))
7844     {
7845       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7846         reused_workbuf_in_use = 0;
7847       else
7848         Fkill_buffer (workbuf);
7849     }
7850   set_buffer_internal (XBUFFER (current));
7851 }
7852
7853 Lisp_Object
7854 code_conversion_save (bool with_work_buf, bool multibyte)
7855 {
7856   Lisp_Object workbuf = Qnil;
7857
7858   if (with_work_buf)
7859     workbuf = make_conversion_work_buffer (multibyte);
7860   record_unwind_protect (code_conversion_restore,
7861                          Fcons (Fcurrent_buffer (), workbuf));
7862   return workbuf;
7863 }
7864
7865 static void
7866 coding_restore_undo_list (Lisp_Object arg)
7867 {
7868   Lisp_Object undo_list = XCAR (arg);
7869   struct buffer *buf = XBUFFER (XCDR (arg));
7870
7871   bset_undo_list (buf, undo_list);
7872 }
7873
7874 void
7875 decode_coding_gap (struct coding_system *coding,
7876                    ptrdiff_t chars, ptrdiff_t bytes)
7877 {
7878   ptrdiff_t count = SPECPDL_INDEX ();
7879   Lisp_Object attrs;
7880
7881   coding->src_object = Fcurrent_buffer ();
7882   coding->src_chars = chars;
7883   coding->src_bytes = bytes;
7884   coding->src_pos = -chars;
7885   coding->src_pos_byte = -bytes;
7886   coding->src_multibyte = chars < bytes;
7887   coding->dst_object = coding->src_object;
7888   coding->dst_pos = PT;
7889   coding->dst_pos_byte = PT_BYTE;
7890   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7891
7892   coding->head_ascii = -1;
7893   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7894   coding->eol_seen = EOL_SEEN_NONE;
7895   if (CODING_REQUIRE_DETECTION (coding))
7896     detect_coding (coding);
7897   attrs = CODING_ID_ATTRS (coding->id);
7898   if (! disable_ascii_optimization
7899       && ! coding->src_multibyte
7900       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7901       && NILP (CODING_ATTR_POST_READ (attrs))
7902       && NILP (get_translation_table (attrs, 0, NULL)))
7903     {
7904       chars = coding->head_ascii;
7905       if (chars < 0)
7906         chars = check_ascii (coding);
7907       if (chars != bytes)
7908         {
7909           /* There exists a non-ASCII byte.  */
7910           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7911               && coding->detected_utf8_bytes == coding->src_bytes)
7912             {
7913               if (coding->detected_utf8_chars >= 0)
7914                 chars = coding->detected_utf8_chars;
7915               else
7916                 chars = check_utf_8 (coding);
7917               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7918                   && coding->head_ascii == 0
7919                   && coding->source[0] == UTF_8_BOM_1
7920                   && coding->source[1] == UTF_8_BOM_2
7921                   && coding->source[2] == UTF_8_BOM_3)
7922                 {
7923                   chars--;
7924                   bytes -= 3;
7925                   coding->src_bytes -= 3;
7926                 }
7927             }
7928           else
7929             chars = -1;
7930         }
7931       if (chars >= 0)
7932         {
7933           Lisp_Object eol_type;
7934
7935           eol_type = CODING_ID_EOL_TYPE (coding->id);
7936           if (VECTORP (eol_type))
7937             {
7938               if (coding->eol_seen != EOL_SEEN_NONE)
7939                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7940             }
7941           if (EQ (eol_type, Qmac))
7942             {
7943               unsigned char *src_end = GAP_END_ADDR;
7944               unsigned char *src = src_end - coding->src_bytes;
7945
7946               while (src < src_end)
7947                 {
7948                   if (*src++ == '\r')
7949                     src[-1] = '\n';
7950                 }
7951             }
7952           else if (EQ (eol_type, Qdos))
7953             {
7954               unsigned char *src = GAP_END_ADDR;
7955               unsigned char *src_beg = src - coding->src_bytes;
7956               unsigned char *dst = src;
7957               ptrdiff_t diff;
7958
7959               while (src_beg < src)
7960                 {
7961                   *--dst = *--src;
7962                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7963                     src--;
7964                 }
7965               diff = dst - src;
7966               bytes -= diff;
7967               chars -= diff;
7968             }
7969           coding->produced = bytes;
7970           coding->produced_char = chars;
7971           insert_from_gap (chars, bytes, 1);
7972           return;
7973         }
7974     }
7975   code_conversion_save (0, 0);
7976
7977   coding->mode |= CODING_MODE_LAST_BLOCK;
7978   current_buffer->text->inhibit_shrinking = 1;
7979   decode_coding (coding);
7980   current_buffer->text->inhibit_shrinking = 0;
7981
7982   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7983     {
7984       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7985       Lisp_Object val;
7986       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
7987       ptrdiff_t count1 = SPECPDL_INDEX ();
7988
7989       record_unwind_protect (coding_restore_undo_list,
7990                              Fcons (undo_list, Fcurrent_buffer ()));
7991       bset_undo_list (current_buffer, Qt);
7992       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7993       val = call1 (CODING_ATTR_POST_READ (attrs),
7994                    make_number (coding->produced_char));
7995       CHECK_NATNUM (val);
7996       coding->produced_char += Z - prev_Z;
7997       coding->produced += Z_BYTE - prev_Z_BYTE;
7998       unbind_to (count1, Qnil);
7999     }
8000
8001   unbind_to (count, Qnil);
8002 }
8003
8004
8005 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8006    SRC_OBJECT into DST_OBJECT by coding context CODING.
8007
8008    SRC_OBJECT is a buffer, a string, or Qnil.
8009
8010    If it is a buffer, the text is at point of the buffer.  FROM and TO
8011    are positions in the buffer.
8012
8013    If it is a string, the text is at the beginning of the string.
8014    FROM and TO are indices to the string.
8015
8016    If it is nil, the text is at coding->source.  FROM and TO are
8017    indices to coding->source.
8018
8019    DST_OBJECT is a buffer, Qt, or Qnil.
8020
8021    If it is a buffer, the decoded text is inserted at point of the
8022    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8023    is deleted.
8024
8025    If it is Qt, a string is made from the decoded text, and
8026    set in CODING->dst_object.
8027
8028    If it is Qnil, the decoded text is stored at CODING->destination.
8029    The caller must allocate CODING->dst_bytes bytes at
8030    CODING->destination by xmalloc.  If the decoded text is longer than
8031    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8032  */
8033
8034 void
8035 decode_coding_object (struct coding_system *coding,
8036                       Lisp_Object src_object,
8037                       ptrdiff_t from, ptrdiff_t from_byte,
8038                       ptrdiff_t to, ptrdiff_t to_byte,
8039                       Lisp_Object dst_object)
8040 {
8041   ptrdiff_t count = SPECPDL_INDEX ();
8042   unsigned char *destination UNINIT;
8043   ptrdiff_t dst_bytes UNINIT;
8044   ptrdiff_t chars = to - from;
8045   ptrdiff_t bytes = to_byte - from_byte;
8046   Lisp_Object attrs;
8047   ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
8048   bool need_marker_adjustment = 0;
8049   Lisp_Object old_deactivate_mark;
8050
8051   old_deactivate_mark = Vdeactivate_mark;
8052
8053   if (NILP (dst_object))
8054     {
8055       destination = coding->destination;
8056       dst_bytes = coding->dst_bytes;
8057     }
8058
8059   coding->src_object = src_object;
8060   coding->src_chars = chars;
8061   coding->src_bytes = bytes;
8062   coding->src_multibyte = chars < bytes;
8063
8064   if (STRINGP (src_object))
8065     {
8066       coding->src_pos = from;
8067       coding->src_pos_byte = from_byte;
8068     }
8069   else if (BUFFERP (src_object))
8070     {
8071       set_buffer_internal (XBUFFER (src_object));
8072       if (from != GPT)
8073         move_gap_both (from, from_byte);
8074       if (EQ (src_object, dst_object))
8075         {
8076           struct Lisp_Marker *tail;
8077
8078           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8079             {
8080               tail->need_adjustment
8081                 = tail->charpos == (tail->insertion_type ? from : to);
8082               need_marker_adjustment |= tail->need_adjustment;
8083             }
8084           saved_pt = PT, saved_pt_byte = PT_BYTE;
8085           TEMP_SET_PT_BOTH (from, from_byte);
8086           current_buffer->text->inhibit_shrinking = 1;
8087           del_range_both (from, from_byte, to, to_byte, 1);
8088           coding->src_pos = -chars;
8089           coding->src_pos_byte = -bytes;
8090         }
8091       else
8092         {
8093           coding->src_pos = from;
8094           coding->src_pos_byte = from_byte;
8095         }
8096     }
8097
8098   if (CODING_REQUIRE_DETECTION (coding))
8099     detect_coding (coding);
8100   attrs = CODING_ID_ATTRS (coding->id);
8101
8102   if (EQ (dst_object, Qt)
8103       || (! NILP (CODING_ATTR_POST_READ (attrs))
8104           && NILP (dst_object)))
8105     {
8106       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8107       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8108       coding->dst_pos = BEG;
8109       coding->dst_pos_byte = BEG_BYTE;
8110     }
8111   else if (BUFFERP (dst_object))
8112     {
8113       code_conversion_save (0, 0);
8114       coding->dst_object = dst_object;
8115       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8116       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8117       coding->dst_multibyte
8118         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8119     }
8120   else
8121     {
8122       code_conversion_save (0, 0);
8123       coding->dst_object = Qnil;
8124       /* Most callers presume this will return a multibyte result, and they
8125          won't use `binary' or `raw-text' anyway, so let's not worry about
8126          CODING_FOR_UNIBYTE.  */
8127       coding->dst_multibyte = 1;
8128     }
8129
8130   decode_coding (coding);
8131
8132   if (BUFFERP (coding->dst_object))
8133     set_buffer_internal (XBUFFER (coding->dst_object));
8134
8135   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8136     {
8137       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8138       Lisp_Object val;
8139       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
8140       ptrdiff_t count1 = SPECPDL_INDEX ();
8141
8142       record_unwind_protect (coding_restore_undo_list,
8143                              Fcons (undo_list, Fcurrent_buffer ()));
8144       bset_undo_list (current_buffer, Qt);
8145       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8146       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8147                         make_number (coding->produced_char));
8148       CHECK_NATNUM (val);
8149       coding->produced_char += Z - prev_Z;
8150       coding->produced += Z_BYTE - prev_Z_BYTE;
8151       unbind_to (count1, Qnil);
8152     }
8153
8154   if (EQ (dst_object, Qt))
8155     {
8156       coding->dst_object = Fbuffer_string ();
8157     }
8158   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8159     {
8160       set_buffer_internal (XBUFFER (coding->dst_object));
8161       if (dst_bytes < coding->produced)
8162         {
8163           eassert (coding->produced > 0);
8164           destination = xrealloc (destination, coding->produced);
8165           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8166             move_gap_both (BEGV, BEGV_BYTE);
8167           memcpy (destination, BEGV_ADDR, coding->produced);
8168           coding->destination = destination;
8169         }
8170     }
8171
8172   if (saved_pt >= 0)
8173     {
8174       /* This is the case of:
8175          (BUFFERP (src_object) && EQ (src_object, dst_object))
8176          As we have moved PT while replacing the original buffer
8177          contents, we must recover it now.  */
8178       set_buffer_internal (XBUFFER (src_object));
8179       current_buffer->text->inhibit_shrinking = 0;
8180       if (saved_pt < from)
8181         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8182       else if (saved_pt < from + chars)
8183         TEMP_SET_PT_BOTH (from, from_byte);
8184       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8185         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8186                           saved_pt_byte + (coding->produced - bytes));
8187       else
8188         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8189                           saved_pt_byte + (coding->produced - bytes));
8190
8191       if (need_marker_adjustment)
8192         {
8193           struct Lisp_Marker *tail;
8194
8195           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8196             if (tail->need_adjustment)
8197               {
8198                 tail->need_adjustment = 0;
8199                 if (tail->insertion_type)
8200                   {
8201                     tail->bytepos = from_byte;
8202                     tail->charpos = from;
8203                   }
8204                 else
8205                   {
8206                     tail->bytepos = from_byte + coding->produced;
8207                     tail->charpos
8208                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8209                          ? tail->bytepos : from + coding->produced_char);
8210                   }
8211               }
8212         }
8213     }
8214
8215   Vdeactivate_mark = old_deactivate_mark;
8216   unbind_to (count, coding->dst_object);
8217 }
8218
8219
8220 void
8221 encode_coding_object (struct coding_system *coding,
8222                       Lisp_Object src_object,
8223                       ptrdiff_t from, ptrdiff_t from_byte,
8224                       ptrdiff_t to, ptrdiff_t to_byte,
8225                       Lisp_Object dst_object)
8226 {
8227   ptrdiff_t count = SPECPDL_INDEX ();
8228   ptrdiff_t chars = to - from;
8229   ptrdiff_t bytes = to_byte - from_byte;
8230   Lisp_Object attrs;
8231   ptrdiff_t saved_pt = -1, saved_pt_byte;
8232   bool need_marker_adjustment = 0;
8233   bool kill_src_buffer = 0;
8234   Lisp_Object old_deactivate_mark;
8235
8236   old_deactivate_mark = Vdeactivate_mark;
8237
8238   coding->src_object = src_object;
8239   coding->src_chars = chars;
8240   coding->src_bytes = bytes;
8241   coding->src_multibyte = chars < bytes;
8242
8243   attrs = CODING_ID_ATTRS (coding->id);
8244
8245   if (EQ (src_object, dst_object))
8246     {
8247       struct Lisp_Marker *tail;
8248
8249       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8250         {
8251           tail->need_adjustment
8252             = tail->charpos == (tail->insertion_type ? from : to);
8253           need_marker_adjustment |= tail->need_adjustment;
8254         }
8255     }
8256
8257   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8258     {
8259       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8260       set_buffer_internal (XBUFFER (coding->src_object));
8261       if (STRINGP (src_object))
8262         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8263       else if (BUFFERP (src_object))
8264         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8265       else
8266         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8267
8268       if (EQ (src_object, dst_object))
8269         {
8270           set_buffer_internal (XBUFFER (src_object));
8271           saved_pt = PT, saved_pt_byte = PT_BYTE;
8272           del_range_both (from, from_byte, to, to_byte, 1);
8273           set_buffer_internal (XBUFFER (coding->src_object));
8274         }
8275
8276       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8277                   make_number (BEG), make_number (Z));
8278       if (XBUFFER (coding->src_object) != current_buffer)
8279         kill_src_buffer = 1;
8280       coding->src_object = Fcurrent_buffer ();
8281       if (BEG != GPT)
8282         move_gap_both (BEG, BEG_BYTE);
8283       coding->src_chars = Z - BEG;
8284       coding->src_bytes = Z_BYTE - BEG_BYTE;
8285       coding->src_pos = BEG;
8286       coding->src_pos_byte = BEG_BYTE;
8287       coding->src_multibyte = Z < Z_BYTE;
8288     }
8289   else if (STRINGP (src_object))
8290     {
8291       code_conversion_save (0, 0);
8292       coding->src_pos = from;
8293       coding->src_pos_byte = from_byte;
8294     }
8295   else if (BUFFERP (src_object))
8296     {
8297       code_conversion_save (0, 0);
8298       set_buffer_internal (XBUFFER (src_object));
8299       if (EQ (src_object, dst_object))
8300         {
8301           saved_pt = PT, saved_pt_byte = PT_BYTE;
8302           coding->src_object = del_range_1 (from, to, 1, 1);
8303           coding->src_pos = 0;
8304           coding->src_pos_byte = 0;
8305         }
8306       else
8307         {
8308           if (from < GPT && to >= GPT)
8309             move_gap_both (from, from_byte);
8310           coding->src_pos = from;
8311           coding->src_pos_byte = from_byte;
8312         }
8313     }
8314   else
8315     {
8316       code_conversion_save (0, 0);
8317       coding->src_pos = from;
8318       coding->src_pos_byte = from_byte;
8319     }
8320
8321   if (BUFFERP (dst_object))
8322     {
8323       coding->dst_object = dst_object;
8324       if (EQ (src_object, dst_object))
8325         {
8326           coding->dst_pos = from;
8327           coding->dst_pos_byte = from_byte;
8328         }
8329       else
8330         {
8331           struct buffer *current = current_buffer;
8332
8333           set_buffer_temp (XBUFFER (dst_object));
8334           coding->dst_pos = PT;
8335           coding->dst_pos_byte = PT_BYTE;
8336           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8337           set_buffer_temp (current);
8338         }
8339       coding->dst_multibyte
8340         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8341     }
8342   else if (EQ (dst_object, Qt))
8343     {
8344       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8345       coding->dst_object = Qnil;
8346       coding->destination = xmalloc (dst_bytes);
8347       coding->dst_bytes = dst_bytes;
8348       coding->dst_multibyte = 0;
8349     }
8350   else
8351     {
8352       coding->dst_object = Qnil;
8353       coding->dst_multibyte = 0;
8354     }
8355
8356   encode_coding (coding);
8357
8358   if (EQ (dst_object, Qt))
8359     {
8360       if (BUFFERP (coding->dst_object))
8361         coding->dst_object = Fbuffer_string ();
8362       else if (coding->raw_destination)
8363         /* This is used to avoid creating huge Lisp string.
8364            NOTE: caller who sets `raw_destination' is also
8365            responsible for freeing `destination' buffer.  */
8366         coding->dst_object = Qnil;
8367       else
8368         {
8369           coding->dst_object
8370             = make_unibyte_string ((char *) coding->destination,
8371                                    coding->produced);
8372           xfree (coding->destination);
8373         }
8374     }
8375
8376   if (saved_pt >= 0)
8377     {
8378       /* This is the case of:
8379          (BUFFERP (src_object) && EQ (src_object, dst_object))
8380          As we have moved PT while replacing the original buffer
8381          contents, we must recover it now.  */
8382       set_buffer_internal (XBUFFER (src_object));
8383       if (saved_pt < from)
8384         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8385       else if (saved_pt < from + chars)
8386         TEMP_SET_PT_BOTH (from, from_byte);
8387       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8388         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8389                           saved_pt_byte + (coding->produced - bytes));
8390       else
8391         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8392                           saved_pt_byte + (coding->produced - bytes));
8393
8394       if (need_marker_adjustment)
8395         {
8396           struct Lisp_Marker *tail;
8397
8398           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8399             if (tail->need_adjustment)
8400               {
8401                 tail->need_adjustment = 0;
8402                 if (tail->insertion_type)
8403                   {
8404                     tail->bytepos = from_byte;
8405                     tail->charpos = from;
8406                   }
8407                 else
8408                   {
8409                     tail->bytepos = from_byte + coding->produced;
8410                     tail->charpos
8411                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8412                          ? tail->bytepos : from + coding->produced_char);
8413                   }
8414               }
8415         }
8416     }
8417
8418   if (kill_src_buffer)
8419     Fkill_buffer (coding->src_object);
8420
8421   Vdeactivate_mark = old_deactivate_mark;
8422   unbind_to (count, Qnil);
8423 }
8424
8425
8426 Lisp_Object
8427 preferred_coding_system (void)
8428 {
8429   int id = coding_categories[coding_priorities[0]].id;
8430
8431   return CODING_ID_NAME (id);
8432 }
8433
8434 #if defined (WINDOWSNT) || defined (CYGWIN)
8435
8436 Lisp_Object
8437 from_unicode (Lisp_Object str)
8438 {
8439   CHECK_STRING (str);
8440   if (!STRING_MULTIBYTE (str) &&
8441       SBYTES (str) & 1)
8442     {
8443       str = Fsubstring (str, make_number (0), make_number (-1));
8444     }
8445
8446   return code_convert_string_norecord (str, Qutf_16le, 0);
8447 }
8448
8449 Lisp_Object
8450 from_unicode_buffer (const wchar_t *wstr)
8451 {
8452   /* We get one of the two final null bytes for free.  */
8453   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
8454   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
8455   return from_unicode (str);
8456 }
8457
8458 wchar_t *
8459 to_unicode (Lisp_Object str, Lisp_Object *buf)
8460 {
8461   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8462   /* We need to make another copy (in addition to the one made by
8463      code_convert_string_norecord) to ensure that the final string is
8464      _doubly_ zero terminated --- that is, that the string is
8465      terminated by two zero bytes and one utf-16le null character.
8466      Because strings are already terminated with a single zero byte,
8467      we just add one additional zero. */
8468   str = make_uninit_string (SBYTES (*buf) + 1);
8469   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8470   SDATA (str) [SBYTES (*buf)] = '\0';
8471   *buf = str;
8472   return WCSDATA (*buf);
8473 }
8474
8475 #endif /* WINDOWSNT || CYGWIN */
8476
8477 \f
8478 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8480
8481 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8482        doc: /* Return t if OBJECT is nil or a coding-system.
8483 See the documentation of `define-coding-system' for information
8484 about coding-system objects.  */)
8485   (Lisp_Object object)
8486 {
8487   if (NILP (object)
8488       || CODING_SYSTEM_ID (object) >= 0)
8489     return Qt;
8490   if (! SYMBOLP (object)
8491       || NILP (Fget (object, Qcoding_system_define_form)))
8492     return Qnil;
8493   return Qt;
8494 }
8495
8496 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8497        Sread_non_nil_coding_system, 1, 1, 0,
8498        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8499   (Lisp_Object prompt)
8500 {
8501   Lisp_Object val;
8502   do
8503     {
8504       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8505                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8506     }
8507   while (SCHARS (val) == 0);
8508   return (Fintern (val, Qnil));
8509 }
8510
8511 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8512        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8513 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8514 Ignores case when completing coding systems (all Emacs coding systems
8515 are lower-case).  */)
8516   (Lisp_Object prompt, Lisp_Object default_coding_system)
8517 {
8518   Lisp_Object val;
8519   ptrdiff_t count = SPECPDL_INDEX ();
8520
8521   if (SYMBOLP (default_coding_system))
8522     default_coding_system = SYMBOL_NAME (default_coding_system);
8523   specbind (Qcompletion_ignore_case, Qt);
8524   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8525                           Qt, Qnil, Qcoding_system_history,
8526                           default_coding_system, Qnil);
8527   unbind_to (count, Qnil);
8528   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8529 }
8530
8531 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8532        1, 1, 0,
8533        doc: /* Check validity of CODING-SYSTEM.
8534 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8535 It is valid if it is nil or a symbol defined as a coding system by the
8536 function `define-coding-system'.  */)
8537   (Lisp_Object coding_system)
8538 {
8539   Lisp_Object define_form;
8540
8541   define_form = Fget (coding_system, Qcoding_system_define_form);
8542   if (! NILP (define_form))
8543     {
8544       Fput (coding_system, Qcoding_system_define_form, Qnil);
8545       safe_eval (define_form);
8546     }
8547   if (!NILP (Fcoding_system_p (coding_system)))
8548     return coding_system;
8549   xsignal1 (Qcoding_system_error, coding_system);
8550 }
8551
8552 \f
8553 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8554    HIGHEST, return the coding system of the highest
8555    priority among the detected coding systems.  Otherwise return a
8556    list of detected coding systems sorted by their priorities.  If
8557    MULTIBYTEP, it is assumed that the bytes are in correct
8558    multibyte form but contains only ASCII and eight-bit chars.
8559    Otherwise, the bytes are raw bytes.
8560
8561    CODING-SYSTEM controls the detection as below:
8562
8563    If it is nil, detect both text-format and eol-format.  If the
8564    text-format part of CODING-SYSTEM is already specified
8565    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8566    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8567    detect only text-format.  */
8568
8569 Lisp_Object
8570 detect_coding_system (const unsigned char *src,
8571                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8572                       bool highest, bool multibytep,
8573                       Lisp_Object coding_system)
8574 {
8575   const unsigned char *src_end = src + src_bytes;
8576   Lisp_Object attrs, eol_type;
8577   Lisp_Object val = Qnil;
8578   struct coding_system coding;
8579   ptrdiff_t id;
8580   struct coding_detection_info detect_info;
8581   enum coding_category base_category;
8582   bool null_byte_found = 0, eight_bit_found = 0;
8583
8584   if (NILP (coding_system))
8585     coding_system = Qundecided;
8586   setup_coding_system (coding_system, &coding);
8587   attrs = CODING_ID_ATTRS (coding.id);
8588   eol_type = CODING_ID_EOL_TYPE (coding.id);
8589   coding_system = CODING_ATTR_BASE_NAME (attrs);
8590
8591   coding.source = src;
8592   coding.src_chars = src_chars;
8593   coding.src_bytes = src_bytes;
8594   coding.src_multibyte = multibytep;
8595   coding.consumed = 0;
8596   coding.mode |= CODING_MODE_LAST_BLOCK;
8597   coding.head_ascii = 0;
8598
8599   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8600
8601   /* At first, detect text-format if necessary.  */
8602   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8603   if (base_category == coding_category_undecided)
8604     {
8605       enum coding_category category UNINIT;
8606       struct coding_system *this UNINIT;
8607       int c, i;
8608       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8609                                        inhibit_null_byte_detection);
8610       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8611                                        inhibit_iso_escape_detection);
8612       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8613
8614       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8615       for (; src < src_end; src++)
8616         {
8617           c = *src;
8618           if (c & 0x80)
8619             {
8620               eight_bit_found = 1;
8621               if (null_byte_found)
8622                 break;
8623             }
8624           else if (c < 0x20)
8625             {
8626               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8627                   && ! inhibit_ied
8628                   && ! detect_info.checked)
8629                 {
8630                   if (detect_coding_iso_2022 (&coding, &detect_info))
8631                     {
8632                       /* We have scanned the whole data.  */
8633                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8634                         {
8635                           /* We didn't find an 8-bit code.  We may
8636                              have found a null-byte, but it's very
8637                              rare that a binary file confirm to
8638                              ISO-2022.  */
8639                           src = src_end;
8640                           coding.head_ascii = src - coding.source;
8641                         }
8642                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8643                       break;
8644                     }
8645                 }
8646               else if (! c && !inhibit_nbd)
8647                 {
8648                   null_byte_found = 1;
8649                   if (eight_bit_found)
8650                     break;
8651                 }
8652               if (! eight_bit_found)
8653                 coding.head_ascii++;
8654             }
8655           else if (! eight_bit_found)
8656             coding.head_ascii++;
8657         }
8658
8659       if (null_byte_found || eight_bit_found
8660           || coding.head_ascii < coding.src_bytes
8661           || detect_info.found)
8662         {
8663           if (coding.head_ascii == coding.src_bytes)
8664             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8665             for (i = 0; i < coding_category_raw_text; i++)
8666               {
8667                 category = coding_priorities[i];
8668                 this = coding_categories + category;
8669                 if (detect_info.found & (1 << category))
8670                   break;
8671               }
8672           else
8673             {
8674               if (null_byte_found)
8675                 {
8676                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8677                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8678                 }
8679               else if (prefer_utf_8
8680                        && detect_coding_utf_8 (&coding, &detect_info))
8681                 {
8682                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8683                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8684                 }
8685               for (i = 0; i < coding_category_raw_text; i++)
8686                 {
8687                   category = coding_priorities[i];
8688                   this = coding_categories + category;
8689
8690                   if (this->id < 0)
8691                     {
8692                       /* No coding system of this category is defined.  */
8693                       detect_info.rejected |= (1 << category);
8694                     }
8695                   else if (category >= coding_category_raw_text)
8696                     continue;
8697                   else if (detect_info.checked & (1 << category))
8698                     {
8699                       if (highest
8700                           && (detect_info.found & (1 << category)))
8701                         break;
8702                     }
8703                   else if ((*(this->detector)) (&coding, &detect_info)
8704                            && highest
8705                            && (detect_info.found & (1 << category)))
8706                     {
8707                       if (category == coding_category_utf_16_auto)
8708                         {
8709                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8710                             category = coding_category_utf_16_le;
8711                           else
8712                             category = coding_category_utf_16_be;
8713                         }
8714                       break;
8715                     }
8716                 }
8717             }
8718         }
8719
8720       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8721           || null_byte_found)
8722         {
8723           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8724           id = CODING_SYSTEM_ID (Qno_conversion);
8725           val = list1 (make_number (id));
8726         }
8727       else if (! detect_info.rejected && ! detect_info.found)
8728         {
8729           detect_info.found = CATEGORY_MASK_ANY;
8730           id = coding_categories[coding_category_undecided].id;
8731           val = list1 (make_number (id));
8732         }
8733       else if (highest)
8734         {
8735           if (detect_info.found)
8736             {
8737               detect_info.found = 1 << category;
8738               val = list1 (make_number (this->id));
8739             }
8740           else
8741             for (i = 0; i < coding_category_raw_text; i++)
8742               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8743                 {
8744                   detect_info.found = 1 << coding_priorities[i];
8745                   id = coding_categories[coding_priorities[i]].id;
8746                   val = list1 (make_number (id));
8747                   break;
8748                 }
8749         }
8750       else
8751         {
8752           int mask = detect_info.rejected | detect_info.found;
8753           int found = 0;
8754
8755           for (i = coding_category_raw_text - 1; i >= 0; i--)
8756             {
8757               category = coding_priorities[i];
8758               if (! (mask & (1 << category)))
8759                 {
8760                   found |= 1 << category;
8761                   id = coding_categories[category].id;
8762                   if (id >= 0)
8763                     val = list1 (make_number (id));
8764                 }
8765             }
8766           for (i = coding_category_raw_text - 1; i >= 0; i--)
8767             {
8768               category = coding_priorities[i];
8769               if (detect_info.found & (1 << category))
8770                 {
8771                   id = coding_categories[category].id;
8772                   val = Fcons (make_number (id), val);
8773                 }
8774             }
8775           detect_info.found |= found;
8776         }
8777     }
8778   else if (base_category == coding_category_utf_8_auto)
8779     {
8780       if (detect_coding_utf_8 (&coding, &detect_info))
8781         {
8782           struct coding_system *this;
8783
8784           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8785             this = coding_categories + coding_category_utf_8_sig;
8786           else
8787             this = coding_categories + coding_category_utf_8_nosig;
8788           val = list1 (make_number (this->id));
8789         }
8790     }
8791   else if (base_category == coding_category_utf_16_auto)
8792     {
8793       if (detect_coding_utf_16 (&coding, &detect_info))
8794         {
8795           struct coding_system *this;
8796
8797           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8798             this = coding_categories + coding_category_utf_16_le;
8799           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8800             this = coding_categories + coding_category_utf_16_be;
8801           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8802             this = coding_categories + coding_category_utf_16_be_nosig;
8803           else
8804             this = coding_categories + coding_category_utf_16_le_nosig;
8805           val = list1 (make_number (this->id));
8806         }
8807     }
8808   else
8809     {
8810       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8811       val = list1 (make_number (coding.id));
8812     }
8813
8814   /* Then, detect eol-format if necessary.  */
8815   {
8816     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8817     Lisp_Object tail;
8818
8819     if (VECTORP (eol_type))
8820       {
8821         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8822           {
8823             if (null_byte_found)
8824               normal_eol = EOL_SEEN_LF;
8825             else
8826               normal_eol = detect_eol (coding.source, src_bytes,
8827                                        coding_category_raw_text);
8828           }
8829         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8830                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8831           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8832                                       coding_category_utf_16_be);
8833         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8834                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8835           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8836                                       coding_category_utf_16_le);
8837       }
8838     else
8839       {
8840         if (EQ (eol_type, Qunix))
8841           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8842         else if (EQ (eol_type, Qdos))
8843           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8844         else
8845           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8846       }
8847
8848     for (tail = val; CONSP (tail); tail = XCDR (tail))
8849       {
8850         enum coding_category category;
8851         int this_eol;
8852
8853         id = XINT (XCAR (tail));
8854         attrs = CODING_ID_ATTRS (id);
8855         category = XINT (CODING_ATTR_CATEGORY (attrs));
8856         eol_type = CODING_ID_EOL_TYPE (id);
8857         if (VECTORP (eol_type))
8858           {
8859             if (category == coding_category_utf_16_be
8860                 || category == coding_category_utf_16_be_nosig)
8861               this_eol = utf_16_be_eol;
8862             else if (category == coding_category_utf_16_le
8863                      || category == coding_category_utf_16_le_nosig)
8864               this_eol = utf_16_le_eol;
8865             else
8866               this_eol = normal_eol;
8867
8868             if (this_eol == EOL_SEEN_LF)
8869               XSETCAR (tail, AREF (eol_type, 0));
8870             else if (this_eol == EOL_SEEN_CRLF)
8871               XSETCAR (tail, AREF (eol_type, 1));
8872             else if (this_eol == EOL_SEEN_CR)
8873               XSETCAR (tail, AREF (eol_type, 2));
8874             else
8875               XSETCAR (tail, CODING_ID_NAME (id));
8876           }
8877         else
8878           XSETCAR (tail, CODING_ID_NAME (id));
8879       }
8880   }
8881
8882   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8883 }
8884
8885
8886 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8887        2, 3, 0,
8888        doc: /* Detect coding system of the text in the region between START and END.
8889 Return a list of possible coding systems ordered by priority.
8890 The coding systems to try and their priorities follows what
8891 the function `coding-system-priority-list' (which see) returns.
8892
8893 If only ASCII characters are found (except for such ISO-2022 control
8894 characters as ESC), it returns a list of single element `undecided'
8895 or its subsidiary coding system according to a detected end-of-line
8896 format.
8897
8898 If optional argument HIGHEST is non-nil, return the coding system of
8899 highest priority.  */)
8900   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8901 {
8902   ptrdiff_t from, to;
8903   ptrdiff_t from_byte, to_byte;
8904
8905   validate_region (&start, &end);
8906   from = XINT (start), to = XINT (end);
8907   from_byte = CHAR_TO_BYTE (from);
8908   to_byte = CHAR_TO_BYTE (to);
8909
8910   if (from < GPT && to >= GPT)
8911     move_gap_both (to, to_byte);
8912
8913   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8914                                to - from, to_byte - from_byte,
8915                                !NILP (highest),
8916                                !NILP (BVAR (current_buffer
8917                                       , enable_multibyte_characters)),
8918                                Qnil);
8919 }
8920
8921 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8922        1, 2, 0,
8923        doc: /* Detect coding system of the text in STRING.
8924 Return a list of possible coding systems ordered by priority.
8925 The coding systems to try and their priorities follows what
8926 the function `coding-system-priority-list' (which see) returns.
8927
8928 If only ASCII characters are found (except for such ISO-2022 control
8929 characters as ESC), it returns a list of single element `undecided'
8930 or its subsidiary coding system according to a detected end-of-line
8931 format.
8932
8933 If optional argument HIGHEST is non-nil, return the coding system of
8934 highest priority.  */)
8935   (Lisp_Object string, Lisp_Object highest)
8936 {
8937   CHECK_STRING (string);
8938
8939   return detect_coding_system (SDATA (string),
8940                                SCHARS (string), SBYTES (string),
8941                                !NILP (highest), STRING_MULTIBYTE (string),
8942                                Qnil);
8943 }
8944
8945
8946 static bool
8947 char_encodable_p (int c, Lisp_Object attrs)
8948 {
8949   Lisp_Object tail;
8950   struct charset *charset;
8951   Lisp_Object translation_table;
8952
8953   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8954   if (! NILP (translation_table))
8955     c = translate_char (translation_table, c);
8956   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8957        CONSP (tail); tail = XCDR (tail))
8958     {
8959       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8960       if (CHAR_CHARSET_P (c, charset))
8961         break;
8962     }
8963   return (! NILP (tail));
8964 }
8965
8966
8967 /* Return a list of coding systems that safely encode the text between
8968    START and END.  If EXCLUDE is non-nil, it is a list of coding
8969    systems not to check.  The returned list doesn't contain any such
8970    coding systems.  In any case, if the text contains only ASCII or is
8971    unibyte, return t.  */
8972
8973 DEFUN ("find-coding-systems-region-internal",
8974        Ffind_coding_systems_region_internal,
8975        Sfind_coding_systems_region_internal, 2, 3, 0,
8976        doc: /* Internal use only.  */)
8977   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8978 {
8979   Lisp_Object coding_attrs_list, safe_codings;
8980   ptrdiff_t start_byte, end_byte;
8981   const unsigned char *p, *pbeg, *pend;
8982   int c;
8983   Lisp_Object tail, elt, work_table;
8984
8985   if (STRINGP (start))
8986     {
8987       if (!STRING_MULTIBYTE (start)
8988           || SCHARS (start) == SBYTES (start))
8989         return Qt;
8990       start_byte = 0;
8991       end_byte = SBYTES (start);
8992     }
8993   else
8994     {
8995       CHECK_NUMBER_COERCE_MARKER (start);
8996       CHECK_NUMBER_COERCE_MARKER (end);
8997       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8998         args_out_of_range (start, end);
8999       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9000         return Qt;
9001       start_byte = CHAR_TO_BYTE (XINT (start));
9002       end_byte = CHAR_TO_BYTE (XINT (end));
9003       if (XINT (end) - XINT (start) == end_byte - start_byte)
9004         return Qt;
9005
9006       if (XINT (start) < GPT && XINT (end) > GPT)
9007         {
9008           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9009             move_gap_both (XINT (start), start_byte);
9010           else
9011             move_gap_both (XINT (end), end_byte);
9012         }
9013     }
9014
9015   coding_attrs_list = Qnil;
9016   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9017     if (NILP (exclude)
9018         || NILP (Fmemq (XCAR (tail), exclude)))
9019       {
9020         Lisp_Object attrs;
9021
9022         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9023         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9024           {
9025             ASET (attrs, coding_attr_trans_tbl,
9026                   get_translation_table (attrs, 1, NULL));
9027             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9028           }
9029       }
9030
9031   if (STRINGP (start))
9032     p = pbeg = SDATA (start);
9033   else
9034     p = pbeg = BYTE_POS_ADDR (start_byte);
9035   pend = p + (end_byte - start_byte);
9036
9037   while (p < pend && ASCII_CHAR_P (*p)) p++;
9038   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9039
9040   work_table = Fmake_char_table (Qnil, Qnil);
9041   while (p < pend)
9042     {
9043       if (ASCII_CHAR_P (*p))
9044         p++;
9045       else
9046         {
9047           c = STRING_CHAR_ADVANCE (p);
9048           if (!NILP (char_table_ref (work_table, c)))
9049             /* This character was already checked.  Ignore it.  */
9050             continue;
9051
9052           charset_map_loaded = 0;
9053           for (tail = coding_attrs_list; CONSP (tail);)
9054             {
9055               elt = XCAR (tail);
9056               if (NILP (elt))
9057                 tail = XCDR (tail);
9058               else if (char_encodable_p (c, elt))
9059                 tail = XCDR (tail);
9060               else if (CONSP (XCDR (tail)))
9061                 {
9062                   XSETCAR (tail, XCAR (XCDR (tail)));
9063                   XSETCDR (tail, XCDR (XCDR (tail)));
9064                 }
9065               else
9066                 {
9067                   XSETCAR (tail, Qnil);
9068                   tail = XCDR (tail);
9069                 }
9070             }
9071           if (charset_map_loaded)
9072             {
9073               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9074
9075               if (STRINGP (start))
9076                 pbeg = SDATA (start);
9077               else
9078                 pbeg = BYTE_POS_ADDR (start_byte);
9079               p = pbeg + p_offset;
9080               pend = pbeg + pend_offset;
9081             }
9082           char_table_set (work_table, c, Qt);
9083         }
9084     }
9085
9086   safe_codings = list2 (Qraw_text, Qno_conversion);
9087   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9088     if (! NILP (XCAR (tail)))
9089       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9090
9091   return safe_codings;
9092 }
9093
9094
9095 DEFUN ("unencodable-char-position", Funencodable_char_position,
9096        Sunencodable_char_position, 3, 5, 0,
9097        doc: /* Return position of first un-encodable character in a region.
9098 START and END specify the region and CODING-SYSTEM specifies the
9099 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9100
9101 If optional 4th argument COUNT is non-nil, it specifies at most how
9102 many un-encodable characters to search.  In this case, the value is a
9103 list of positions.
9104
9105 If optional 5th argument STRING is non-nil, it is a string to search
9106 for un-encodable characters.  In that case, START and END are indexes
9107 to the string and treated as in `substring'.  */)
9108   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9109    Lisp_Object count, Lisp_Object string)
9110 {
9111   EMACS_INT n;
9112   struct coding_system coding;
9113   Lisp_Object attrs, charset_list, translation_table;
9114   Lisp_Object positions;
9115   ptrdiff_t from, to;
9116   const unsigned char *p, *stop, *pend;
9117   bool ascii_compatible;
9118
9119   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9120   attrs = CODING_ID_ATTRS (coding.id);
9121   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9122     return Qnil;
9123   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9124   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9125   translation_table = get_translation_table (attrs, 1, NULL);
9126
9127   if (NILP (string))
9128     {
9129       validate_region (&start, &end);
9130       from = XINT (start);
9131       to = XINT (end);
9132       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9133           || (ascii_compatible
9134               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9135         return Qnil;
9136       p = CHAR_POS_ADDR (from);
9137       pend = CHAR_POS_ADDR (to);
9138       if (from < GPT && to >= GPT)
9139         stop = GPT_ADDR;
9140       else
9141         stop = pend;
9142     }
9143   else
9144     {
9145       CHECK_STRING (string);
9146       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9147       if (! STRING_MULTIBYTE (string))
9148         return Qnil;
9149       p = SDATA (string) + string_char_to_byte (string, from);
9150       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9151       if (ascii_compatible && (to - from) == (pend - p))
9152         return Qnil;
9153     }
9154
9155   if (NILP (count))
9156     n = 1;
9157   else
9158     {
9159       CHECK_NATNUM (count);
9160       n = XINT (count);
9161     }
9162
9163   positions = Qnil;
9164   charset_map_loaded = 0;
9165   while (1)
9166     {
9167       int c;
9168
9169       if (ascii_compatible)
9170         while (p < stop && ASCII_CHAR_P (*p))
9171           p++, from++;
9172       if (p >= stop)
9173         {
9174           if (p >= pend)
9175             break;
9176           stop = pend;
9177           p = GAP_END_ADDR;
9178         }
9179
9180       c = STRING_CHAR_ADVANCE (p);
9181       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9182           && ! char_charset (translate_char (translation_table, c),
9183                              charset_list, NULL))
9184         {
9185           positions = Fcons (make_number (from), positions);
9186           n--;
9187           if (n == 0)
9188             break;
9189         }
9190
9191       from++;
9192       if (charset_map_loaded && NILP (string))
9193         {
9194           p = CHAR_POS_ADDR (from);
9195           pend = CHAR_POS_ADDR (to);
9196           if (from < GPT && to >= GPT)
9197             stop = GPT_ADDR;
9198           else
9199             stop = pend;
9200           charset_map_loaded = 0;
9201         }
9202     }
9203
9204   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9205 }
9206
9207
9208 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9209        Scheck_coding_systems_region, 3, 3, 0,
9210        doc: /* Check if the region is encodable by coding systems.
9211
9212 START and END are buffer positions specifying the region.
9213 CODING-SYSTEM-LIST is a list of coding systems to check.
9214
9215 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9216 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9217 whole region, POS0, POS1, ... are buffer positions where non-encodable
9218 characters are found.
9219
9220 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9221 value is nil.
9222
9223 START may be a string.  In that case, check if the string is
9224 encodable, and the value contains indices to the string instead of
9225 buffer positions.  END is ignored.
9226
9227 If the current buffer (or START if it is a string) is unibyte, the value
9228 is nil.  */)
9229   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9230 {
9231   Lisp_Object list;
9232   ptrdiff_t start_byte, end_byte;
9233   ptrdiff_t pos;
9234   const unsigned char *p, *pbeg, *pend;
9235   int c;
9236   Lisp_Object tail, elt, attrs;
9237
9238   if (STRINGP (start))
9239     {
9240       if (!STRING_MULTIBYTE (start)
9241           || SCHARS (start) == SBYTES (start))
9242         return Qnil;
9243       start_byte = 0;
9244       end_byte = SBYTES (start);
9245       pos = 0;
9246     }
9247   else
9248     {
9249       CHECK_NUMBER_COERCE_MARKER (start);
9250       CHECK_NUMBER_COERCE_MARKER (end);
9251       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9252         args_out_of_range (start, end);
9253       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9254         return Qnil;
9255       start_byte = CHAR_TO_BYTE (XINT (start));
9256       end_byte = CHAR_TO_BYTE (XINT (end));
9257       if (XINT (end) - XINT (start) == end_byte - start_byte)
9258         return Qnil;
9259
9260       if (XINT (start) < GPT && XINT (end) > GPT)
9261         {
9262           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9263             move_gap_both (XINT (start), start_byte);
9264           else
9265             move_gap_both (XINT (end), end_byte);
9266         }
9267       pos = XINT (start);
9268     }
9269
9270   list = Qnil;
9271   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9272     {
9273       elt = XCAR (tail);
9274       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9275       ASET (attrs, coding_attr_trans_tbl,
9276             get_translation_table (attrs, 1, NULL));
9277       list = Fcons (list2 (elt, attrs), list);
9278     }
9279
9280   if (STRINGP (start))
9281     p = pbeg = SDATA (start);
9282   else
9283     p = pbeg = BYTE_POS_ADDR (start_byte);
9284   pend = p + (end_byte - start_byte);
9285
9286   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9287   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9288
9289   while (p < pend)
9290     {
9291       if (ASCII_CHAR_P (*p))
9292         p++;
9293       else
9294         {
9295           c = STRING_CHAR_ADVANCE (p);
9296
9297           charset_map_loaded = 0;
9298           for (tail = list; CONSP (tail); tail = XCDR (tail))
9299             {
9300               elt = XCDR (XCAR (tail));
9301               if (! char_encodable_p (c, XCAR (elt)))
9302                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9303             }
9304           if (charset_map_loaded)
9305             {
9306               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9307
9308               if (STRINGP (start))
9309                 pbeg = SDATA (start);
9310               else
9311                 pbeg = BYTE_POS_ADDR (start_byte);
9312               p = pbeg + p_offset;
9313               pend = pbeg + pend_offset;
9314             }
9315         }
9316       pos++;
9317     }
9318
9319   tail = list;
9320   list = Qnil;
9321   for (; CONSP (tail); tail = XCDR (tail))
9322     {
9323       elt = XCAR (tail);
9324       if (CONSP (XCDR (XCDR (elt))))
9325         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9326                       list);
9327     }
9328
9329   return list;
9330 }
9331
9332
9333 static Lisp_Object
9334 code_convert_region (Lisp_Object start, Lisp_Object end,
9335                      Lisp_Object coding_system, Lisp_Object dst_object,
9336                      bool encodep, bool norecord)
9337 {
9338   struct coding_system coding;
9339   ptrdiff_t from, from_byte, to, to_byte;
9340   Lisp_Object src_object;
9341
9342   if (NILP (coding_system))
9343     coding_system = Qno_conversion;
9344   else
9345     CHECK_CODING_SYSTEM (coding_system);
9346   src_object = Fcurrent_buffer ();
9347   if (NILP (dst_object))
9348     dst_object = src_object;
9349   else if (! EQ (dst_object, Qt))
9350     CHECK_BUFFER (dst_object);
9351
9352   validate_region (&start, &end);
9353   from = XFASTINT (start);
9354   from_byte = CHAR_TO_BYTE (from);
9355   to = XFASTINT (end);
9356   to_byte = CHAR_TO_BYTE (to);
9357
9358   setup_coding_system (coding_system, &coding);
9359   coding.mode |= CODING_MODE_LAST_BLOCK;
9360
9361   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9362     {
9363       struct buffer *buf = XBUFFER (dst_object);
9364       ptrdiff_t buf_pt = BUF_PT (buf);
9365
9366       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9367     }
9368
9369   if (encodep)
9370     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9371                           dst_object);
9372   else
9373     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9374                           dst_object);
9375   if (! norecord)
9376     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9377
9378   return (BUFFERP (dst_object)
9379           ? make_number (coding.produced_char)
9380           : coding.dst_object);
9381 }
9382
9383
9384 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9385        3, 4, "r\nzCoding system: ",
9386        doc: /* Decode the current region from the specified coding system.
9387 When called from a program, takes four arguments:
9388         START, END, CODING-SYSTEM, and DESTINATION.
9389 START and END are buffer positions.
9390
9391 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9392 If nil, the region between START and END is replaced by the decoded text.
9393 If buffer, the decoded text is inserted in that buffer after point (point
9394 does not move).
9395 In those cases, the length of the decoded text is returned.
9396 If DESTINATION is t, the decoded text is returned.
9397
9398 This function sets `last-coding-system-used' to the precise coding system
9399 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9400 not fully specified.)  */)
9401   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9402 {
9403   return code_convert_region (start, end, coding_system, destination, 0, 0);
9404 }
9405
9406 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9407        3, 4, "r\nzCoding system: ",
9408        doc: /* Encode the current region by specified coding system.
9409 When called from a program, takes four arguments:
9410         START, END, CODING-SYSTEM and DESTINATION.
9411 START and END are buffer positions.
9412
9413 Optional 4th argument DESTINATION specifies where the encoded text goes.
9414 If nil, the region between START and END is replaced by the encoded text.
9415 If buffer, the encoded text is inserted in that buffer after point (point
9416 does not move).
9417 In those cases, the length of the encoded text is returned.
9418 If DESTINATION is t, the encoded text is returned.
9419
9420 This function sets `last-coding-system-used' to the precise coding system
9421 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9422 not fully specified.)  */)
9423   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9424 {
9425   return code_convert_region (start, end, coding_system, destination, 1, 0);
9426 }
9427
9428 Lisp_Object
9429 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9430                      Lisp_Object dst_object, bool encodep, bool nocopy,
9431                      bool norecord)
9432 {
9433   struct coding_system coding;
9434   ptrdiff_t chars, bytes;
9435
9436   CHECK_STRING (string);
9437   if (NILP (coding_system))
9438     {
9439       if (! norecord)
9440         Vlast_coding_system_used = Qno_conversion;
9441       if (NILP (dst_object))
9442         return (nocopy ? Fcopy_sequence (string) : string);
9443     }
9444
9445   if (NILP (coding_system))
9446     coding_system = Qno_conversion;
9447   else
9448     CHECK_CODING_SYSTEM (coding_system);
9449   if (NILP (dst_object))
9450     dst_object = Qt;
9451   else if (! EQ (dst_object, Qt))
9452     CHECK_BUFFER (dst_object);
9453
9454   setup_coding_system (coding_system, &coding);
9455   coding.mode |= CODING_MODE_LAST_BLOCK;
9456   chars = SCHARS (string);
9457   bytes = SBYTES (string);
9458
9459   if (BUFFERP (dst_object))
9460     {
9461       struct buffer *buf = XBUFFER (dst_object);
9462       ptrdiff_t buf_pt = BUF_PT (buf);
9463
9464       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9465     }
9466
9467   if (encodep)
9468     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9469   else
9470     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9471   if (! norecord)
9472     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9473
9474   return (BUFFERP (dst_object)
9475           ? make_number (coding.produced_char)
9476           : coding.dst_object);
9477 }
9478
9479
9480 /* Encode or decode STRING according to CODING_SYSTEM.
9481    Do not set Vlast_coding_system_used.
9482
9483    This function is called only from macros DECODE_FILE and
9484    ENCODE_FILE, thus we ignore character composition.  */
9485
9486 Lisp_Object
9487 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9488                               bool encodep)
9489 {
9490   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9491 }
9492
9493 /* Encode or decode a file name, to or from a unibyte string suitable
9494    for passing to C library functions.  */
9495 Lisp_Object
9496 decode_file_name (Lisp_Object fname)
9497 {
9498 #ifdef WINDOWSNT
9499   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9500      converts the file names either to UTF-16LE or to the system ANSI
9501      codepage internally, depending on the underlying OS; see w32.c.  */
9502   if (! NILP (Fcoding_system_p (Qutf_8)))
9503     return code_convert_string_norecord (fname, Qutf_8, 0);
9504   return fname;
9505 #else  /* !WINDOWSNT */
9506   if (! NILP (Vfile_name_coding_system))
9507     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9508   else if (! NILP (Vdefault_file_name_coding_system))
9509     return code_convert_string_norecord (fname,
9510                                          Vdefault_file_name_coding_system, 0);
9511   else
9512     return fname;
9513 #endif
9514 }
9515
9516 Lisp_Object
9517 encode_file_name (Lisp_Object fname)
9518 {
9519   /* This is especially important during bootstrap and dumping, when
9520      file-name encoding is not yet known, and therefore any non-ASCII
9521      file names are unibyte strings, and could only be thrashed if we
9522      try to encode them.  */
9523   if (!STRING_MULTIBYTE (fname))
9524     return fname;
9525 #ifdef WINDOWSNT
9526   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9527      converts the file names either to UTF-16LE or to the system ANSI
9528      codepage internally, depending on the underlying OS; see w32.c.  */
9529   if (! NILP (Fcoding_system_p (Qutf_8)))
9530     return code_convert_string_norecord (fname, Qutf_8, 1);
9531   return fname;
9532 #else  /* !WINDOWSNT */
9533   if (! NILP (Vfile_name_coding_system))
9534     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9535   else if (! NILP (Vdefault_file_name_coding_system))
9536     return code_convert_string_norecord (fname,
9537                                          Vdefault_file_name_coding_system, 1);
9538   else
9539     return fname;
9540 #endif
9541 }
9542
9543 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9544        2, 4, 0,
9545        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9546
9547 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9548 if the decoding operation is trivial.
9549
9550 Optional fourth arg BUFFER non-nil means that the decoded text is
9551 inserted in that buffer after point (point does not move).  In this
9552 case, the return value is the length of the decoded text.
9553
9554 This function sets `last-coding-system-used' to the precise coding system
9555 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9556 not fully specified.)  */)
9557   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9558 {
9559   return code_convert_string (string, coding_system, buffer,
9560                               0, ! NILP (nocopy), 0);
9561 }
9562
9563 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9564        2, 4, 0,
9565        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9566
9567 Optional third arg NOCOPY non-nil means it is OK to return STRING
9568 itself if the encoding operation is trivial.
9569
9570 Optional fourth arg BUFFER non-nil means that the encoded text is
9571 inserted in that buffer after point (point does not move).  In this
9572 case, the return value is the length of the encoded text.
9573
9574 This function sets `last-coding-system-used' to the precise coding system
9575 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9576 not fully specified.)  */)
9577   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9578 {
9579   return code_convert_string (string, coding_system, buffer,
9580                               1, ! NILP (nocopy), 0);
9581 }
9582
9583 \f
9584 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9585        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9586 Return the corresponding character.  */)
9587   (Lisp_Object code)
9588 {
9589   Lisp_Object spec, attrs, val;
9590   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9591   EMACS_INT ch;
9592   int c;
9593
9594   CHECK_NATNUM (code);
9595   ch = XFASTINT (code);
9596   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9597   attrs = AREF (spec, 0);
9598
9599   if (ASCII_CHAR_P (ch)
9600       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9601     return code;
9602
9603   val = CODING_ATTR_CHARSET_LIST (attrs);
9604   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9605   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9606   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9607
9608   if (ch <= 0x7F)
9609     {
9610       c = ch;
9611       charset = charset_roman;
9612     }
9613   else if (ch >= 0xA0 && ch < 0xDF)
9614     {
9615       c = ch - 0x80;
9616       charset = charset_kana;
9617     }
9618   else
9619     {
9620       EMACS_INT c1 = ch >> 8;
9621       int c2 = ch & 0xFF;
9622
9623       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9624           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9625         error ("Invalid code: %"pI"d", ch);
9626       c = ch;
9627       SJIS_TO_JIS (c);
9628       charset = charset_kanji;
9629     }
9630   c = DECODE_CHAR (charset, c);
9631   if (c < 0)
9632     error ("Invalid code: %"pI"d", ch);
9633   return make_number (c);
9634 }
9635
9636
9637 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9638        doc: /* Encode a Japanese character CH to shift_jis encoding.
9639 Return the corresponding code in SJIS.  */)
9640   (Lisp_Object ch)
9641 {
9642   Lisp_Object spec, attrs, charset_list;
9643   int c;
9644   struct charset *charset;
9645   unsigned code;
9646
9647   CHECK_CHARACTER (ch);
9648   c = XFASTINT (ch);
9649   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9650   attrs = AREF (spec, 0);
9651
9652   if (ASCII_CHAR_P (c)
9653       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9654     return ch;
9655
9656   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9657   charset = char_charset (c, charset_list, &code);
9658   if (code == CHARSET_INVALID_CODE (charset))
9659     error ("Can't encode by shift_jis encoding: %c", c);
9660   JIS_TO_SJIS (code);
9661
9662   return make_number (code);
9663 }
9664
9665 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9666        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9667 Return the corresponding character.  */)
9668   (Lisp_Object code)
9669 {
9670   Lisp_Object spec, attrs, val;
9671   struct charset *charset_roman, *charset_big5, *charset;
9672   EMACS_INT ch;
9673   int c;
9674
9675   CHECK_NATNUM (code);
9676   ch = XFASTINT (code);
9677   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9678   attrs = AREF (spec, 0);
9679
9680   if (ASCII_CHAR_P (ch)
9681       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9682     return code;
9683
9684   val = CODING_ATTR_CHARSET_LIST (attrs);
9685   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9686   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9687
9688   if (ch <= 0x7F)
9689     {
9690       c = ch;
9691       charset = charset_roman;
9692     }
9693   else
9694     {
9695       EMACS_INT b1 = ch >> 8;
9696       int b2 = ch & 0x7F;
9697       if (b1 < 0xA1 || b1 > 0xFE
9698           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9699         error ("Invalid code: %"pI"d", ch);
9700       c = ch;
9701       charset = charset_big5;
9702     }
9703   c = DECODE_CHAR (charset, c);
9704   if (c < 0)
9705     error ("Invalid code: %"pI"d", ch);
9706   return make_number (c);
9707 }
9708
9709 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9710        doc: /* Encode the Big5 character CH to BIG5 coding system.
9711 Return the corresponding character code in Big5.  */)
9712   (Lisp_Object ch)
9713 {
9714   Lisp_Object spec, attrs, charset_list;
9715   struct charset *charset;
9716   int c;
9717   unsigned code;
9718
9719   CHECK_CHARACTER (ch);
9720   c = XFASTINT (ch);
9721   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9722   attrs = AREF (spec, 0);
9723   if (ASCII_CHAR_P (c)
9724       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9725     return ch;
9726
9727   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9728   charset = char_charset (c, charset_list, &code);
9729   if (code == CHARSET_INVALID_CODE (charset))
9730     error ("Can't encode by Big5 encoding: %c", c);
9731
9732   return make_number (code);
9733 }
9734
9735 \f
9736 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9737        Sset_terminal_coding_system_internal, 1, 2, 0,
9738        doc: /* Internal use only.  */)
9739   (Lisp_Object coding_system, Lisp_Object terminal)
9740 {
9741   struct terminal *term = decode_live_terminal (terminal);
9742   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9743   CHECK_SYMBOL (coding_system);
9744   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9745   /* We had better not send unsafe characters to terminal.  */
9746   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9747   /* Character composition should be disabled.  */
9748   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9749   terminal_coding->src_multibyte = 1;
9750   terminal_coding->dst_multibyte = 0;
9751   tset_charset_list
9752     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9753             ? coding_charset_list (terminal_coding)
9754             : list1 (make_number (charset_ascii))));
9755   return Qnil;
9756 }
9757
9758 DEFUN ("set-safe-terminal-coding-system-internal",
9759        Fset_safe_terminal_coding_system_internal,
9760        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9761        doc: /* Internal use only.  */)
9762   (Lisp_Object coding_system)
9763 {
9764   CHECK_SYMBOL (coding_system);
9765   setup_coding_system (Fcheck_coding_system (coding_system),
9766                        &safe_terminal_coding);
9767   /* Character composition should be disabled.  */
9768   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9769   safe_terminal_coding.src_multibyte = 1;
9770   safe_terminal_coding.dst_multibyte = 0;
9771   return Qnil;
9772 }
9773
9774 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9775        Sterminal_coding_system, 0, 1, 0,
9776        doc: /* Return coding system specified for terminal output on the given terminal.
9777 TERMINAL may be a terminal object, a frame, or nil for the selected
9778 frame's terminal device.  */)
9779   (Lisp_Object terminal)
9780 {
9781   struct coding_system *terminal_coding
9782     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9783   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9784
9785   /* For backward compatibility, return nil if it is `undecided'.  */
9786   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9787 }
9788
9789 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9790        Sset_keyboard_coding_system_internal, 1, 2, 0,
9791        doc: /* Internal use only.  */)
9792   (Lisp_Object coding_system, Lisp_Object terminal)
9793 {
9794   struct terminal *t = decode_live_terminal (terminal);
9795   CHECK_SYMBOL (coding_system);
9796   if (NILP (coding_system))
9797     coding_system = Qno_conversion;
9798   else
9799     Fcheck_coding_system (coding_system);
9800   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9801   /* Character composition should be disabled.  */
9802   TERMINAL_KEYBOARD_CODING (t)->common_flags
9803     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9804   return Qnil;
9805 }
9806
9807 DEFUN ("keyboard-coding-system",
9808        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9809        doc: /* Return coding system specified for decoding keyboard input.  */)
9810   (Lisp_Object terminal)
9811 {
9812   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9813                          (decode_live_terminal (terminal))->id);
9814 }
9815
9816 \f
9817 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9818        Sfind_operation_coding_system,  1, MANY, 0,
9819        doc: /* Choose a coding system for an operation based on the target name.
9820 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9821 DECODING-SYSTEM is the coding system to use for decoding
9822 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9823 for encoding (in case OPERATION does encoding).
9824
9825 The first argument OPERATION specifies an I/O primitive:
9826   For file I/O, `insert-file-contents' or `write-region'.
9827   For process I/O, `call-process', `call-process-region', or `start-process'.
9828   For network I/O, `open-network-stream'.
9829
9830 The remaining arguments should be the same arguments that were passed
9831 to the primitive.  Depending on which primitive, one of those arguments
9832 is selected as the TARGET.  For example, if OPERATION does file I/O,
9833 whichever argument specifies the file name is TARGET.
9834
9835 TARGET has a meaning which depends on OPERATION:
9836   For file I/O, TARGET is a file name (except for the special case below).
9837   For process I/O, TARGET is a process name.
9838   For network I/O, TARGET is a service name or a port number.
9839
9840 This function looks up what is specified for TARGET in
9841 `file-coding-system-alist', `process-coding-system-alist',
9842 or `network-coding-system-alist' depending on OPERATION.
9843 They may specify a coding system, a cons of coding systems,
9844 or a function symbol to call.
9845 In the last case, we call the function with one argument,
9846 which is a list of all the arguments given to this function.
9847 If the function can't decide a coding system, it can return
9848 `undecided' so that the normal code-detection is performed.
9849
9850 If OPERATION is `insert-file-contents', the argument corresponding to
9851 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9852 file name to look up, and BUFFER is a buffer that contains the file's
9853 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9854 function to call for FILENAME, that function should examine the
9855 contents of BUFFER instead of reading the file.
9856
9857 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9858   (ptrdiff_t nargs, Lisp_Object *args)
9859 {
9860   Lisp_Object operation, target_idx, target, val;
9861   register Lisp_Object chain;
9862
9863   if (nargs < 2)
9864     error ("Too few arguments");
9865   operation = args[0];
9866   if (!SYMBOLP (operation)
9867       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9868     error ("Invalid first argument");
9869   if (nargs <= 1 + XFASTINT (target_idx))
9870     error ("Too few arguments for operation `%s'",
9871            SDATA (SYMBOL_NAME (operation)));
9872   target = args[XFASTINT (target_idx) + 1];
9873   if (!(STRINGP (target)
9874         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9875             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9876         || (EQ (operation, Qopen_network_stream)
9877             && (INTEGERP (target) || EQ (target, Qt)))))
9878     error ("Invalid argument %"pI"d of operation `%s'",
9879            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9880   if (CONSP (target))
9881     target = XCAR (target);
9882
9883   chain = ((EQ (operation, Qinsert_file_contents)
9884             || EQ (operation, Qwrite_region))
9885            ? Vfile_coding_system_alist
9886            : (EQ (operation, Qopen_network_stream)
9887               ? Vnetwork_coding_system_alist
9888               : Vprocess_coding_system_alist));
9889   if (NILP (chain))
9890     return Qnil;
9891
9892   for (; CONSP (chain); chain = XCDR (chain))
9893     {
9894       Lisp_Object elt;
9895
9896       elt = XCAR (chain);
9897       if (CONSP (elt)
9898           && ((STRINGP (target)
9899                && STRINGP (XCAR (elt))
9900                && fast_string_match (XCAR (elt), target) >= 0)
9901               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9902         {
9903           val = XCDR (elt);
9904           /* Here, if VAL is both a valid coding system and a valid
9905              function symbol, we return VAL as a coding system.  */
9906           if (CONSP (val))
9907             return val;
9908           if (! SYMBOLP (val))
9909             return Qnil;
9910           if (! NILP (Fcoding_system_p (val)))
9911             return Fcons (val, val);
9912           if (! NILP (Ffboundp (val)))
9913             {
9914               /* We use call1 rather than safe_call1
9915                  so as to get bug reports about functions called here
9916                  which don't handle the current interface.  */
9917               val = call1 (val, Flist (nargs, args));
9918               if (CONSP (val))
9919                 return val;
9920               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9921                 return Fcons (val, val);
9922             }
9923           return Qnil;
9924         }
9925     }
9926   return Qnil;
9927 }
9928
9929 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9930        Sset_coding_system_priority, 0, MANY, 0,
9931        doc: /* Assign higher priority to the coding systems given as arguments.
9932 If multiple coding systems belong to the same category,
9933 all but the first one are ignored.
9934
9935 usage: (set-coding-system-priority &rest coding-systems)  */)
9936   (ptrdiff_t nargs, Lisp_Object *args)
9937 {
9938   ptrdiff_t i, j;
9939   bool changed[coding_category_max];
9940   enum coding_category priorities[coding_category_max];
9941
9942   memset (changed, 0, sizeof changed);
9943
9944   for (i = j = 0; i < nargs; i++)
9945     {
9946       enum coding_category category;
9947       Lisp_Object spec, attrs;
9948
9949       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9950       attrs = AREF (spec, 0);
9951       category = XINT (CODING_ATTR_CATEGORY (attrs));
9952       if (changed[category])
9953         /* Ignore this coding system because a coding system of the
9954            same category already had a higher priority.  */
9955         continue;
9956       changed[category] = 1;
9957       priorities[j++] = category;
9958       if (coding_categories[category].id >= 0
9959           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9960         setup_coding_system (args[i], &coding_categories[category]);
9961       Fset (AREF (Vcoding_category_table, category), args[i]);
9962     }
9963
9964   /* Now we have decided top J priorities.  Reflect the order of the
9965      original priorities to the remaining priorities.  */
9966
9967   for (i = j, j = 0; i < coding_category_max; i++, j++)
9968     {
9969       while (j < coding_category_max
9970              && changed[coding_priorities[j]])
9971         j++;
9972       if (j == coding_category_max)
9973         emacs_abort ();
9974       priorities[i] = coding_priorities[j];
9975     }
9976
9977   memcpy (coding_priorities, priorities, sizeof priorities);
9978
9979   /* Update `coding-category-list'.  */
9980   Vcoding_category_list = Qnil;
9981   for (i = coding_category_max; i-- > 0; )
9982     Vcoding_category_list
9983       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9984                Vcoding_category_list);
9985
9986   return Qnil;
9987 }
9988
9989 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9990        Scoding_system_priority_list, 0, 1, 0,
9991        doc: /* Return a list of coding systems ordered by their priorities.
9992 The list contains a subset of coding systems; i.e. coding systems
9993 assigned to each coding category (see `coding-category-list').
9994
9995 HIGHESTP non-nil means just return the highest priority one.  */)
9996   (Lisp_Object highestp)
9997 {
9998   int i;
9999   Lisp_Object val;
10000
10001   for (i = 0, val = Qnil; i < coding_category_max; i++)
10002     {
10003       enum coding_category category = coding_priorities[i];
10004       int id = coding_categories[category].id;
10005       Lisp_Object attrs;
10006
10007       if (id < 0)
10008         continue;
10009       attrs = CODING_ID_ATTRS (id);
10010       if (! NILP (highestp))
10011         return CODING_ATTR_BASE_NAME (attrs);
10012       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10013     }
10014   return Fnreverse (val);
10015 }
10016
10017 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10018
10019 static Lisp_Object
10020 make_subsidiaries (Lisp_Object base)
10021 {
10022   Lisp_Object subsidiaries;
10023   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10024   USE_SAFE_ALLOCA;
10025   char *buf = SAFE_ALLOCA (base_name_len + 6);
10026   int i;
10027
10028   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10029   subsidiaries = make_uninit_vector (3);
10030   for (i = 0; i < 3; i++)
10031     {
10032       strcpy (buf + base_name_len, suffixes[i]);
10033       ASET (subsidiaries, i, intern (buf));
10034     }
10035   SAFE_FREE ();
10036   return subsidiaries;
10037 }
10038
10039
10040 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10041        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10042        doc: /* For internal use only.
10043 usage: (define-coding-system-internal ...)  */)
10044   (ptrdiff_t nargs, Lisp_Object *args)
10045 {
10046   Lisp_Object name;
10047   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10048   Lisp_Object attrs;            /* Vector of attributes.  */
10049   Lisp_Object eol_type;
10050   Lisp_Object aliases;
10051   Lisp_Object coding_type, charset_list, safe_charsets;
10052   enum coding_category category;
10053   Lisp_Object tail, val;
10054   int max_charset_id = 0;
10055   int i;
10056
10057   if (nargs < coding_arg_max)
10058     goto short_args;
10059
10060   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10061
10062   name = args[coding_arg_name];
10063   CHECK_SYMBOL (name);
10064   ASET (attrs, coding_attr_base_name, name);
10065
10066   val = args[coding_arg_mnemonic];
10067   if (! STRINGP (val))
10068     CHECK_CHARACTER (val);
10069   ASET (attrs, coding_attr_mnemonic, val);
10070
10071   coding_type = args[coding_arg_coding_type];
10072   CHECK_SYMBOL (coding_type);
10073   ASET (attrs, coding_attr_type, coding_type);
10074
10075   charset_list = args[coding_arg_charset_list];
10076   if (SYMBOLP (charset_list))
10077     {
10078       if (EQ (charset_list, Qiso_2022))
10079         {
10080           if (! EQ (coding_type, Qiso_2022))
10081             error ("Invalid charset-list");
10082           charset_list = Viso_2022_charset_list;
10083         }
10084       else if (EQ (charset_list, Qemacs_mule))
10085         {
10086           if (! EQ (coding_type, Qemacs_mule))
10087             error ("Invalid charset-list");
10088           charset_list = Vemacs_mule_charset_list;
10089         }
10090       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10091         {
10092           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10093             error ("Invalid charset-list");
10094           if (max_charset_id < XFASTINT (XCAR (tail)))
10095             max_charset_id = XFASTINT (XCAR (tail));
10096         }
10097     }
10098   else
10099     {
10100       charset_list = Fcopy_sequence (charset_list);
10101       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10102         {
10103           struct charset *charset;
10104
10105           val = XCAR (tail);
10106           CHECK_CHARSET_GET_CHARSET (val, charset);
10107           if (EQ (coding_type, Qiso_2022)
10108               ? CHARSET_ISO_FINAL (charset) < 0
10109               : EQ (coding_type, Qemacs_mule)
10110               ? CHARSET_EMACS_MULE_ID (charset) < 0
10111               : 0)
10112             error ("Can't handle charset `%s'",
10113                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10114
10115           XSETCAR (tail, make_number (charset->id));
10116           if (max_charset_id < charset->id)
10117             max_charset_id = charset->id;
10118         }
10119     }
10120   ASET (attrs, coding_attr_charset_list, charset_list);
10121
10122   safe_charsets = make_uninit_string (max_charset_id + 1);
10123   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10124   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10125     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10126   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10127
10128   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10129
10130   val = args[coding_arg_decode_translation_table];
10131   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10132     CHECK_SYMBOL (val);
10133   ASET (attrs, coding_attr_decode_tbl, val);
10134
10135   val = args[coding_arg_encode_translation_table];
10136   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10137     CHECK_SYMBOL (val);
10138   ASET (attrs, coding_attr_encode_tbl, val);
10139
10140   val = args[coding_arg_post_read_conversion];
10141   CHECK_SYMBOL (val);
10142   ASET (attrs, coding_attr_post_read, val);
10143
10144   val = args[coding_arg_pre_write_conversion];
10145   CHECK_SYMBOL (val);
10146   ASET (attrs, coding_attr_pre_write, val);
10147
10148   val = args[coding_arg_default_char];
10149   if (NILP (val))
10150     ASET (attrs, coding_attr_default_char, make_number (' '));
10151   else
10152     {
10153       CHECK_CHARACTER (val);
10154       ASET (attrs, coding_attr_default_char, val);
10155     }
10156
10157   val = args[coding_arg_for_unibyte];
10158   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10159
10160   val = args[coding_arg_plist];
10161   CHECK_LIST (val);
10162   ASET (attrs, coding_attr_plist, val);
10163
10164   if (EQ (coding_type, Qcharset))
10165     {
10166       /* Generate a lisp vector of 256 elements.  Each element is nil,
10167          integer, or a list of charset IDs.
10168
10169          If Nth element is nil, the byte code N is invalid in this
10170          coding system.
10171
10172          If Nth element is a number NUM, N is the first byte of a
10173          charset whose ID is NUM.
10174
10175          If Nth element is a list of charset IDs, N is the first byte
10176          of one of them.  The list is sorted by dimensions of the
10177          charsets.  A charset of smaller dimension comes first. */
10178       val = Fmake_vector (make_number (256), Qnil);
10179
10180       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10181         {
10182           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10183           int dim = CHARSET_DIMENSION (charset);
10184           int idx = (dim - 1) * 4;
10185
10186           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10187             ASET (attrs, coding_attr_ascii_compat, Qt);
10188
10189           for (i = charset->code_space[idx];
10190                i <= charset->code_space[idx + 1]; i++)
10191             {
10192               Lisp_Object tmp, tmp2;
10193               int dim2;
10194
10195               tmp = AREF (val, i);
10196               if (NILP (tmp))
10197                 tmp = XCAR (tail);
10198               else if (NUMBERP (tmp))
10199                 {
10200                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10201                   if (dim < dim2)
10202                     tmp = list2 (XCAR (tail), tmp);
10203                   else
10204                     tmp = list2 (tmp, XCAR (tail));
10205                 }
10206               else
10207                 {
10208                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10209                     {
10210                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10211                       if (dim < dim2)
10212                         break;
10213                     }
10214                   if (NILP (tmp2))
10215                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10216                   else
10217                     {
10218                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10219                       XSETCAR (tmp2, XCAR (tail));
10220                     }
10221                 }
10222               ASET (val, i, tmp);
10223             }
10224         }
10225       ASET (attrs, coding_attr_charset_valids, val);
10226       category = coding_category_charset;
10227     }
10228   else if (EQ (coding_type, Qccl))
10229     {
10230       Lisp_Object valids;
10231
10232       if (nargs < coding_arg_ccl_max)
10233         goto short_args;
10234
10235       val = args[coding_arg_ccl_decoder];
10236       CHECK_CCL_PROGRAM (val);
10237       if (VECTORP (val))
10238         val = Fcopy_sequence (val);
10239       ASET (attrs, coding_attr_ccl_decoder, val);
10240
10241       val = args[coding_arg_ccl_encoder];
10242       CHECK_CCL_PROGRAM (val);
10243       if (VECTORP (val))
10244         val = Fcopy_sequence (val);
10245       ASET (attrs, coding_attr_ccl_encoder, val);
10246
10247       val = args[coding_arg_ccl_valids];
10248       valids = Fmake_string (make_number (256), make_number (0));
10249       for (tail = val; CONSP (tail); tail = XCDR (tail))
10250         {
10251           int from, to;
10252
10253           val = XCAR (tail);
10254           if (INTEGERP (val))
10255             {
10256               if (! (0 <= XINT (val) && XINT (val) <= 255))
10257                 args_out_of_range_3 (val, make_number (0), make_number (255));
10258               from = to = XINT (val);
10259             }
10260           else
10261             {
10262               CHECK_CONS (val);
10263               CHECK_NATNUM_CAR (val);
10264               CHECK_NUMBER_CDR (val);
10265               if (XINT (XCAR (val)) > 255)
10266                 args_out_of_range_3 (XCAR (val),
10267                                      make_number (0), make_number (255));
10268               from = XINT (XCAR (val));
10269               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10270                 args_out_of_range_3 (XCDR (val),
10271                                      XCAR (val), make_number (255));
10272               to = XINT (XCDR (val));
10273             }
10274           for (i = from; i <= to; i++)
10275             SSET (valids, i, 1);
10276         }
10277       ASET (attrs, coding_attr_ccl_valids, valids);
10278
10279       category = coding_category_ccl;
10280     }
10281   else if (EQ (coding_type, Qutf_16))
10282     {
10283       Lisp_Object bom, endian;
10284
10285       ASET (attrs, coding_attr_ascii_compat, Qnil);
10286
10287       if (nargs < coding_arg_utf16_max)
10288         goto short_args;
10289
10290       bom = args[coding_arg_utf16_bom];
10291       if (! NILP (bom) && ! EQ (bom, Qt))
10292         {
10293           CHECK_CONS (bom);
10294           val = XCAR (bom);
10295           CHECK_CODING_SYSTEM (val);
10296           val = XCDR (bom);
10297           CHECK_CODING_SYSTEM (val);
10298         }
10299       ASET (attrs, coding_attr_utf_bom, bom);
10300
10301       endian = args[coding_arg_utf16_endian];
10302       CHECK_SYMBOL (endian);
10303       if (NILP (endian))
10304         endian = Qbig;
10305       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10306         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10307       ASET (attrs, coding_attr_utf_16_endian, endian);
10308
10309       category = (CONSP (bom)
10310                   ? coding_category_utf_16_auto
10311                   : NILP (bom)
10312                   ? (EQ (endian, Qbig)
10313                      ? coding_category_utf_16_be_nosig
10314                      : coding_category_utf_16_le_nosig)
10315                   : (EQ (endian, Qbig)
10316                      ? coding_category_utf_16_be
10317                      : coding_category_utf_16_le));
10318     }
10319   else if (EQ (coding_type, Qiso_2022))
10320     {
10321       Lisp_Object initial, reg_usage, request, flags;
10322
10323       if (nargs < coding_arg_iso2022_max)
10324         goto short_args;
10325
10326       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10327       CHECK_VECTOR (initial);
10328       for (i = 0; i < 4; i++)
10329         {
10330           val = AREF (initial, i);
10331           if (! NILP (val))
10332             {
10333               struct charset *charset;
10334
10335               CHECK_CHARSET_GET_CHARSET (val, charset);
10336               ASET (initial, i, make_number (CHARSET_ID (charset)));
10337               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10338                 ASET (attrs, coding_attr_ascii_compat, Qt);
10339             }
10340           else
10341             ASET (initial, i, make_number (-1));
10342         }
10343
10344       reg_usage = args[coding_arg_iso2022_reg_usage];
10345       CHECK_CONS (reg_usage);
10346       CHECK_NUMBER_CAR (reg_usage);
10347       CHECK_NUMBER_CDR (reg_usage);
10348
10349       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10350       for (tail = request; CONSP (tail); tail = XCDR (tail))
10351         {
10352           int id;
10353           Lisp_Object tmp1;
10354
10355           val = XCAR (tail);
10356           CHECK_CONS (val);
10357           tmp1 = XCAR (val);
10358           CHECK_CHARSET_GET_ID (tmp1, id);
10359           CHECK_NATNUM_CDR (val);
10360           if (XINT (XCDR (val)) >= 4)
10361             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10362           XSETCAR (val, make_number (id));
10363         }
10364
10365       flags = args[coding_arg_iso2022_flags];
10366       CHECK_NATNUM (flags);
10367       i = XINT (flags) & INT_MAX;
10368       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10369         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10370       flags = make_number (i);
10371
10372       ASET (attrs, coding_attr_iso_initial, initial);
10373       ASET (attrs, coding_attr_iso_usage, reg_usage);
10374       ASET (attrs, coding_attr_iso_request, request);
10375       ASET (attrs, coding_attr_iso_flags, flags);
10376       setup_iso_safe_charsets (attrs);
10377
10378       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10379         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10380                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10381                     ? coding_category_iso_7_else
10382                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10383                     ? coding_category_iso_7
10384                     : coding_category_iso_7_tight);
10385       else
10386         {
10387           int id = XINT (AREF (initial, 1));
10388
10389           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10390                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10391                        || id < 0)
10392                       ? coding_category_iso_8_else
10393                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10394                       ? coding_category_iso_8_1
10395                       : coding_category_iso_8_2);
10396         }
10397       if (category != coding_category_iso_8_1
10398           && category != coding_category_iso_8_2)
10399         ASET (attrs, coding_attr_ascii_compat, Qnil);
10400     }
10401   else if (EQ (coding_type, Qemacs_mule))
10402     {
10403       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10404         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10405       ASET (attrs, coding_attr_ascii_compat, Qt);
10406       category = coding_category_emacs_mule;
10407     }
10408   else if (EQ (coding_type, Qshift_jis))
10409     {
10410
10411       struct charset *charset;
10412
10413       if (XINT (Flength (charset_list)) != 3
10414           && XINT (Flength (charset_list)) != 4)
10415         error ("There should be three or four charsets");
10416
10417       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10418       if (CHARSET_DIMENSION (charset) != 1)
10419         error ("Dimension of charset %s is not one",
10420                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10421       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10422         ASET (attrs, coding_attr_ascii_compat, Qt);
10423
10424       charset_list = XCDR (charset_list);
10425       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10426       if (CHARSET_DIMENSION (charset) != 1)
10427         error ("Dimension of charset %s is not one",
10428                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10429
10430       charset_list = XCDR (charset_list);
10431       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10432       if (CHARSET_DIMENSION (charset) != 2)
10433         error ("Dimension of charset %s is not two",
10434                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10435
10436       charset_list = XCDR (charset_list);
10437       if (! NILP (charset_list))
10438         {
10439           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10440           if (CHARSET_DIMENSION (charset) != 2)
10441             error ("Dimension of charset %s is not two",
10442                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10443         }
10444
10445       category = coding_category_sjis;
10446       Vsjis_coding_system = name;
10447     }
10448   else if (EQ (coding_type, Qbig5))
10449     {
10450       struct charset *charset;
10451
10452       if (XINT (Flength (charset_list)) != 2)
10453         error ("There should be just two charsets");
10454
10455       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10456       if (CHARSET_DIMENSION (charset) != 1)
10457         error ("Dimension of charset %s is not one",
10458                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10459       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10460         ASET (attrs, coding_attr_ascii_compat, Qt);
10461
10462       charset_list = XCDR (charset_list);
10463       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10464       if (CHARSET_DIMENSION (charset) != 2)
10465         error ("Dimension of charset %s is not two",
10466                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10467
10468       category = coding_category_big5;
10469       Vbig5_coding_system = name;
10470     }
10471   else if (EQ (coding_type, Qraw_text))
10472     {
10473       category = coding_category_raw_text;
10474       ASET (attrs, coding_attr_ascii_compat, Qt);
10475     }
10476   else if (EQ (coding_type, Qutf_8))
10477     {
10478       Lisp_Object bom;
10479
10480       if (nargs < coding_arg_utf8_max)
10481         goto short_args;
10482
10483       bom = args[coding_arg_utf8_bom];
10484       if (! NILP (bom) && ! EQ (bom, Qt))
10485         {
10486           CHECK_CONS (bom);
10487           val = XCAR (bom);
10488           CHECK_CODING_SYSTEM (val);
10489           val = XCDR (bom);
10490           CHECK_CODING_SYSTEM (val);
10491         }
10492       ASET (attrs, coding_attr_utf_bom, bom);
10493       if (NILP (bom))
10494         ASET (attrs, coding_attr_ascii_compat, Qt);
10495
10496       category = (CONSP (bom) ? coding_category_utf_8_auto
10497                   : NILP (bom) ? coding_category_utf_8_nosig
10498                   : coding_category_utf_8_sig);
10499     }
10500   else if (EQ (coding_type, Qundecided))
10501     {
10502       if (nargs < coding_arg_undecided_max)
10503         goto short_args;
10504       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10505             args[coding_arg_undecided_inhibit_null_byte_detection]);
10506       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10507             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10508       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10509             args[coding_arg_undecided_prefer_utf_8]);
10510       category = coding_category_undecided;
10511     }
10512   else
10513     error ("Invalid coding system type: %s",
10514            SDATA (SYMBOL_NAME (coding_type)));
10515
10516   ASET (attrs, coding_attr_category, make_number (category));
10517   ASET (attrs, coding_attr_plist,
10518         Fcons (QCcategory,
10519                Fcons (AREF (Vcoding_category_table, category),
10520                       CODING_ATTR_PLIST (attrs))));
10521   ASET (attrs, coding_attr_plist,
10522         Fcons (QCascii_compatible_p,
10523                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10524                       CODING_ATTR_PLIST (attrs))));
10525
10526   eol_type = args[coding_arg_eol_type];
10527   if (! NILP (eol_type)
10528       && ! EQ (eol_type, Qunix)
10529       && ! EQ (eol_type, Qdos)
10530       && ! EQ (eol_type, Qmac))
10531     error ("Invalid eol-type");
10532
10533   aliases = list1 (name);
10534
10535   if (NILP (eol_type))
10536     {
10537       eol_type = make_subsidiaries (name);
10538       for (i = 0; i < 3; i++)
10539         {
10540           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10541
10542           this_name = AREF (eol_type, i);
10543           this_aliases = list1 (this_name);
10544           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10545           this_spec = make_uninit_vector (3);
10546           ASET (this_spec, 0, attrs);
10547           ASET (this_spec, 1, this_aliases);
10548           ASET (this_spec, 2, this_eol_type);
10549           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10550           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10551           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist, Qnil);
10552           if (NILP (val))
10553             Vcoding_system_alist
10554               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10555                        Vcoding_system_alist);
10556         }
10557     }
10558
10559   spec_vec = make_uninit_vector (3);
10560   ASET (spec_vec, 0, attrs);
10561   ASET (spec_vec, 1, aliases);
10562   ASET (spec_vec, 2, eol_type);
10563
10564   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10565   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10566   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist, Qnil);
10567   if (NILP (val))
10568     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10569                                   Vcoding_system_alist);
10570
10571   {
10572     int id = coding_categories[category].id;
10573
10574     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10575       setup_coding_system (name, &coding_categories[category]);
10576   }
10577
10578   return Qnil;
10579
10580  short_args:
10581   Fsignal (Qwrong_number_of_arguments,
10582            Fcons (intern ("define-coding-system-internal"),
10583                   make_number (nargs)));
10584 }
10585
10586
10587 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10588        3, 3, 0,
10589        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10590   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10591 {
10592   Lisp_Object spec, attrs;
10593
10594   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10595   attrs = AREF (spec, 0);
10596   if (EQ (prop, QCmnemonic))
10597     {
10598       if (! STRINGP (val))
10599         CHECK_CHARACTER (val);
10600       ASET (attrs, coding_attr_mnemonic, val);
10601     }
10602   else if (EQ (prop, QCdefault_char))
10603     {
10604       if (NILP (val))
10605         val = make_number (' ');
10606       else
10607         CHECK_CHARACTER (val);
10608       ASET (attrs, coding_attr_default_char, val);
10609     }
10610   else if (EQ (prop, QCdecode_translation_table))
10611     {
10612       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10613         CHECK_SYMBOL (val);
10614       ASET (attrs, coding_attr_decode_tbl, val);
10615     }
10616   else if (EQ (prop, QCencode_translation_table))
10617     {
10618       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10619         CHECK_SYMBOL (val);
10620       ASET (attrs, coding_attr_encode_tbl, val);
10621     }
10622   else if (EQ (prop, QCpost_read_conversion))
10623     {
10624       CHECK_SYMBOL (val);
10625       ASET (attrs, coding_attr_post_read, val);
10626     }
10627   else if (EQ (prop, QCpre_write_conversion))
10628     {
10629       CHECK_SYMBOL (val);
10630       ASET (attrs, coding_attr_pre_write, val);
10631     }
10632   else if (EQ (prop, QCascii_compatible_p))
10633     {
10634       ASET (attrs, coding_attr_ascii_compat, val);
10635     }
10636
10637   ASET (attrs, coding_attr_plist,
10638         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10639   return val;
10640 }
10641
10642
10643 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10644        Sdefine_coding_system_alias, 2, 2, 0,
10645        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10646   (Lisp_Object alias, Lisp_Object coding_system)
10647 {
10648   Lisp_Object spec, aliases, eol_type, val;
10649
10650   CHECK_SYMBOL (alias);
10651   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10652   aliases = AREF (spec, 1);
10653   /* ALIASES should be a list of length more than zero, and the first
10654      element is a base coding system.  Append ALIAS at the tail of the
10655      list.  */
10656   while (!NILP (XCDR (aliases)))
10657     aliases = XCDR (aliases);
10658   XSETCDR (aliases, list1 (alias));
10659
10660   eol_type = AREF (spec, 2);
10661   if (VECTORP (eol_type))
10662     {
10663       Lisp_Object subsidiaries;
10664       int i;
10665
10666       subsidiaries = make_subsidiaries (alias);
10667       for (i = 0; i < 3; i++)
10668         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10669                                      AREF (eol_type, i));
10670     }
10671
10672   Fputhash (alias, spec, Vcoding_system_hash_table);
10673   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10674   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist, Qnil);
10675   if (NILP (val))
10676     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10677                                   Vcoding_system_alist);
10678
10679   return Qnil;
10680 }
10681
10682 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10683        1, 1, 0,
10684        doc: /* Return the base of CODING-SYSTEM.
10685 Any alias or subsidiary coding system is not a base coding system.  */)
10686   (Lisp_Object coding_system)
10687 {
10688   Lisp_Object spec, attrs;
10689
10690   if (NILP (coding_system))
10691     return (Qno_conversion);
10692   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10693   attrs = AREF (spec, 0);
10694   return CODING_ATTR_BASE_NAME (attrs);
10695 }
10696
10697 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10698        1, 1, 0,
10699        doc: /* Return the property list of CODING-SYSTEM.  */)
10700   (Lisp_Object coding_system)
10701 {
10702   Lisp_Object spec, attrs;
10703
10704   if (NILP (coding_system))
10705     coding_system = Qno_conversion;
10706   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10707   attrs = AREF (spec, 0);
10708   return CODING_ATTR_PLIST (attrs);
10709 }
10710
10711
10712 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10713        1, 1, 0,
10714        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10715   (Lisp_Object coding_system)
10716 {
10717   Lisp_Object spec;
10718
10719   if (NILP (coding_system))
10720     coding_system = Qno_conversion;
10721   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10722   return AREF (spec, 1);
10723 }
10724
10725 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10726        Scoding_system_eol_type, 1, 1, 0,
10727        doc: /* Return eol-type of CODING-SYSTEM.
10728 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10729
10730 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10731 and CR respectively.
10732
10733 A vector value indicates that a format of end-of-line should be
10734 detected automatically.  Nth element of the vector is the subsidiary
10735 coding system whose eol-type is N.  */)
10736   (Lisp_Object coding_system)
10737 {
10738   Lisp_Object spec, eol_type;
10739   int n;
10740
10741   if (NILP (coding_system))
10742     coding_system = Qno_conversion;
10743   if (! CODING_SYSTEM_P (coding_system))
10744     return Qnil;
10745   spec = CODING_SYSTEM_SPEC (coding_system);
10746   eol_type = AREF (spec, 2);
10747   if (VECTORP (eol_type))
10748     return Fcopy_sequence (eol_type);
10749   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10750   return make_number (n);
10751 }
10752
10753 #endif /* emacs */
10754
10755 \f
/*** 12. Postamble ***/
10757
10758 void
10759 init_coding_once (void)
10760 {
10761   int i;
10762
10763   for (i = 0; i < coding_category_max; i++)
10764     {
10765       coding_categories[i].id = -1;
10766       coding_priorities[i] = i;
10767     }
10768
10769   /* ISO2022 specific initialize routine.  */
10770   for (i = 0; i < 0x20; i++)
10771     iso_code_class[i] = ISO_control_0;
10772   for (i = 0x21; i < 0x7F; i++)
10773     iso_code_class[i] = ISO_graphic_plane_0;
10774   for (i = 0x80; i < 0xA0; i++)
10775     iso_code_class[i] = ISO_control_1;
10776   for (i = 0xA1; i < 0xFF; i++)
10777     iso_code_class[i] = ISO_graphic_plane_1;
10778   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10779   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10780   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10781   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10782   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10783   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10784   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10785   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10786   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10787
10788   for (i = 0; i < 256; i++)
10789     {
10790       emacs_mule_bytes[i] = 1;
10791     }
10792   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10793   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10794   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10795   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10796 }
10797
10798 #ifdef emacs
10799
10800 void
10801 syms_of_coding (void)
10802 {
10803   staticpro (&Vcoding_system_hash_table);
10804   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10805
10806   staticpro (&Vsjis_coding_system);
10807   Vsjis_coding_system = Qnil;
10808
10809   staticpro (&Vbig5_coding_system);
10810   Vbig5_coding_system = Qnil;
10811
10812   staticpro (&Vcode_conversion_reused_workbuf);
10813   Vcode_conversion_reused_workbuf = Qnil;
10814
10815   staticpro (&Vcode_conversion_workbuf_name);
10816   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10817
10818   reused_workbuf_in_use = 0;
10819
10820   DEFSYM (Qcharset, "charset");
10821   DEFSYM (Qtarget_idx, "target-idx");
10822   DEFSYM (Qcoding_system_history, "coding-system-history");
10823   Fset (Qcoding_system_history, Qnil);
10824
10825   /* Target FILENAME is the first argument.  */
10826   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10827   /* Target FILENAME is the third argument.  */
10828   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10829
10830   DEFSYM (Qcall_process, "call-process");
10831   /* Target PROGRAM is the first argument.  */
10832   Fput (Qcall_process, Qtarget_idx, make_number (0));
10833
10834   DEFSYM (Qcall_process_region, "call-process-region");
10835   /* Target PROGRAM is the third argument.  */
10836   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10837
10838   DEFSYM (Qstart_process, "start-process");
10839   /* Target PROGRAM is the third argument.  */
10840   Fput (Qstart_process, Qtarget_idx, make_number (2));
10841
10842   DEFSYM (Qopen_network_stream, "open-network-stream");
10843   /* Target SERVICE is the fourth argument.  */
10844   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10845
10846   DEFSYM (Qunix, "unix");
10847   DEFSYM (Qdos, "dos");
10848   DEFSYM (Qmac, "mac");
10849
10850   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10851   DEFSYM (Qundecided, "undecided");
10852   DEFSYM (Qno_conversion, "no-conversion");
10853   DEFSYM (Qraw_text, "raw-text");
10854
10855   DEFSYM (Qiso_2022, "iso-2022");
10856
10857   DEFSYM (Qutf_8, "utf-8");
10858   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10859
10860 #if defined (WINDOWSNT) || defined (CYGWIN)
10861   /* No, not utf-16-le: that one has a BOM.  */
10862   DEFSYM (Qutf_16le, "utf-16le");
10863 #endif
10864
10865   DEFSYM (Qutf_16, "utf-16");
10866   DEFSYM (Qbig, "big");
10867   DEFSYM (Qlittle, "little");
10868
10869   DEFSYM (Qshift_jis, "shift-jis");
10870   DEFSYM (Qbig5, "big5");
10871
10872   DEFSYM (Qcoding_system_p, "coding-system-p");
10873
10874   /* Error signaled when there's a problem with detecting a coding system.  */
10875   DEFSYM (Qcoding_system_error, "coding-system-error");
10876   Fput (Qcoding_system_error, Qerror_conditions,
10877         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10878   Fput (Qcoding_system_error, Qerror_message,
10879         build_pure_c_string ("Invalid coding system"));
10880
10881   DEFSYM (Qtranslation_table, "translation-table");
10882   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10883   DEFSYM (Qtranslation_table_id, "translation-table-id");
10884
10885   /* Coding system emacs-mule and raw-text are for converting only
10886      end-of-line format.  */
10887   DEFSYM (Qemacs_mule, "emacs-mule");
10888
10889   DEFSYM (QCcategory, ":category");
10890   DEFSYM (QCmnemonic, ":mnemonic");
10891   DEFSYM (QCdefault_char, ":default-char");
10892   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10893   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10894   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10895   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10896   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10897
10898   Vcoding_category_table
10899     = Fmake_vector (make_number (coding_category_max), Qnil);
10900   staticpro (&Vcoding_category_table);
10901   /* Followings are target of code detection.  */
10902   ASET (Vcoding_category_table, coding_category_iso_7,
10903         intern_c_string ("coding-category-iso-7"));
10904   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10905         intern_c_string ("coding-category-iso-7-tight"));
10906   ASET (Vcoding_category_table, coding_category_iso_8_1,
10907         intern_c_string ("coding-category-iso-8-1"));
10908   ASET (Vcoding_category_table, coding_category_iso_8_2,
10909         intern_c_string ("coding-category-iso-8-2"));
10910   ASET (Vcoding_category_table, coding_category_iso_7_else,
10911         intern_c_string ("coding-category-iso-7-else"));
10912   ASET (Vcoding_category_table, coding_category_iso_8_else,
10913         intern_c_string ("coding-category-iso-8-else"));
10914   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10915         intern_c_string ("coding-category-utf-8-auto"));
10916   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10917         intern_c_string ("coding-category-utf-8"));
10918   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10919         intern_c_string ("coding-category-utf-8-sig"));
10920   ASET (Vcoding_category_table, coding_category_utf_16_be,
10921         intern_c_string ("coding-category-utf-16-be"));
10922   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10923         intern_c_string ("coding-category-utf-16-auto"));
10924   ASET (Vcoding_category_table, coding_category_utf_16_le,
10925         intern_c_string ("coding-category-utf-16-le"));
10926   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10927         intern_c_string ("coding-category-utf-16-be-nosig"));
10928   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10929         intern_c_string ("coding-category-utf-16-le-nosig"));
10930   ASET (Vcoding_category_table, coding_category_charset,
10931         intern_c_string ("coding-category-charset"));
10932   ASET (Vcoding_category_table, coding_category_sjis,
10933         intern_c_string ("coding-category-sjis"));
10934   ASET (Vcoding_category_table, coding_category_big5,
10935         intern_c_string ("coding-category-big5"));
10936   ASET (Vcoding_category_table, coding_category_ccl,
10937         intern_c_string ("coding-category-ccl"));
10938   ASET (Vcoding_category_table, coding_category_emacs_mule,
10939         intern_c_string ("coding-category-emacs-mule"));
10940   /* Followings are NOT target of code detection.  */
10941   ASET (Vcoding_category_table, coding_category_raw_text,
10942         intern_c_string ("coding-category-raw-text"));
10943   ASET (Vcoding_category_table, coding_category_undecided,
10944         intern_c_string ("coding-category-undecided"));
10945
10946   DEFSYM (Qinsufficient_source, "insufficient-source");
10947   DEFSYM (Qinvalid_source, "invalid-source");
10948   DEFSYM (Qinterrupted, "interrupted");
10949
10950   /* If a symbol has this property, evaluate the value to define the
10951      symbol as a coding system.  */
10952   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10953
10954   defsubr (&Scoding_system_p);
10955   defsubr (&Sread_coding_system);
10956   defsubr (&Sread_non_nil_coding_system);
10957   defsubr (&Scheck_coding_system);
10958   defsubr (&Sdetect_coding_region);
10959   defsubr (&Sdetect_coding_string);
10960   defsubr (&Sfind_coding_systems_region_internal);
10961   defsubr (&Sunencodable_char_position);
10962   defsubr (&Scheck_coding_systems_region);
10963   defsubr (&Sdecode_coding_region);
10964   defsubr (&Sencode_coding_region);
10965   defsubr (&Sdecode_coding_string);
10966   defsubr (&Sencode_coding_string);
10967   defsubr (&Sdecode_sjis_char);
10968   defsubr (&Sencode_sjis_char);
10969   defsubr (&Sdecode_big5_char);
10970   defsubr (&Sencode_big5_char);
10971   defsubr (&Sset_terminal_coding_system_internal);
10972   defsubr (&Sset_safe_terminal_coding_system_internal);
10973   defsubr (&Sterminal_coding_system);
10974   defsubr (&Sset_keyboard_coding_system_internal);
10975   defsubr (&Skeyboard_coding_system);
10976   defsubr (&Sfind_operation_coding_system);
10977   defsubr (&Sset_coding_system_priority);
10978   defsubr (&Sdefine_coding_system_internal);
10979   defsubr (&Sdefine_coding_system_alias);
10980   defsubr (&Scoding_system_put);
10981   defsubr (&Scoding_system_base);
10982   defsubr (&Scoding_system_plist);
10983   defsubr (&Scoding_system_aliases);
10984   defsubr (&Scoding_system_eol_type);
10985   defsubr (&Scoding_system_priority_list);
10986
10987   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10988                doc: /* List of coding systems.
10989
10990 Do not alter the value of this variable manually.  This variable should be
10991 updated by the functions `define-coding-system' and
10992 `define-coding-system-alias'.  */);
10993   Vcoding_system_list = Qnil;
10994
10995   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10996                doc: /* Alist of coding system names.
10997 Each element is one element list of coding system name.
10998 This variable is given to `completing-read' as COLLECTION argument.
10999
11000 Do not alter the value of this variable manually.  This variable should be
11001 updated by the functions `make-coding-system' and
11002 `define-coding-system-alias'.  */);
11003   Vcoding_system_alist = Qnil;
11004
11005   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11006                doc: /* List of coding-categories (symbols) ordered by priority.
11007
11008 On detecting a coding system, Emacs tries code detection algorithms
11009 associated with each coding-category one by one in this order.  When
11010 one algorithm agrees with a byte sequence of source text, the coding
11011 system bound to the corresponding coding-category is selected.
11012
11013 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11014   {
11015     int i;
11016
11017     Vcoding_category_list = Qnil;
11018     for (i = coding_category_max - 1; i >= 0; i--)
11019       Vcoding_category_list
11020         = Fcons (AREF (Vcoding_category_table, i),
11021                  Vcoding_category_list);
11022   }
11023
11024   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11025                doc: /* Specify the coding system for read operations.
11026 It is useful to bind this variable with `let', but do not set it globally.
11027 If the value is a coding system, it is used for decoding on read operation.
11028 If not, an appropriate element is used from one of the coding system alists.
11029 There are three such tables: `file-coding-system-alist',
11030 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11031   Vcoding_system_for_read = Qnil;
11032
11033   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11034                doc: /* Specify the coding system for write operations.
11035 Programs bind this variable with `let', but you should not set it globally.
11036 If the value is a coding system, it is used for encoding of output,
11037 when writing it to a file and when sending it to a file or subprocess.
11038
11039 If this does not specify a coding system, an appropriate element
11040 is used from one of the coding system alists.
11041 There are three such tables: `file-coding-system-alist',
11042 `process-coding-system-alist', and `network-coding-system-alist'.
11043 For output to files, if the above procedure does not specify a coding system,
11044 the value of `buffer-file-coding-system' is used.  */);
11045   Vcoding_system_for_write = Qnil;
11046
11047   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11048                doc: /*
11049 Coding system used in the latest file or process I/O.  */);
11050   Vlast_coding_system_used = Qnil;
11051
11052   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11053                doc: /*
11054 Error status of the last code conversion.
11055
11056 When an error was detected in the last code conversion, this variable
11057 is set to one of the following symbols.
11058   `insufficient-source'
11059   `inconsistent-eol'
11060   `invalid-source'
11061   `interrupted'
11062   `insufficient-memory'
11063 When no error was detected, the value doesn't change.  So, to check
11064 the error status of a code conversion by this variable, you must
11065 explicitly set this variable to nil before performing code
11066 conversion.  */);
11067   Vlast_code_conversion_error = Qnil;
11068
11069   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11070                doc: /*
11071 Non-nil means always inhibit code conversion of end-of-line format.
11072 See info node `Coding Systems' and info node `Text and Binary' concerning
11073 such conversion.  */);
11074   inhibit_eol_conversion = 0;
11075
11076   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11077                doc: /*
11078 Non-nil means process buffer inherits coding system of process output.
11079 Bind it to t if the process output is to be treated as if it were a file
11080 read from some filesystem.  */);
11081   inherit_process_coding_system = 0;
11082
11083   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11084                doc: /*
11085 Alist to decide a coding system to use for a file I/O operation.
11086 The format is ((PATTERN . VAL) ...),
11087 where PATTERN is a regular expression matching a file name,
11088 VAL is a coding system, a cons of coding systems, or a function symbol.
11089 If VAL is a coding system, it is used for both decoding and encoding
11090 the file contents.
11091 If VAL is a cons of coding systems, the car part is used for decoding,
11092 and the cdr part is used for encoding.
11093 If VAL is a function symbol, the function must return a coding system
11094 or a cons of coding systems which are used as above.  The function is
11095 called with an argument that is a list of the arguments with which
11096 `find-operation-coding-system' was called.  If the function can't decide
11097 a coding system, it can return `undecided' so that the normal
11098 code-detection is performed.
11099
11100 See also the function `find-operation-coding-system'
11101 and the variable `auto-coding-alist'.  */);
11102   Vfile_coding_system_alist = Qnil;
11103
11104   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11105                doc: /*
11106 Alist to decide a coding system to use for a process I/O operation.
11107 The format is ((PATTERN . VAL) ...),
11108 where PATTERN is a regular expression matching a program name,
11109 VAL is a coding system, a cons of coding systems, or a function symbol.
11110 If VAL is a coding system, it is used for both decoding what received
11111 from the program and encoding what sent to the program.
11112 If VAL is a cons of coding systems, the car part is used for decoding,
11113 and the cdr part is used for encoding.
11114 If VAL is a function symbol, the function must return a coding system
11115 or a cons of coding systems which are used as above.
11116
11117 See also the function `find-operation-coding-system'.  */);
11118   Vprocess_coding_system_alist = Qnil;
11119
11120   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11121                doc: /*
11122 Alist to decide a coding system to use for a network I/O operation.
11123 The format is ((PATTERN . VAL) ...),
11124 where PATTERN is a regular expression matching a network service name
11125 or is a port number to connect to,
11126 VAL is a coding system, a cons of coding systems, or a function symbol.
11127 If VAL is a coding system, it is used for both decoding what received
11128 from the network stream and encoding what sent to the network stream.
11129 If VAL is a cons of coding systems, the car part is used for decoding,
11130 and the cdr part is used for encoding.
11131 If VAL is a function symbol, the function must return a coding system
11132 or a cons of coding systems which are used as above.
11133
11134 See also the function `find-operation-coding-system'.  */);
11135   Vnetwork_coding_system_alist = Qnil;
11136
11137   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11138                doc: /* Coding system to use with system messages.
11139 Also used for decoding keyboard input on X Window system, and for
11140 encoding standard output and error streams.  */);
11141   Vlocale_coding_system = Qnil;
11142
11143   /* The eol mnemonics are reset in startup.el system-dependently.  */
11144   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11145                doc: /*
11146 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11147   eol_mnemonic_unix = build_pure_c_string (":");
11148
11149   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11150                doc: /*
11151 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11152   eol_mnemonic_dos = build_pure_c_string ("\\");
11153
11154   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11155                doc: /*
11156 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11157   eol_mnemonic_mac = build_pure_c_string ("/");
11158
11159   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11160                doc: /*
11161 String displayed in mode line when end-of-line format is not yet determined.  */);
11162   eol_mnemonic_undecided = build_pure_c_string (":");
11163
11164   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11165                doc: /*
11166 Non-nil enables character translation while encoding and decoding.  */);
11167   Venable_character_translation = Qt;
11168
11169   DEFVAR_LISP ("standard-translation-table-for-decode",
11170                Vstandard_translation_table_for_decode,
11171                doc: /* Table for translating characters while decoding.  */);
11172   Vstandard_translation_table_for_decode = Qnil;
11173
11174   DEFVAR_LISP ("standard-translation-table-for-encode",
11175                Vstandard_translation_table_for_encode,
11176                doc: /* Table for translating characters while encoding.  */);
11177   Vstandard_translation_table_for_encode = Qnil;
11178
11179   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11180                doc: /* Alist of charsets vs revision numbers.
11181 While encoding, if a charset (car part of an element) is found,
11182 designate it with the escape sequence identifying revision (cdr part
11183 of the element).  */);
11184   Vcharset_revision_table = Qnil;
11185
11186   DEFVAR_LISP ("default-process-coding-system",
11187                Vdefault_process_coding_system,
11188                doc: /* Cons of coding systems used for process I/O by default.
11189 The car part is used for decoding a process output,
11190 the cdr part is used for encoding a text to be sent to a process.  */);
11191   Vdefault_process_coding_system = Qnil;
11192
11193   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11194                doc: /*
11195 Table of extra Latin codes in the range 128..159 (inclusive).
11196 This is a vector of length 256.
11197 If Nth element is non-nil, the existence of code N in a file
11198 \(or output of subprocess) doesn't prevent it to be detected as
11199 a coding system of ISO 2022 variant which has a flag
11200 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11201 or reading output of a subprocess.
11202 Only 128th through 159th elements have a meaning.  */);
11203   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11204
11205   DEFVAR_LISP ("select-safe-coding-system-function",
11206                Vselect_safe_coding_system_function,
11207                doc: /*
11208 Function to call to select safe coding system for encoding a text.
11209
11210 If set, this function is called to force a user to select a proper
11211 coding system which can encode the text in the case that a default
11212 coding system used in each operation can't encode the text.  The
11213 function should take care that the buffer is not modified while
11214 the coding system is being selected.
11215
11216 The default value is `select-safe-coding-system' (which see).  */);
11217   Vselect_safe_coding_system_function = Qnil;
11218
11219   DEFVAR_BOOL ("coding-system-require-warning",
11220                coding_system_require_warning,
11221                doc: /* Internal use only.
11222 If non-nil, on writing a file, `select-safe-coding-system-function' is
11223 called even if `coding-system-for-write' is non-nil.  The command
11224 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11225   coding_system_require_warning = 0;
11226
11227
11228   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11229                inhibit_iso_escape_detection,
11230                doc: /*
11231 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11232
11233 When Emacs reads text, it tries to detect how the text is encoded.
11234 This code detection is sensitive to escape sequences.  If Emacs sees
11235 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11236 of the ISO2022 encodings, and decodes text by the corresponding coding
11237 system (e.g. `iso-2022-7bit').
11238
11239 However, there may be cases where you want to read escape sequences in
11240 a file as is.  In such a case, you can set this variable to non-nil.
11241 Then the code detection will ignore any escape sequences, and no text is
11242 detected as encoded in some ISO-2022 encoding.  The result is that all
11243 escape sequences become visible in a buffer.
11244
11245 The default value is nil, and it is strongly recommended not to change
11246 it.  That is because many Emacs Lisp source files that contain
11247 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11248 in Emacs's distribution, and they won't be decoded correctly on
11249 reading if you suppress escape sequence detection.
11250
11251 The other way to read escape sequences in a file without decoding is
11252 to explicitly specify some coding system that doesn't use ISO-2022
11253 escape sequences (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11254   inhibit_iso_escape_detection = 0;
11255
11256   DEFVAR_BOOL ("inhibit-null-byte-detection",
11257                inhibit_null_byte_detection,
11258                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11259 By default, Emacs treats text containing null bytes as binary data,
11260 and does not attempt to decode it.  The effect is as if you specified
11261 `no-conversion' for reading that text.
11262
11263 Set this to non-nil when a regular text happens to include null bytes.
11264 Examples are Index nodes of Info files and null-byte delimited output
11265 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11266 decode text as usual.  */);
11267   inhibit_null_byte_detection = 0;
11268
11269   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11270                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11271 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11272   disable_ascii_optimization = 0;
11273
11274   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11275                doc: /* Char table for translating self-inserting characters.
11276 This is applied to the result of input methods, not their input.
11277 See also `keyboard-translate-table'.
11278
11279 Use of this variable for character code unification was rendered
11280 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11281 internal character representation.  */);
11282   Vtranslation_table_for_input = Qnil;
11283
11284   Lisp_Object args[coding_arg_undecided_max];
11285   memclear (args, sizeof args);
11286
11287   Lisp_Object plist[] =
11288     {
11289       QCname,
11290       args[coding_arg_name] = Qno_conversion,
11291       QCmnemonic,
11292       args[coding_arg_mnemonic] = make_number ('='),
11293       intern_c_string (":coding-type"),
11294       args[coding_arg_coding_type] = Qraw_text,
11295       QCascii_compatible_p,
11296       args[coding_arg_ascii_compatible_p] = Qt,
11297       QCdefault_char,
11298       args[coding_arg_default_char] = make_number (0),
11299       intern_c_string (":for-unibyte"),
11300       args[coding_arg_for_unibyte] = Qt,
11301       intern_c_string (":docstring"),
11302       (build_pure_c_string
11303        ("Do no conversion.\n"
11304         "\n"
11305         "When you visit a file with this coding, the file is read into a\n"
11306         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11307         "character.")),
11308       intern_c_string (":eol-type"),
11309       args[coding_arg_eol_type] = Qunix,
11310     };
11311   args[coding_arg_plist] = CALLMANY (Flist, plist);
11312   Fdefine_coding_system_internal (coding_arg_max, args);
11313
11314   plist[1] = args[coding_arg_name] = Qundecided;
11315   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11316   plist[5] = args[coding_arg_coding_type] = Qundecided;
11317   /* This is already set.
11318      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11319   plist[8] = intern_c_string (":charset-list");
11320   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11321   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11322   plist[13] = build_pure_c_string ("No conversion on encoding, "
11323                                    "automatic conversion on decoding.");
11324   plist[15] = args[coding_arg_eol_type] = Qnil;
11325   args[coding_arg_plist] = CALLMANY (Flist, plist);
11326   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11327   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11328   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11329
11330   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11331
11332   for (int i = 0; i < coding_category_max; i++)
11333     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11334
11335 #if defined (DOS_NT)
11336   system_eol_type = Qdos;
11337 #else
11338   system_eol_type = Qunix;
11339 #endif
11340   staticpro (&system_eol_type);
11341 }
11342 #endif /* emacs */