src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2015 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, they can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How the end of a line of text is encoded depends on the operating
 123   system.  For instance, Unix's format is just one byte of LF
 124   (line-feed) code, whereas DOS's format is a two-byte sequence of
 125   `carriage-return' and `line-feed' codes.  MacOS's format is usually
 126   one byte of `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;  /* Declare C, SRC_BASE, etc. as ONE_MORE_BYTE requires.  */
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;      /* Only strong evidence counts.  */
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source text, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it.  (C and CONSUMED_CHARS must be declared as well.)  */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that the source area and the destination area
 255   overlap, which means that we can produce encoded text only until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST and PRODUCED_CHARS.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table; /* Non-static: external linkage.  */
 305
 306 /* Format of end-of-line decided by the system.  This is Qunix on
 307    Unix and Mac, Qdos on DOS/Windows.
 308    This has an effect only for external encoding (i.e. for output to
 309    file and process), not for in-buffer or Lisp string encoding.  */
 310 static Lisp_Object system_eol_type;
 311
 312 #ifdef emacs
 313
 314 /* Coding systems are handed between Emacs Lisp programs and C internal
 315    routines by the following variable.  */
 316 /* Coding system to be used to encode text for terminal display when
 317    terminal coding system is nil.  */
 318 struct coding_system safe_terminal_coding;
 319
 320 #endif /* emacs */
 321
 322 /* Two special coding systems.  */
 323 static Lisp_Object Vsjis_coding_system;
 324 static Lisp_Object Vbig5_coding_system;
 325
 326 /* ISO2022 section */
 327 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
 328 #define CODING_ISO_INITIAL(coding, reg)                 \
 329   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 330                      coding_attr_iso_initial),          \
 331                reg)))
 332
 333 /* safe_charsets[CHARSET_ID]; -1 if that is 255 or the ID is too large.  */
 334 #define CODING_ISO_REQUEST(coding, charset_id)          \
 335   (((charset_id) <= (coding)->max_charset_id            \
 336     ? ((coding)->safe_charsets[charset_id] != 255       \
 337        ? (coding)->safe_charsets[charset_id]            \
 338        : -1)                                            \
 339     : -1))
 340
 341 /* Accessors for the members of CODING->spec.iso_2022.  */
 342 #define CODING_ISO_FLAGS(coding)        \
 343   ((coding)->spec.iso_2022.flags)
 344 #define CODING_ISO_DESIGNATION(coding, reg)     \
 345   ((coding)->spec.iso_2022.current_designation[reg])
 346 #define CODING_ISO_INVOCATION(coding, plane)    \
 347   ((coding)->spec.iso_2022.current_invocation[plane])
 348 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 349   ((coding)->spec.iso_2022.single_shifting)
 350 #define CODING_ISO_BOL(coding)  \
 351   ((coding)->spec.iso_2022.bol)
 352 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 353   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 354    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
 355 #define CODING_ISO_CMP_STATUS(coding)   \
 356   (&(coding)->spec.iso_2022.cmp_status)
 357 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 358   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 359 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 360   ((coding)->spec.iso_2022.embedded_utf_8)
 361
 362 /* Control characters of ISO2022 that have a class of their own below.  */
 363                         /* code */      /* function */
 364 #define ISO_CODE_SO     0x0E            /* shift-out */
 365 #define ISO_CODE_SI     0x0F            /* shift-in */
 366 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 367 #define ISO_CODE_ESC    0x1B            /* escape */
 368 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 369 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 370 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 371
 372 /* Each one-byte code of ISO2022 is classified into one of the
 373    following classes.  */
 374 enum iso_code_class_type
 375   {
 376     ISO_control_0,              /* Control codes in the range
 377                                    0x00..0x1F and 0x7F, except for the
 378                                    following 4 codes.  */
 379     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 380     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 381     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 382     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 383     ISO_control_1,              /* Control codes in the range
 384                                    0x80..0x9F, except for the
 385                                    following 3 codes.  */
 386     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 387     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 388     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 389     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 390     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 391     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 392     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 393   };
 394
 395 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 396     `iso-flags' attribute of an iso2022 coding system.  */
 397
 398 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 399    instead of the correct short-form sequence (e.g. ESC $ A).  */
 400 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 401
 402 /* If set, reset graphic planes and registers at end-of-line to the
 403    initial state.  */
 404 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 405
 406 /* If set, reset graphic planes and registers before any control
 407    characters to the initial state.  */
 408 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 409
 410 /* If set, encode by 7-bit environment.  */
 411 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 412
 413 /* If set, use locking-shift function.  */
 414 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 415
 416 /* If set, use single-shift function.  Overwrite
 417    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 418 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 419
 420 /* If set, use designation escape sequence.  */
 421 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 422
 423 /* If set, produce revision number sequence.  */
 424 #define CODING_ISO_FLAG_REVISION        0x0080
 425
 426 /* If set, produce ISO6429's direction specifying sequence.  */
 427 #define CODING_ISO_FLAG_DIRECTION       0x0100
 428
 429 /* If set, assume designation states are reset at beginning of line on
 430    output.  */
 431 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 432
 433 /* If set, designation sequence should be placed at beginning of line
 434    on output.  */
 435 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 436
 437 /* If set, do not encode unsafe characters on output.  */
 438 #define CODING_ISO_FLAG_SAFE            0x0800
 439
 440 /* If set, extra latin codes (128..159) are accepted as a valid code
 441    on input.  */
 442 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 443
 444 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 445
 446 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 447
 448 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 449
 450 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 451
 452 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 453
 454 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 455
 456 /* The character to be produced on output if encoding of the original
 457    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 458 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 459
 460 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 461 #define CODING_UTF_8_BOM(coding)        \
 462   ((coding)->spec.utf_8_bom)
 463
 464 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 465 #define CODING_UTF_16_BOM(coding)       \
 466   ((coding)->spec.utf_16.bom)
 467
 468 #define CODING_UTF_16_ENDIAN(coding)    \
 469   ((coding)->spec.utf_16.endian)
 470
 471 #define CODING_UTF_16_SURROGATE(coding) \
 472   ((coding)->spec.utf_16.surrogate)
 473
 474
 475 /* CCL section: accessors for the CCL entries of CODING's attributes.  */
 476 #define CODING_CCL_DECODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 478 #define CODING_CCL_ENCODER(coding)      \
 479   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 480 #define CODING_CCL_VALIDS(coding)                                          \
 481   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 482
 483 /* Index for each coding category in `coding_categories'.  */
 484
 485 enum coding_category
 486   {
 487     coding_category_iso_7,
 488     coding_category_iso_7_tight,
 489     coding_category_iso_8_1,
 490     coding_category_iso_8_2,
 491     coding_category_iso_7_else,
 492     coding_category_iso_8_else,
 493     coding_category_utf_8_auto,
 494     coding_category_utf_8_nosig,
 495     coding_category_utf_8_sig,
 496     coding_category_utf_16_auto,
 497     coding_category_utf_16_be,
 498     coding_category_utf_16_le,
 499     coding_category_utf_16_be_nosig,
 500     coding_category_utf_16_le_nosig,
 501     coding_category_charset,
 502     coding_category_sjis,
 503     coding_category_big5,
 504     coding_category_ccl,
 505     coding_category_emacs_mule,
 506     /* All above are targets of code detection.  */
 507     coding_category_raw_text,
 508     coding_category_undecided,
 509     coding_category_max         /* Number of categories; sizes the tables.  */
 510   };
 511
 512 /* Definitions of flag bits used in detect_coding_XXXX.  */
 513 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 514 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 515 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 516 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 517 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 518 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 519 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 520 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 521 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 522 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 523 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 524 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 525 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 526 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 527 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 528 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 529 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 530 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 531 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 532 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 533
 534 /* This value is returned if detect_coding_mask () finds nothing other
 535    than ASCII characters.  CATEGORY_MASK_RAW_TEXT is not part of it.  */
 536 #define CATEGORY_MASK_ANY               \
 537   (CATEGORY_MASK_ISO_7                  \
 538    | CATEGORY_MASK_ISO_7_TIGHT          \
 539    | CATEGORY_MASK_ISO_8_1              \
 540    | CATEGORY_MASK_ISO_8_2              \
 541    | CATEGORY_MASK_ISO_7_ELSE           \
 542    | CATEGORY_MASK_ISO_8_ELSE           \
 543    | CATEGORY_MASK_UTF_8_AUTO           \
 544    | CATEGORY_MASK_UTF_8_NOSIG          \
 545    | CATEGORY_MASK_UTF_8_SIG            \
 546    | CATEGORY_MASK_UTF_16_AUTO          \
 547    | CATEGORY_MASK_UTF_16_BE            \
 548    | CATEGORY_MASK_UTF_16_LE            \
 549    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 550    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 551    | CATEGORY_MASK_CHARSET              \
 552    | CATEGORY_MASK_SJIS                 \
 553    | CATEGORY_MASK_BIG5                 \
 554    | CATEGORY_MASK_CCL                  \
 555    | CATEGORY_MASK_EMACS_MULE)
 556
 557
 558 #define CATEGORY_MASK_ISO_7BIT \
 559   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 560
 561 #define CATEGORY_MASK_ISO_8BIT \
 562   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 563
 564 #define CATEGORY_MASK_ISO_ELSE \
 565   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 566
 567 #define CATEGORY_MASK_ISO_ESCAPE        \
 568   (CATEGORY_MASK_ISO_7                  \
 569    | CATEGORY_MASK_ISO_7_TIGHT          \
 570    | CATEGORY_MASK_ISO_7_ELSE           \
 571    | CATEGORY_MASK_ISO_8_ELSE)
 572
 573 #define CATEGORY_MASK_ISO       \
 574   (  CATEGORY_MASK_ISO_7BIT     \
 575      | CATEGORY_MASK_ISO_8BIT   \
 576      | CATEGORY_MASK_ISO_ELSE)
 577
 578 #define CATEGORY_MASK_UTF_16            \
 579   (CATEGORY_MASK_UTF_16_AUTO            \
 580    | CATEGORY_MASK_UTF_16_BE            \
 581    | CATEGORY_MASK_UTF_16_LE            \
 582    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 583    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 584
 585 #define CATEGORY_MASK_UTF_8     \
 586   (CATEGORY_MASK_UTF_8_AUTO     \
 587    | CATEGORY_MASK_UTF_8_NOSIG  \
 588    | CATEGORY_MASK_UTF_8_SIG)
 589
 590 /* Table of coding categories (Lisp symbols).  This variable is for
 591    internal use only.  */
 592 static Lisp_Object Vcoding_category_table;
 593
 594 /* Coding categories ordered by priority (coding_category_max slots).  */
 595 static enum coding_category coding_priorities[coding_category_max];
 596
 597 /* The Nth element is a coding context for the coding system bound to
 598    the Nth coding category.  */
 599 static struct coding_system coding_categories[coding_category_max];
 600
 601 /* Map FLAG to -1 if it is nil, to 1 if it is t, and to 0 otherwise.  */
 602
 603 static int
 604 encode_inhibit_flag (Lisp_Object flag)
 605 {
 606   return ! NILP (flag) ? EQ (flag, Qt) : -1;
 607 }
 608
 609 /* Decide whether a flag counts as set.  ENCODED_FLAG of 1 forces yes,
 610    -1 forces no, and 0 defers to the user variable VAR.  */
 611
 612 static bool
 613 inhibit_flag (int encoded_flag, bool var)
 614 {
 615   return encoded_flag + var > 0;
 616 }
 617
 618 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 619   do { /* Set ATTRS from CODING->id, then CHARSET_LIST from ATTRS.  */ \
 620     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 621     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 622   } while (0)
 623
 624 static void
 625 CHECK_NATNUM_CAR (Lisp_Object x)
 626 {
 627   Lisp_Object head = XCAR (x);  /* Work on a copy of the car of X.  */
 628   CHECK_NATNUM (head);          /* Presumably signals unless a natnum.  */
 629   XSETCAR (x, head);            /* Store the checked value back.  */
 630 }
 631
 632 static void
 633 CHECK_NATNUM_CDR (Lisp_Object x)
 634 {
 635   Lisp_Object tail = XCDR (x);  /* Work on a copy of the cdr of X.  */
 636   CHECK_NATNUM (tail);          /* Presumably signals unless a natnum.  */
 637   XSETCDR (x, tail);            /* Store the checked value back.  */
 638 }
 639
 640 /* Whether CODING writes to a buffer or a string, which can be grown.  */
 641
 642 static bool
 643 growable_destination (struct coding_system *coding)
 644 {
 645   return BUFFERP (coding->dst_object) || STRINGP (coding->dst_object);
 646 }
 647
 648
 649 /* Safely get one byte from the source text pointed by SRC which ends
 650    at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
 651    the source is exhausted, jump to 'no_more_source' (recording
 652    CODING_RESULT_INSUFFICIENT_SRC first if SRC_BASE < SRC).  If
 653    MULTIBYTEP and a multibyte character is found at SRC, set C to the
 654    negative of its code and record CODING_RESULT_INVALID_SRC.
 655    Needs: coding, src, src_base, src_end, multibytep, consumed_chars.  */
 656
 657 #define ONE_MORE_BYTE(c)                                \
 658   do {                                                  \
 659     if (src == src_end)                                 \
 660       {                                                 \
 661         if (src_base < src)                             \
 662           record_conversion_result                      \
 663             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 664         goto no_more_source;                            \
 665       }                                                 \
 666     c = *src++;                                         \
 667     if (multibytep && (c & 0x80))                       \
 668       {                                                 \
 669         if ((c & 0xFE) == 0xC0) /* 0xC0 or 0xC1 */      \
 670           c = ((c & 1) << 6) | *src++;                  \
 671         else                                            \
 672           {                                             \
 673             src--; /* back to the lead byte */          \
 674             c = - string_char (src, &src, NULL);        \
 675             record_conversion_result                    \
 676               (coding, CODING_RESULT_INVALID_SRC);      \
 677           }                                             \
 678       }                                                 \
 679     consumed_chars++;                                   \
 680   } while (0)
 681
 682 /* Safely get two bytes from the source text pointed by SRC which ends
 683    at SRC_END, and set C1 and C2 to those bytes while skipping the
 684    heading multibyte characters.  If there are not enough bytes in the
 685    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 686    a multibyte character is found for C2, set C2 to -1 (SRC is left
 687    just after that character's first byte).  The caller should declare
 688    and set these variables appropriately in advance:
 689         src, src_end, multibytep
 690    It is intended that this macro is used in detect_coding_utf_16.  */
 691
 692 #define TWO_MORE_BYTES(c1, c2)                          \
 693   do {                                                  \
 694     do {                                                \
 695       if (src == src_end)                               \
 696         goto no_more_source;                            \
 697       c1 = *src++;                                      \
 698       if (multibytep && (c1 & 0x80))                    \
 699         {                                               \
 700           if ((c1 & 0xFE) == 0xC0)                      \
 701             c1 = ((c1 & 1) << 6) | *src++;              \
 702           else                                          \
 703             {                                           \
 704               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 705               c1 = -1;                                  \
 706             }                                           \
 707         }                                               \
 708     } while (c1 < 0);                                   \
 709     if (src == src_end)                                 \
 710       goto no_more_source;                              \
 711     c2 = *src++;                                        \
 712     if (multibytep && (c2 & 0x80))                      \
 713       {                                                 \
 714         if ((c2 & 0xFE) == 0xC0)                        \
 715           c2 = ((c2 & 1) << 6) | *src++;                \
 716         else                                            \
 717           c2 = -1;                                      \
 718       }                                                 \
 719   } while (0)
 720
 721
 722 /* Store a byte C in the place pointed by DST and increment DST to the
 723    next free point, and increment PRODUCED_CHARS.  The caller should
 724    ensure that C is 0..127, and declare and set the variables `dst'
 725    and `produced_chars' appropriately in advance.
 726 */
 727
 728
 729 #define EMIT_ONE_ASCII_BYTE(c)  \
 730   do {                          \
 731     produced_chars++;           \
 732     *dst++ = (c);               \
 733   } while (0)
 734
 735
 736 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1, then C2.  */
 737
 738 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 739   do {                                  \
 740     produced_chars += 2;                \
 741     *dst++ = (c1), *dst++ = (c2);       \
 742   } while (0)
 743
 744
 745 /* Store a byte C in the place pointed by DST and increment DST to the
 746    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 747    store it in multibyte form (bytes >= 0x80 go through BYTE8_TO_CHAR).
 748    The caller should declare and set the variables `dst', `multibytep'
 749    and `produced_chars' appropriately in advance.  */
 750
 751 #define EMIT_ONE_BYTE(c)                \
 752   do {                                  \
 753     produced_chars++;                   \
 754     if (multibytep)                     \
 755       {                                 \
 756         unsigned ch = (c);              \
 757         if (ch >= 0x80)                 \
 758           ch = BYTE8_TO_CHAR (ch);      \
 759         CHAR_STRING_ADVANCE (ch, dst);  \
 760       }                                 \
 761     else                                \
 762       *dst++ = (c);                     \
 763   } while (0)
 764
 765
 766 /* Like EMIT_ONE_BYTE, but emit two bytes: C1, then C2.  */
 767
 768 #define EMIT_TWO_BYTES(c1, c2)          \
 769   do {                                  \
 770     produced_chars += 2;                \
 771     if (multibytep)                     \
 772       {                                 \
 773         unsigned ch;                    \
 774                                         \
 775         ch = (c1);                      \
 776         if (ch >= 0x80)                 \
 777           ch = BYTE8_TO_CHAR (ch);      \
 778         CHAR_STRING_ADVANCE (ch, dst);  \
 779         ch = (c2);                      \
 780         if (ch >= 0x80)                 \
 781           ch = BYTE8_TO_CHAR (ch);      \
 782         CHAR_STRING_ADVANCE (ch, dst);  \
 783       }                                 \
 784     else                                \
 785       {                                 \
 786         *dst++ = (c1);                  \
 787         *dst++ = (c2);                  \
 788       }                                 \
 789   } while (0)
 790
 791
 792 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 793   do {                                  \
 794     EMIT_ONE_BYTE (c1);                 \
 795     EMIT_TWO_BYTES (c2, c3);            \
 796   } while (0)
 797
 798
 799 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 800   do {                                          \
 801     EMIT_TWO_BYTES (c1, c2);                    \
 802     EMIT_TWO_BYTES (c3, c4);                    \
 803   } while (0)
 804
 805
 806 static void
 807 record_conversion_result (struct coding_system *coding,
 808                           enum coding_result_code result)
 809 {
 810   coding->result = result;
 811   switch (result)
 812     {
 813     case CODING_RESULT_INSUFFICIENT_SRC:
 814       Vlast_code_conversion_error = Qinsufficient_source;
 815       break;
 816     case CODING_RESULT_INVALID_SRC:
 817       Vlast_code_conversion_error = Qinvalid_source;
 818       break;
 819     case CODING_RESULT_INTERRUPT:
 820       Vlast_code_conversion_error = Qinterrupted;
 821       break;
 822     case CODING_RESULT_INSUFFICIENT_DST:
 823       /* Don't record this error in Vlast_code_conversion_error
 824          because it happens just temporarily and is resolved when the
 825          whole conversion is finished.  */
 826       break;
 827     case CODING_RESULT_SUCCESS:
 828       break;
 829     default:
 830       Vlast_code_conversion_error = intern ("Unknown error");
 831     }
 832 }
 833
 834 /* These wrapper macros are used to preserve validity of pointers into
 835    buffer text across calls to decode_char, encode_char, etc, which
 836    could cause relocation of buffers if it loads a charset map,
 837    because loading a charset map allocates large structures.  */
 838
 839 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 840   do {                                                                       \
 841     ptrdiff_t offset;                                                        \
 842                                                                              \
 843     charset_map_loaded = 0;                                                  \
 844     c = DECODE_CHAR (charset, code);                                         \
 845     if (charset_map_loaded                                                   \
 846         && (offset = coding_change_source (coding)))                         \
 847       {                                                                      \
 848         src += offset;                                                       \
 849         src_base += offset;                                                  \
 850         src_end += offset;                                                   \
 851       }                                                                      \
 852   } while (0)
 853
 854 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 855   do {                                                                  \
 856     ptrdiff_t offset;                                                   \
 857                                                                         \
 858     charset_map_loaded = 0;                                             \
 859     code = ENCODE_CHAR (charset, c);                                    \
 860     if (charset_map_loaded                                              \
 861         && (offset = coding_change_destination (coding)))               \
 862       {                                                                 \
 863         dst += offset;                                                  \
 864         dst_end += offset;                                              \
 865       }                                                                 \
 866   } while (0)
 867
 868 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 869   do {                                                                  \
 870     ptrdiff_t offset;                                                   \
 871                                                                         \
 872     charset_map_loaded = 0;                                             \
 873     charset = char_charset (c, charset_list, code_return);              \
 874     if (charset_map_loaded                                              \
 875         && (offset = coding_change_destination (coding)))               \
 876       {                                                                 \
 877         dst += offset;                                                  \
 878         dst_end += offset;                                              \
 879       }                                                                 \
 880   } while (0)
 881
 882 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 883   do {                                                                  \
 884     ptrdiff_t offset;                                                   \
 885                                                                         \
 886     charset_map_loaded = 0;                                             \
 887     result = CHAR_CHARSET_P (c, charset);                               \
 888     if (charset_map_loaded                                              \
 889         && (offset = coding_change_destination (coding)))               \
 890       {                                                                 \
 891         dst += offset;                                                  \
 892         dst_end += offset;                                              \
 893       }                                                                 \
 894   } while (0)
 895
 896
 897 /* If there are at least BYTES length of room at dst, allocate memory
 898    for coding->destination and update dst and dst_end.  We don't have
 899    to take care of coding->source which will be relocated.  It is
 900    handled by calling coding_set_source in encode_coding.  */
 901
 902 #define ASSURE_DESTINATION(bytes)                               \
 903   do {                                                          \
 904     if (dst + (bytes) >= dst_end)                               \
 905       {                                                         \
 906         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 907                                                                 \
 908         dst = alloc_destination (coding, more_bytes, dst);      \
 909         dst_end = coding->destination + coding->dst_bytes;      \
 910       }                                                         \
 911   } while (0)
 912
 913
 914 /* Store multibyte form of the character C in P, and advance P to the
 915    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 916    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 917    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 918
 919 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 920
 921 /* Return the character code of character whose multibyte form is at
 922    P, and advance P to the end of the multibyte form.  This used to be
 923    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 924    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 925
 926 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 927
 928 /* Set coding->source from coding->src_object.  */
 929
 930 static void
 931 coding_set_source (struct coding_system *coding)
 932 {
 933   if (BUFFERP (coding->src_object))
 934     {
 935       struct buffer *buf = XBUFFER (coding->src_object);
 936
 937       if (coding->src_pos < 0)
 938         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 939       else
 940         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 941     }
 942   else if (STRINGP (coding->src_object))
 943     {
 944       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 945     }
 946   else
 947     {
 948       /* Otherwise, the source is C string and is never relocated
 949          automatically.  Thus we don't have to update anything.  */
 950     }
 951 }
 952
 953
 954 /* Set coding->source from coding->src_object, and return how many
 955    bytes coding->source was changed.  */
 956
 957 static ptrdiff_t
 958 coding_change_source (struct coding_system *coding)
 959 {
 960   const unsigned char *orig = coding->source;
 961   coding_set_source (coding);
 962   return coding->source - orig;
 963 }
 964
 965
 966 /* Set coding->destination from coding->dst_object.  */
 967
 968 static void
 969 coding_set_destination (struct coding_system *coding)
 970 {
 971   if (BUFFERP (coding->dst_object))
 972     {
 973       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 974         {
 975           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 976           coding->dst_bytes = (GAP_END_ADDR
 977                                - (coding->src_bytes - coding->consumed)
 978                                - coding->destination);
 979         }
 980       else
 981         {
 982           /* We are sure that coding->dst_pos_byte is before the gap
 983              of the buffer. */
 984           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 985                                  + coding->dst_pos_byte - BEG_BYTE);
 986           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 987                                - coding->destination);
 988         }
 989     }
 990   else
 991     {
 992       /* Otherwise, the destination is C string and is never relocated
 993          automatically.  Thus we don't have to update anything.  */
 994     }
 995 }
 996
 997
 998 /* Set coding->destination from coding->dst_object, and return how
 999    many bytes coding->destination was changed.  */
1000
1001 static ptrdiff_t
1002 coding_change_destination (struct coding_system *coding)
1003 {
1004   const unsigned char *orig = coding->destination;
1005   coding_set_destination (coding);
1006   return coding->destination - orig;
1007 }
1008
1009
1010 static void
1011 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1012 {
1013   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination,
1016                                   coding->dst_bytes + bytes);
1017   coding->dst_bytes += bytes;
1018 }
1019
1020 static void
1021 coding_alloc_by_making_gap (struct coding_system *coding,
1022                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1023 {
1024   if (EQ (coding->src_object, coding->dst_object))
1025     {
1026       /* The gap may contain the produced data at the head and not-yet
1027          consumed data at the tail.  To preserve those data, we at
1028          first make the gap size to zero, then increase the gap
1029          size.  */
1030       ptrdiff_t add = GAP_SIZE;
1031
1032       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1033       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1034       make_gap (bytes);
1035       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1036       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1037     }
1038   else
1039     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1040 }
1041
1042
1043 static unsigned char *
1044 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1045                    unsigned char *dst)
1046 {
1047   ptrdiff_t offset = dst - coding->destination;
1048
1049   if (BUFFERP (coding->dst_object))
1050     {
1051       struct buffer *buf = XBUFFER (coding->dst_object);
1052
1053       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1054     }
1055   else
1056     coding_alloc_by_realloc (coding, nbytes);
1057   coding_set_destination (coding);
1058   dst = coding->destination + offset;
1059   return dst;
1060 }
1061
1062 /** Macros for annotations.  */
1063
1064 /* An annotation data is stored in the array coding->charbuf in this
1065    format:
1066      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1067    LENGTH is the number of elements in the annotation.
1068    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1069    NCHARS is the number of characters in the text annotated.
1070
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1076
1077    NBYTES is the number of bytes specified in the header part of
1078    old-style emacs-mule encoding, or 0 for the other kind of
1079    composition.
1080
1081    METHOD is one of enum composition_method.
1082
1083    Optional COMPOSITION-COMPONENTS are characters and composition
1084    rules.
1085
1086    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1087    follows.
1088
   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1092
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated.  The caller
   must have `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that
   the macro followed by `;' forms exactly one statement and can be
   used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1103
/* Store a composition annotation at BUF and advance BUF past the
   five elements written: the annotation header (length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.  Every argument is parenthesized in the expansion, so BUF
   may be any modifiable pointer expression, not only a plain
   identifier.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1110
1111
/* Store a charset annotation at BUF and advance BUF past the four
   elements written: the annotation header (length 4, mask
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   Every argument is parenthesized in the expansion, so BUF may be
   any modifiable pointer expression, not only a plain identifier.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1117
1118
/* Bit flags for coding->eol_seen.  Each flag stands for one
   end-of-line form; several of them may be or'ed together.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1125
1126 \f
1127 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1128
1129
1130
1131 \f
1132 /*** 3. UTF-8 ***/
1133
1134 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1135    Return true if a text is encoded in UTF-8.  */
1136
/* Predicates classifying a byte C by its role in a UTF-8 sequence.  */

/* C stands for itself as a one-octet sequence (0xxxxxxx).  */
#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
/* C is a trailing octet (10xxxxxx).  */
#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
/* C leads a two-octet sequence (110xxxxx).  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C leads a three-octet sequence (1110xxxx).  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C leads a four-octet sequence (11110xxx).  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C leads a five-octet sequence (111110xx).  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 byte order mark (signature).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1147
/* Unlike the other detect_coding_XXX, this function counts the number
   of characters and checks the EOL format.

   Scan the source of CODING for well-formed UTF-8 byte sequences and
   record the outcome in DETECT_INFO.  Return false, with
   CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected, if a malformed
   sequence is found, or if the source ends inside a sequence while
   CODING_MODE_LAST_BLOCK is set.  Otherwise return true and store the
   scanned byte and character counts in CODING->detected_utf8_bytes
   and CODING->detected_utf8_chars.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  /* SRC_BASE always marks the start of the sequence being examined.
     MULTIBYTEP and CONSUMED_CHARS are not referenced directly below;
     presumably they are used by ONE_MORE_BYTE (defined elsewhere).  */
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  ptrdiff_t nchars = coding->head_ascii;
  /* NOTE(review): EOL_SEEN is updated in the loop below but is not
     stored back into CODING within this function -- confirm whether
     that is intended.  */
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  /* So skip the leading bytes already counted in coding->head_ascii.  */
  src += nchars;

  /* NOTE(review): `src + 3 < src_end' requires at least four bytes,
     so a source consisting of the three BOM bytes only is not
     treated as having a BOM -- confirm whether that is intended.  */
  if (src == coding->source     /* BOM should be at the head.  */
      && src + 3 < src_end      /* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      nchars++;
    }

  /* Each iteration examines one sequence.  Breaking out of the loop
     means a malformed sequence was found.  The loop is otherwise left
     only through the label `no_more_source'; presumably ONE_MORE_BYTE
     jumps there when the source is exhausted.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        {
          nchars++;
          if (c == '\r')
            {
              /* CR immediately followed by LF is counted as a CRLF
                 pair; the LF is consumed here as well.  */
              if (src < src_end && *src == '\n')
                {
                  eol_seen |= EOL_SEEN_CRLF;
                  src++;
                  nchars++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
          else if (c == '\n')
            eol_seen |= EOL_SEEN_LF;
          continue;
        }
      /* C is not a one-octet character.  Read trailing octets one at
         a time, and accept the sequence as soon as the number read
         matches what the leading octet C announces.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      /* C is not a valid leading octet.  */
      break;
    }
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* SRC_BASE < SRC means the source ended in the middle of a
     sequence.  That is an error only if no more source will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF is not necessarily a signature, so the
         category without signature is reported as found, too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
        /* The found characters are fewer than the source bytes, which
           means that we found at least one valid non-ASCII
           character.  */
        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  /* Only whole sequences are counted: the byte count stops at the
     start of the last (possibly incomplete) sequence.  */
  coding->detected_utf8_bytes = src_base - coding->source;
  coding->detected_utf8_chars = nchars;
  return 1;
}
1262
1263
/* Decode the not-yet consumed part of the source of CODING as UTF-8
   and append the resulting character codes to CODING->charbuf.
   Decoding stops when the source is exhausted or when the charbuf is
   full.  On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect the last completely decoded sequence.
   A byte that cannot start or continue a valid sequence is stored as
   a raw-byte character (BYTE8_TO_CHAR).  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the sequence being decoded; decoding resumes from here
     when the sequence turns out to be invalid or incomplete.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* When EOL_DOS, the byte read ahead right after a CR, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      /* Skip a BOM (EF BB BF) at the current position.  If the bytes
         there are anything else, rewind SRC so that they are decoded
         as ordinary text.  */
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found above, do not look for a BOM again on later
     calls for this CODING.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Checkpoint: everything before this point is fully decoded.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The charbuf is full.  A byte read ahead after a CR has
             not been processed yet; step back so that it is read
             again on the next call.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* Copy ASCII bytes four at a time; leave the loop at the
             first byte that has its high bit set.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* Use the byte read ahead after a CR, if any, before reading
         from the source again.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* Presumably ONE_MORE_BYTE (defined elsewhere) yields the
             negated code of a character that is not a byte in a
             multibyte source; store that character as is.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS-style EOL, read the byte that follows a CR right
             away.  If the source ends here, the CR is not consumed
             (SRC_BASE still points at it).  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Five-octet sequences are accepted only up
                             to MAX_CHAR, and must not be overlong.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and emit only its
         first byte, as a raw-byte character unless it is ASCII.  The
         bytes after it are examined again on the next iteration.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
    }

 no_more_source:
  /* Record progress up to the last checkpoint only, so that an
     incomplete trailing sequence is decoded again once more source
     is available.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1451
1452
1453 static bool
1454 encode_coding_utf_8 (struct coding_system *coding)
1455 {
1456   bool multibytep = coding->dst_multibyte;
1457   int *charbuf = coding->charbuf;
1458   int *charbuf_end = charbuf + coding->charbuf_used;
1459   unsigned char *dst = coding->destination + coding->produced;
1460   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1461   ptrdiff_t produced_chars = 0;
1462   int c;
1463
1464   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1465     {
1466       ASSURE_DESTINATION (3);
1467       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1468       CODING_UTF_8_BOM (coding) = utf_without_bom;
1469     }
1470
1471   if (multibytep)
1472     {
1473       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1474
1475       while (charbuf < charbuf_end)
1476         {
1477           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1478
1479           ASSURE_DESTINATION (safe_room);
1480           c = *charbuf++;
1481           if (CHAR_BYTE8_P (c))
1482             {
1483               c = CHAR_TO_BYTE8 (c);
1484               EMIT_ONE_BYTE (c);
1485             }
1486           else
1487             {
1488               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1489               for (p = str; p < pend; p++)
1490                 EMIT_ONE_BYTE (*p);
1491             }
1492         }
1493     }
1494   else
1495     {
1496       int safe_room = MAX_MULTIBYTE_LENGTH;
1497
1498       while (charbuf < charbuf_end)
1499         {
1500           ASSURE_DESTINATION (safe_room);
1501           c = *charbuf++;
1502           if (CHAR_BYTE8_P (c))
1503             *dst++ = CHAR_TO_BYTE8 (c);
1504           else
1505             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1506         }
1507       produced_chars = dst - (coding->destination + coding->produced);
1508     }
1509   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1510   coding->produced_char += produced_chars;
1511   coding->produced = dst - coding->destination;
1512   return 0;
1513 }
1514
1515
1516 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1517    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1518
/* True if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) (((val) & 0xFC00) == 0xD800)

/* True if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (((val) & 0xFC00) == 0xDC00)
1524
1525
/* Examine the source of CODING for UTF-16 and record the outcome in
   DETECT_INFO.  If the first two bytes are a BOM (FF FE or FE FF),
   mark the matching endianness as found and return true.  If they
   are not, no category is marked as found; a heuristic only rejects
   the BOM-less categories for text that looks like binary data, and
   the function returns false.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  /* MULTIBYTEP and SRC_END are not referenced directly below;
     presumably they are used by TWO_MORE_BYTES (defined elsewhere).  */
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A complete text of odd length is rejected outright.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  /* Presumably TWO_MORE_BYTES jumps to `no_more_source' when fewer
     than two bytes are left.  */
  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.*/
      /* E and O flag the byte values seen so far at even and odd
         positions; E_NUM and O_NUM count the distinct values.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* There is no BOM, so the categories that require one are
         rejected.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Continue until every UTF-16 category has been rejected or a
         byte pair cannot be read.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* At least 128 distinct values at even positions.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* At least 128 distinct values at odd positions.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

  /* Reached by falling through after a BOM was found, and via the
     label.  */
 no_more_source:
  return 1;
}
1608
1609 static void
1610 decode_coding_utf_16 (struct coding_system *coding)
1611 {
1612   const unsigned char *src = coding->source + coding->consumed;
1613   const unsigned char *src_end = coding->source + coding->src_bytes;
1614   const unsigned char *src_base;
1615   int *charbuf = coding->charbuf + coding->charbuf_used;
1616   /* We may produces at most 3 chars in one loop.  */
1617   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1618   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1619   bool multibytep = coding->src_multibyte;
1620   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1621   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1622   int surrogate = CODING_UTF_16_SURROGATE (coding);
1623   bool eol_dos
1624     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1625   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1626
1627   if (bom == utf_with_bom)
1628     {
1629       int c, c1, c2;
1630
1631       src_base = src;
1632       ONE_MORE_BYTE (c1);
1633       ONE_MORE_BYTE (c2);
1634       c = (c1 << 8) | c2;
1635
1636       if (endian == utf_16_big_endian
1637           ? c != 0xFEFF : c != 0xFFFE)
1638         {
1639           /* The first two bytes are not BOM.  Treat them as bytes
1640              for a normal character.  */
1641           src = src_base;
1642         }
1643       CODING_UTF_16_BOM (coding) = utf_without_bom;
1644     }
1645   else if (bom == utf_detect_bom)
1646     {
1647       /* We have already tried to detect BOM and failed in
1648          detect_coding.  */
1649       CODING_UTF_16_BOM (coding) = utf_without_bom;
1650     }
1651
1652   while (1)
1653     {
1654       int c, c1, c2;
1655
1656       src_base = src;
1657       consumed_chars_base = consumed_chars;
1658
1659       if (charbuf >= charbuf_end)
1660         {
1661           if (byte_after_cr1 >= 0)
1662             src_base -= 2;
1663           break;
1664         }
1665
1666       if (byte_after_cr1 >= 0)
1667         c1 = byte_after_cr1, byte_after_cr1 = -1;
1668       else
1669         ONE_MORE_BYTE (c1);
1670       if (c1 < 0)
1671         {
1672           *charbuf++ = -c1;
1673           continue;
1674         }
1675       if (byte_after_cr2 >= 0)
1676         c2 = byte_after_cr2, byte_after_cr2 = -1;
1677       else
1678         ONE_MORE_BYTE (c2);
1679       if (c2 < 0)
1680         {
1681           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1682           *charbuf++ = -c2;
1683           continue;
1684         }
1685       c = (endian == utf_16_big_endian
1686            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1687
1688       if (surrogate)
1689         {
1690           if (! UTF_16_LOW_SURROGATE_P (c))
1691             {
1692               if (endian == utf_16_big_endian)
1693                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1694               else
1695                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1696               *charbuf++ = c1;
1697               *charbuf++ = c2;
1698               if (UTF_16_HIGH_SURROGATE_P (c))
1699                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1700               else
1701                 *charbuf++ = c;
1702             }
1703           else
1704             {
1705               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1706               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1707               *charbuf++ = 0x10000 + c;
1708             }
1709         }
1710       else
1711         {
1712           if (UTF_16_HIGH_SURROGATE_P (c))
1713             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1714           else
1715             {
1716               if (eol_dos && c == '\r')
1717                 {
1718                   ONE_MORE_BYTE (byte_after_cr1);
1719                   ONE_MORE_BYTE (byte_after_cr2);
1720                 }
1721               *charbuf++ = c;
1722             }
1723         }
1724     }
1725
1726  no_more_source:
1727   coding->consumed_char += consumed_chars_base;
1728   coding->consumed = src_base - coding->source;
1729   coding->charbuf_used = charbuf - coding->charbuf;
1730 }
1731
1732 static bool
1733 encode_coding_utf_16 (struct coding_system *coding)
1734 {
1735   bool multibytep = coding->dst_multibyte;
1736   int *charbuf = coding->charbuf;
1737   int *charbuf_end = charbuf + coding->charbuf_used;
1738   unsigned char *dst = coding->destination + coding->produced;
1739   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1740   int safe_room = 8;
1741   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1742   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1743   ptrdiff_t produced_chars = 0;
1744   int c;
1745
1746   if (bom != utf_without_bom)
1747     {
1748       ASSURE_DESTINATION (safe_room);
1749       if (big_endian)
1750         EMIT_TWO_BYTES (0xFE, 0xFF);
1751       else
1752         EMIT_TWO_BYTES (0xFF, 0xFE);
1753       CODING_UTF_16_BOM (coding) = utf_without_bom;
1754     }
1755
1756   while (charbuf < charbuf_end)
1757     {
1758       ASSURE_DESTINATION (safe_room);
1759       c = *charbuf++;
1760       if (c > MAX_UNICODE_CHAR)
1761         c = coding->default_char;
1762
1763       if (c < 0x10000)
1764         {
1765           if (big_endian)
1766             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1767           else
1768             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1769         }
1770       else
1771         {
1772           int c1, c2;
1773
1774           c -= 0x10000;
1775           c1 = (c >> 10) + 0xD800;
1776           c2 = (c & 0x3FF) + 0xDC00;
1777           if (big_endian)
1778             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1779           else
1780             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1781         }
1782     }
1783   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1784   coding->produced = dst - coding->destination;
1785   coding->produced_char += produced_chars;
1786   return 0;
1787 }
1788
1789 \f
1790 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1791
1792 /* Emacs' internal format for representation of multiple character
1793    sets is a kind of multi-byte encoding, i.e. characters are
1794    represented by variable-length sequences of one-byte codes.
1795
1796    ASCII characters and control characters (e.g. `tab', `newline') are
1797    represented by one-byte sequences which are their ASCII codes, in
1798    the range 0x00 through 0x7F.
1799
1800    8-bit characters of the range 0x80..0x9F are represented by
1801    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1802    code + 0x20).
1803
1804    8-bit characters of the range 0xA0..0xFF are represented by
1805    one-byte sequences which are their 8-bit code.
1806
1807    The other characters are represented by a sequence of `base
1808    leading-code', optional `extended leading-code', and one or two
1809    `position-code's.  The length of the sequence is determined by the
1810    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1811    whereas extended leading-code and position-code take the range 0xA0
1812    through 0xFF.  See `charset.h' for more details about leading-code
1813    and position-code.
1814
1815    --- CODE RANGE of Emacs' internal format ---
1816    character set        range
1817    -------------        -----
1818    ascii                0x00..0x7F
1819    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1820    eight-bit-graphic    0xA0..0xFF
1821    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1822    ---------------------------------------------
1823
1824    As this is the internal character representation, the format is
1825    usually not used externally (i.e. in a file or in data sent to a
1826    process).  However, it is possible to have text stored externally in
1827    this format (i.e. by encoding with the coding system `emacs-mule').
1828
1829    In that case, a sequence of one-byte codes has a slightly different
1830    form.
1831
1832    At first, all characters in eight-bit-control are represented by
1833    one-byte sequences which are their 8-bit code.
1834
1835    Next, character composition data are represented by the byte
1836    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1837    where,
1838         METHOD is 0xF2 plus one of composition method (enum
1839         composition_method),
1840
1841         BYTES is 0xA0 plus a byte length of this composition data,
1842
1843         CHARS is 0xA0 plus a number of characters composed by this
1844         data,
1845
1846         COMPONENTs are characters of multibyte form or composition
1847         rules encoded by two-byte of ASCII codes.
1848
1849    In addition, for backward compatibility, the following formats are
1850    also recognized as composition data on decoding.
1851
1852    0x80 MSEQ ...
1853    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1854
1855    Here,
1856         MSEQ is a multibyte form, but in this special format:
1857           ASCII: 0xA0 ASCII_CODE+0x80,
1858           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1859         RULE is a one byte code of the range 0xA0..0xF0 that
1860         represents a composition rule.
1861   */
1862
1863 char emacs_mule_bytes[256];  /* Indexed by a byte value.  For a leading
                                     byte, detect_coding_emacs_mule reads
                                     (entry - 1) following bytes, and
                                     emacs_mule_char handles the entry
                                     values 1 through 4 and aborts on any
                                     other value.  */
1864
1865
1866 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1867    Return true if a text is encoded in 'emacs-mule'.  */
1868
1869 static bool
1870 detect_coding_emacs_mule (struct coding_system *coding,
1871                           struct coding_detection_info *detect_info)
1872 {
1873   const unsigned char *src = coding->source, *src_base;
1874   const unsigned char *src_end = coding->source + coding->src_bytes;
1875   bool multibytep = coding->src_multibyte;
1876   ptrdiff_t consumed_chars = 0;
1877   int c;
1878   int found = 0;
1879
1880   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1881   /* A coding system of this category is always ASCII compatible.  */
1882   src += coding->head_ascii;
1883
1884   while (1)
1885     {
1886       src_base = src;
1887       ONE_MORE_BYTE (c);
1888       if (c < 0)
1889         continue;
1890       if (c == 0x80)
1891         {
1892           /* Perhaps the start of composite character.  We simply skip
1893              it because analyzing it is too heavy for detecting.  But,
1894              at least, we check that the composite character
1895              constitutes of more than 4 bytes.  */
1896           const unsigned char *src_start;
1897
1898         repeat:
1899           src_start = src;
1900           do
1901             {
1902               ONE_MORE_BYTE (c);
1903             }
1904           while (c >= 0xA0);
1905
1906           if (src - src_start <= 4)
1907             break;
1908           found = CATEGORY_MASK_EMACS_MULE;
1909           if (c == 0x80)
1910             goto repeat;
1911         }
1912
1913       if (c < 0x80)
1914         {
1915           if (c < 0x20
1916               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1917             break;
1918         }
1919       else
1920         {
1921           int more_bytes = emacs_mule_bytes[c] - 1;
1922
1923           while (more_bytes > 0)
1924             {
1925               ONE_MORE_BYTE (c);
1926               if (c < 0xA0)
1927                 {
1928                   src--;        /* Unread the last byte.  */
1929                   break;
1930                 }
1931               more_bytes--;
1932             }
1933           if (more_bytes != 0)
1934             break;
1935           found = CATEGORY_MASK_EMACS_MULE;
1936         }
1937     }
1938   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1939   return 0;
1940
1941  no_more_source:
1942   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1943     {
1944       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1945       return 0;
1946     }
1947   detect_info->found |= found;
1948   return 1;
1949 }
1950
1951
1952 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1953    character.  If CMP_STATUS indicates that we must expect MSEQ or
1954    RULE described above, decode it and return the negative value of
1955    the decoded character or rule.  If an invalid byte is found, return
1956    -1.  If SRC is too short, return -2.  */
1957
1958 static int
1959 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1960                  int *nbytes, int *nchars, int *id,
1961                  struct composition_status *cmp_status)
1962 {
1963   const unsigned char *src_end = coding->source + coding->src_bytes;
1964   const unsigned char *src_base = src;
1965   bool multibytep = coding->src_multibyte;
1966   int charset_ID;
1967   unsigned code;
1968   int c;
1969   ptrdiff_t consumed_chars = 0;
1970   bool mseq_found = 0;
1971
1972   ONE_MORE_BYTE (c);
1973   if (c < 0)
1974     {
1975       c = -c;
1976       charset_ID = emacs_mule_charset[0];
1977     }
1978   else
1979     {
1980       if (c >= 0xA0)
1981         {
1982           if (cmp_status->state != COMPOSING_NO
1983               && cmp_status->old_form)
1984             {
1985               if (cmp_status->state == COMPOSING_CHAR)
1986                 {
1987                   if (c == 0xA0)
1988                     {
1989                       ONE_MORE_BYTE (c);
1990                       c -= 0x80;
1991                       if (c < 0)
1992                         goto invalid_code;
1993                     }
1994                   else
1995                     c -= 0x20;
1996                   mseq_found = 1;
1997                 }
1998               else
1999                 {
2000                   *nbytes = src - src_base;
2001                   *nchars = consumed_chars;
2002                   return -c;
2003                 }
2004             }
2005           else
2006             goto invalid_code;
2007         }
2008
2009       switch (emacs_mule_bytes[c])
2010         {
2011         case 2:
2012           if ((charset_ID = emacs_mule_charset[c]) < 0)
2013             goto invalid_code;
2014           ONE_MORE_BYTE (c);
2015           if (c < 0xA0)
2016             goto invalid_code;
2017           code = c & 0x7F;
2018           break;
2019
2020         case 3:
2021           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2022               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2023             {
2024               ONE_MORE_BYTE (c);
2025               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2026                 goto invalid_code;
2027               ONE_MORE_BYTE (c);
2028               if (c < 0xA0)
2029                 goto invalid_code;
2030               code = c & 0x7F;
2031             }
2032           else
2033             {
2034               if ((charset_ID = emacs_mule_charset[c]) < 0)
2035                 goto invalid_code;
2036               ONE_MORE_BYTE (c);
2037               if (c < 0xA0)
2038                 goto invalid_code;
2039               code = (c & 0x7F) << 8;
2040               ONE_MORE_BYTE (c);
2041               if (c < 0xA0)
2042                 goto invalid_code;
2043               code |= c & 0x7F;
2044             }
2045           break;
2046
2047         case 4:
2048           ONE_MORE_BYTE (c);
2049           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2050             goto invalid_code;
2051           ONE_MORE_BYTE (c);
2052           if (c < 0xA0)
2053             goto invalid_code;
2054           code = (c & 0x7F) << 8;
2055           ONE_MORE_BYTE (c);
2056           if (c < 0xA0)
2057             goto invalid_code;
2058           code |= c & 0x7F;
2059           break;
2060
2061         case 1:
2062           code = c;
2063           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2064           break;
2065
2066         default:
2067           emacs_abort ();
2068         }
2069       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2070                           CHARSET_FROM_ID (charset_ID), code, c);
2071       if (c < 0)
2072         goto invalid_code;
2073     }
2074   *nbytes = src - src_base;
2075   *nchars = consumed_chars;
2076   if (id)
2077     *id = charset_ID;
2078   return (mseq_found ? -c : c);
2079
2080  no_more_source:
2081   return -2;
2082
2083  invalid_code:
2084   return -1;
2085 }
2086
2087
2088 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2089
2090 /* Handle these composition sequences ('|': the end of header elements,
2091    BYTES and CHARS >= 0xA0):
2092
2093    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2094    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2095    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2096
2097    and these old form:
2098
2099    (4) relative composition: 0x80 | MSEQ ... MSEQ
2100    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2101
2102    When the starter 0x80 and the following header elements are found,
2103    this annotation header is produced.
2104
2105         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2106
2107    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2109
2110    Then, upon reading the following elements, these codes are produced
2111    until the composition end is found:
2112
2113    (1) CHAR ... CHAR
2114    (2) ALT ... ALT CHAR ... CHAR
2115    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2116    (4) CHAR ... CHAR
2117    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2118
2119    When the composition end is found, LENGTH and NCHARS in the
2120    annotation header are updated as below:
2121
2122    (1) LENGTH: unchanged, NCHARS: unchanged
2123    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2125    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2126    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2127
2128    If an error is found while composing, the annotation header is
2129    changed to the original composition header (plus filler -1s) as
2130    below:
2131
2132    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2133    (5)          [ 0x80 0xFF -1 -1- -1 ]
2134
2135    and the sequence [ -2 DECODED-RULE ] is changed to the original
2136    byte sequence as below:
2137         o the original byte sequence is B: [ B -1 ]
2138         o the original byte sequence is B1 B2: [ B1 B2 ]
2139
2140    Most of the routines are implemented by macros because many
2141    variables and labels in the caller decode_coding_emacs_mule must be
2142    accessible, and they are usually called just once (thus doesn't
2143    increase the size of compiled object).  */
2144
/* Decode the composition rule held in C, a one-byte component of a
   composition sequence of Emacs 20 style, and set RULE to the decoded
   rule.  C is modified (0xA0 is subtracted from it).  Jump to
   invalid_code if the result is outside 0..80.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int ref_g, ref_n;                                           \
                                                                \
    c -= 0xA0;                                                  \
    if (! (0 <= c && c < 81))                                   \
      goto invalid_code;                                        \
    /* C packs two values in 0..8; 4 is remapped to 10.  */     \
    ref_g = c / 9;                                              \
    ref_n = c % 9;                                              \
    if (ref_g == 4)                                             \
      ref_g = 10;                                               \
    if (ref_n == 4)                                             \
      ref_n = 10;                                               \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2161
2162
/* Decode the composition rule made of C and the byte that follows it
   at SRC, a component of a composition sequence of Emacs 21 style,
   and set RULE to the decoded rule.  The following byte is read into
   C.  Jump to invalid_code if either byte, less 0x20, is outside
   0..80.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)          \
  do {                                                          \
    int ref_g, ref_n;                                           \
                                                                \
    ref_g = c - 0x20;                                           \
    if (! (0 <= ref_g && ref_g < 81))                           \
      goto invalid_code;                                        \
    ONE_MORE_BYTE (c);                                          \
    ref_n = c - 0x20;                                           \
    if (! (0 <= ref_n && ref_n < 81))                           \
      goto invalid_code;                                        \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2180
2181
2182 /* Start of Emacs 21 style format.  On entry C holds the METHOD byte
2183    (0xF2 plus the composition method); the next two bytes read from
2184    SRC are BYTES (0xA0 plus the byte length of this composition
2185    information) and CHARS (0xA0 plus the number of characters composed
        by this composition).

        Jump to invalid_code if the BYTES byte is read as a negative value,
        if the byte length is less than 3 (or is not 4 for a relative
        composition), or if the number of characters is not between 1 and
        MAX_COMPOSITION_COMPONENTS - 1.

        Otherwise record a new-form composition in CMP_STATUS (state
        COMPOSING_CHAR for a relative composition, COMPOSING_COMPONENT_CHAR
        for the other methods; NCOMPS is the byte length minus 4) and add
        the composition annotation with ADD_COMPOSITION_DATA.  */
2186
2187 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2188   do {                                                                  \
2189     enum composition_method method = c - 0xF2;                          \
2190     int nbytes, nchars;                                                 \
2191                                                                         \
2192     ONE_MORE_BYTE (c);                                                  \
2193     if (c < 0)                                                          \
2194       goto invalid_code;                                                \
2195     nbytes = c - 0xA0;                                                  \
2196     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2197       goto invalid_code;                                                \
2198     ONE_MORE_BYTE (c);                                                  \
2199     nchars = c - 0xA0;                                                  \
2200     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2201       goto invalid_code;                                                \
2202     cmp_status->old_form = 0;                                           \
2203     cmp_status->method = method;                                        \
2204     if (method == COMPOSITION_RELATIVE)                                 \
2205       cmp_status->state = COMPOSING_CHAR;                               \
2206     else                                                                \
2207       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2208     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2209     cmp_status->nchars = nchars;                                        \
2210     cmp_status->ncomps = nbytes - 4;                                    \
2211     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2212   } while (0)
2213
2214
2215 /* Start of Emacs 20 style format for relative composition.  Record
        in CMP_STATUS an old-form COMPOSITION_RELATIVE composition in
        state COMPOSING_CHAR with no characters or components counted yet,
        and add a composition annotation whose NCHARS and NBYTES are both
        0 with ADD_COMPOSITION_DATA.  */
2216
2217 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2218   do {                                                          \
2219     cmp_status->old_form = 1;                                   \
2220     cmp_status->method = COMPOSITION_RELATIVE;                  \
2221     cmp_status->state = COMPOSING_CHAR;                         \
2222     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2223     cmp_status->nchars = cmp_status->ncomps = 0;                \
2224     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2225   } while (0)
2226
2227
2228 /* Start of Emacs 20 style format for rule-base composition.  Record
        in CMP_STATUS an old-form COMPOSITION_WITH_RULE composition in
        state COMPOSING_CHAR with no characters or components counted yet,
        and add a composition annotation whose NCHARS and NBYTES are both
        0 with ADD_COMPOSITION_DATA.  */
2229
2230 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2231   do {                                                          \
2232     cmp_status->old_form = 1;                                   \
2233     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2234     cmp_status->state = COMPOSING_CHAR;                         \
2235     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2236     cmp_status->nchars = cmp_status->ncomps = 0;                \
2237     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2238   } while (0)
2239
2240
/* Start decoding a composition after the starter byte 0x80 has been
   read.  Read the next byte into C and dispatch on it:

     0xF2 + COMPOSITION_RELATIVE .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS
        Emacs 21 style header (DECODE_EMACS_MULE_21_COMPOSITION);
     0xA0 .. 0xBF
        Emacs 20 style relative composition; C itself is the first
        byte of a component, so SRC is moved back to read it again;
     0xFF
        Emacs 20 style rule-base composition.

   Jump to invalid_code for any other byte, or if C is read as a
   negative value.  */
2241 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2242   do {                                                  \
2243     const unsigned char *current_src = src;             \
2244                                                         \
2245     ONE_MORE_BYTE (c);                                  \
2246     if (c < 0)                                          \
2247       goto invalid_code;                                \
2248     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2249         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2250       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2251     else if (c < 0xA0)                                  \
2252       goto invalid_code;                                \
2253     else if (c < 0xC0)                                  \
2254       {                                                 \
2255         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2256         /* Re-read C as a composition component.  */    \
2257         src = current_src;                              \
2258       }                                                 \
2259     else if (c == 0xFF)                                 \
2260       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2261     else                                                \
2262       goto invalid_code;                                \
2263   } while (0)
2264
/* Finish the composition being decoded.  The annotation header starts
   CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, store the number of characters read so far in the
   third header element.  For a new-form composition whose method is
   greater than COMPOSITION_RELATIVE, set the first header element to
   the third header element minus CMP_STATUS->length.  In all cases
   set the state to COMPOSING_NO.  */
2265 #define EMACS_MULE_COMPOSITION_END()                            \
2266   do {                                                          \
2267     int idx = - cmp_status->length;                             \
2268                                                                 \
2269     if (cmp_status->old_form)                                   \
2270       charbuf[idx + 2] = cmp_status->nchars;                    \
2271     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2272       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2273     cmp_status->state = COMPOSING_NO;                           \
2274   } while (0)
2275
2276
2277 static int
2278 emacs_mule_finish_composition (int *charbuf,
2279                                struct composition_status *cmp_status)
2280 {
2281   int idx = - cmp_status->length;
2282   int new_chars;
2283
2284   if (cmp_status->old_form && cmp_status->nchars > 0)
2285     {
2286       charbuf[idx + 2] = cmp_status->nchars;
2287       new_chars = 0;
2288       if (cmp_status->method == COMPOSITION_WITH_RULE
2289           && cmp_status->state == COMPOSING_CHAR)
2290         {
2291           /* The last rule was invalid.  */
2292           int rule = charbuf[-1] + 0xA0;
2293
2294           charbuf[-2] = BYTE8_TO_CHAR (rule);
2295           charbuf[-1] = -1;
2296           new_chars = 1;
2297         }
2298     }
2299   else
2300     {
2301       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2302
2303       if (cmp_status->method == COMPOSITION_WITH_RULE)
2304         {
2305           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2306           charbuf[idx++] = -3;
2307           charbuf[idx++] = 0;
2308           new_chars = 1;
2309         }
2310       else
2311         {
2312           int nchars = charbuf[idx + 1] + 0xA0;
2313           int nbytes = charbuf[idx + 2] + 0xA0;
2314
2315           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2317           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2318           charbuf[idx++] = -1;
2319           new_chars = 4;
2320         }
2321     }
2322   cmp_status->state = COMPOSING_NO;
2323   return new_chars;
2324 }
2325
/* If a composition is being decoded (the state in CMP_STATUS is not
   COMPOSING_NO), finish it with emacs_mule_finish_composition and add
   the value it returns to CHAR_OFFSET.  */
2326 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2327   do {                                                                    \
2328     if (cmp_status->state != COMPOSING_NO)                                \
2329       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2330   } while (0)
2331
2332
2333 static void
2334 decode_coding_emacs_mule (struct coding_system *coding)
2335 {
2336   const unsigned char *src = coding->source + coding->consumed;
2337   const unsigned char *src_end = coding->source + coding->src_bytes;
2338   const unsigned char *src_base;
2339   int *charbuf = coding->charbuf + coding->charbuf_used;
2340   /* We may produce two annotations (charset and composition) in one
2341      loop and one more charset annotation at the end.  */
2342   int *charbuf_end
2343     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2344       /* We can produce up to 2 characters in a loop.  */
2345       - 1;
2346   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2347   bool multibytep = coding->src_multibyte;
2348   ptrdiff_t char_offset = coding->produced_char;
2349   ptrdiff_t last_offset = char_offset;
2350   int last_id = charset_ascii;
2351   bool eol_dos
2352     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2353   int byte_after_cr = -1;
2354   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2355
2356   if (cmp_status->state != COMPOSING_NO)
2357     {
2358       int i;
2359
2360       if (charbuf_end - charbuf < cmp_status->length)
2361         emacs_abort ();
2362       for (i = 0; i < cmp_status->length; i++)
2363         *charbuf++ = cmp_status->carryover[i];
2364       coding->annotated = 1;
2365     }
2366
2367   while (1)
2368     {
2369       int c, id IF_LINT (= 0);
2370
2371       src_base = src;
2372       consumed_chars_base = consumed_chars;
2373
2374       if (charbuf >= charbuf_end)
2375         {
2376           if (byte_after_cr >= 0)
2377             src_base--;
2378           break;
2379         }
2380
2381       if (byte_after_cr >= 0)
2382         c = byte_after_cr, byte_after_cr = -1;
2383       else
2384         ONE_MORE_BYTE (c);
2385
2386       if (c < 0 || c == 0x80)
2387         {
2388           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2389           if (c < 0)
2390             {
2391               *charbuf++ = -c;
2392               char_offset++;
2393             }
2394           else
2395             DECODE_EMACS_MULE_COMPOSITION_START ();
2396           continue;
2397         }
2398
2399       if (c < 0x80)
2400         {
2401           if (eol_dos && c == '\r')
2402             ONE_MORE_BYTE (byte_after_cr);
2403           id = charset_ascii;
2404           if (cmp_status->state != COMPOSING_NO)
2405             {
2406               if (cmp_status->old_form)
2407                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2409                 cmp_status->ncomps--;
2410             }
2411         }
2412       else
2413         {
2414           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2415           /* emacs_mule_char can load a charset map from a file, which
2416              allocates a large structure and might cause buffer text
2417              to be relocated as result.  Thus, we need to remember the
2418              original pointer to buffer text, and fix up all related
2419              pointers after the call.  */
2420           const unsigned char *orig = coding->source;
2421           ptrdiff_t offset;
2422
2423           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2424                                cmp_status);
2425           offset = coding->source - orig;
2426           if (offset)
2427             {
2428               src += offset;
2429               src_base += offset;
2430               src_end += offset;
2431             }
2432           if (c < 0)
2433             {
2434               if (c == -1)
2435                 goto invalid_code;
2436               if (c == -2)
2437                 break;
2438             }
2439           src = src_base + nbytes;
2440           consumed_chars = consumed_chars_base + nchars;
2441           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2442             cmp_status->ncomps -= nchars;
2443         }
2444
2445       /* Now if C >= 0, we found a normally encoded character, if C <
2446          0, we found an old-style composition component character or
2447          rule.  */
2448
2449       if (cmp_status->state == COMPOSING_NO)
2450         {
2451           if (last_id != id)
2452             {
2453               if (last_id != charset_ascii)
2454                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2455                                   last_id);
2456               last_id = id;
2457               last_offset = char_offset;
2458             }
2459           *charbuf++ = c;
2460           char_offset++;
2461         }
2462       else if (cmp_status->state == COMPOSING_CHAR)
2463         {
2464           if (cmp_status->old_form)
2465             {
2466               if (c >= 0)
2467                 {
2468                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2469                   *charbuf++ = c;
2470                   char_offset++;
2471                 }
2472               else
2473                 {
2474                   *charbuf++ = -c;
2475                   cmp_status->nchars++;
2476                   cmp_status->length++;
2477                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2478                     EMACS_MULE_COMPOSITION_END ();
2479                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2480                     cmp_status->state = COMPOSING_RULE;
2481                 }
2482             }
2483           else
2484             {
2485               *charbuf++ = c;
2486               cmp_status->length++;
2487               cmp_status->nchars--;
2488               if (cmp_status->nchars == 0)
2489                 EMACS_MULE_COMPOSITION_END ();
2490             }
2491         }
2492       else if (cmp_status->state == COMPOSING_RULE)
2493         {
2494           int rule;
2495
2496           if (c >= 0)
2497             {
2498               EMACS_MULE_COMPOSITION_END ();
2499               *charbuf++ = c;
2500               char_offset++;
2501             }
2502           else
2503             {
2504               c = -c;
2505               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2506               if (rule < 0)
2507                 goto invalid_code;
2508               *charbuf++ = -2;
2509               *charbuf++ = rule;
2510               cmp_status->length += 2;
2511               cmp_status->state = COMPOSING_CHAR;
2512             }
2513         }
2514       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2515         {
2516           *charbuf++ = c;
2517           cmp_status->length++;
2518           if (cmp_status->ncomps == 0)
2519             cmp_status->state = COMPOSING_CHAR;
2520           else if (cmp_status->ncomps > 0)
2521             {
2522               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2523                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2524             }
2525           else
2526             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2527         }
2528       else                      /* COMPOSING_COMPONENT_RULE */
2529         {
2530           int rule;
2531
2532           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2533           if (rule < 0)
2534             goto invalid_code;
2535           *charbuf++ = -2;
2536           *charbuf++ = rule;
2537           cmp_status->length += 2;
2538           cmp_status->ncomps--;
2539           if (cmp_status->ncomps > 0)
2540             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2541           else
2542             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543         }
2544       continue;
2545
2546     invalid_code:
2547       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2548       src = src_base;
2549       consumed_chars = consumed_chars_base;
2550       ONE_MORE_BYTE (c);
2551       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2552       char_offset++;
2553     }
2554
2555  no_more_source:
2556   if (cmp_status->state != COMPOSING_NO)
2557     {
2558       if (coding->mode & CODING_MODE_LAST_BLOCK)
2559         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560       else
2561         {
2562           int i;
2563
2564           charbuf -= cmp_status->length;
2565           for (i = 0; i < cmp_status->length; i++)
2566             cmp_status->carryover[i] = charbuf[i];
2567         }
2568     }
2569   if (last_id != charset_ascii)
2570     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2571   coding->consumed_char += consumed_chars_base;
2572   coding->consumed = src_base - coding->source;
2573   coding->charbuf_used = charbuf - coding->charbuf;
2574 }
2575
2576
/* Store in CODES (an array of at least two unsigned chars) the
   emacs-mule leading code(s) for the charset whose emacs-mule id is ID.

   If ID is below 0xA0, ID itself is the only leading code: CODES[0]
   is set to ID and CODES[1] to 0.  Otherwise CODES[0] is set to a
   prefix byte chosen by the range ID falls in (0x9A for 0xA0..0xDF,
   0x9B for 0xE0..0xEF, 0x9C for 0xF0..0xF4, 0x9D for 0xF5 and above)
   and CODES[1] is set to ID.

   ID and CODES are evaluated more than once, so they must not have
   side effects.  The expansion deliberately has no trailing
   semicolon; the caller supplies it, which keeps the macro usable as
   the body of an `if' that has an `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2590
2591
     /* Encode the characters and annotations held in CODING->charbuf
        (CODING->charbuf_used elements) into emacs-mule byte sequences,
        writing them from CODING->destination + CODING->produced onward.

        An ASCII character is emitted as a single byte and a raw 8-bit
        character as the byte obtained from CHAR_TO_BYTE8.  Any other
        character is emitted as the leading code(s) of its charset (see
        EMACS_MULE_LEADING_CODES) followed by one byte (dimension 1) or
        two bytes (otherwise) of its code point, each with the high bit
        set.

        On completion the conversion result is recorded as
        CODING_RESULT_SUCCESS, CODING->produced_char and CODING->produced
        are updated, and 0 is returned.  */
2592 static bool
2593 encode_coding_emacs_mule (struct coding_system *coding)
2594 {
       /* NOTE(review): MULTIBYTEP and PRODUCED_CHARS are not referenced
          directly below; presumably the EMIT_* macros (defined outside
          this chunk) use them by name -- confirm before renaming.  */
2595   bool multibytep = coding->dst_multibyte;
2596   int *charbuf = coding->charbuf;
2597   int *charbuf_end = charbuf + coding->charbuf_used;
2598   unsigned char *dst = coding->destination + coding->produced;
2599   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* NOTE(review): presumably an upper bound on the bytes emitted by
          one loop iteration, handed to ASSURE_DESTINATION -- confirm.  */
2600   int safe_room = 8;
2601   ptrdiff_t produced_chars = 0;
2602   Lisp_Object attrs, charset_list;
2603   int c;
       /* Charset id requested by the most recent charset annotation, or
          -1 when there is none (or it is not in CHARSET_LIST).  */
2604   int preferred_charset_id = -1;
2605
2606   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list; store
          it into ATTRS if the coding system carries a different list.  */
2607   if (! EQ (charset_list, Vemacs_mule_charset_list))
2608     {
2609       charset_list = Vemacs_mule_charset_list;
2610       ASET (attrs, coding_attr_charset_list, charset_list);
2611     }
2612
2613   while (charbuf < charbuf_end)
2614     {
2615       ASSURE_DESTINATION (safe_room);
2616       c = *charbuf++;
2617
2618       if (c < 0)
2619         {
2620           /* Handle an annotation.  A negative element introduces it,
                  and CHARBUF now points at the element that selects the
                  annotation kind.  */
2621           switch (*charbuf)
2622             {
2623             case CODING_ANNOTATE_COMPOSITION_MASK:
2624               /* Not yet implemented.  */
2625               break;
2626             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF was already advanced past the
                      introducing element, so index 3 is the fourth
                      element after it, while only -C - 1 elements are
                      skipped below.  If -C is 4 for a charset annotation
                      this reads the element following the annotation
                      rather than the charset id -- verify against
                      ADD_CHARSET_DATA and the other encoders.  */
2627               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not in
                      CHARSET_LIST.  */
2628               if (preferred_charset_id >= 0
2629                   && NILP (Fmemq (make_number (preferred_charset_id),
2630                                   charset_list)))
2631                 preferred_charset_id = -1;
2632               break;
2633             default:
2634               emacs_abort ();
2635             }
               /* Skip the rest of the annotation (-C - 1 more elements).  */
2636           charbuf += -c - 1;
2637           continue;
2638         }
2639
2640       if (ASCII_CHAR_P (c))
2641         EMIT_ONE_ASCII_BYTE (c);
2642       else if (CHAR_BYTE8_P (c))
2643         {
2644           c = CHAR_TO_BYTE8 (c);
2645           EMIT_ONE_BYTE (c);
2646         }
2647       else
2648         {
2649           struct charset *charset;
2650           unsigned code;
2651           int dimension;
2652           int emacs_mule_id;
2653           unsigned char leading_codes[2];
2654
               /* Try the preferred charset first; if C is not in it,
                  search CHARSET_LIST instead.  */
2655           if (preferred_charset_id >= 0)
2656             {
2657               bool result;
2658
2659               charset = CHARSET_FROM_ID (preferred_charset_id);
2660               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2661               if (result)
2662                 code = ENCODE_CHAR (charset, c);
2663               else
2664                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2665                                      &code, charset);
2666             }
2667           else
2668             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669                                  &code, charset);
               /* No charset was found for C: substitute
                  CODING->default_char.  */
2670           if (! charset)
2671             {
2672               c = coding->default_char;
2673               if (ASCII_CHAR_P (c))
2674                 {
2675                   EMIT_ONE_ASCII_BYTE (c);
2676                   continue;
2677                 }
2678               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2679                                    &code, charset);
                   /* NOTE(review): CHARSET is not re-checked after this
                      second lookup; confirm it cannot be null for the
                      default character.  */
2680             }
2681           dimension = CHARSET_DIMENSION (charset);
2682           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2683           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2684           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is only one.  */
2685           if (leading_codes[1])
2686             EMIT_ONE_BYTE (leading_codes[1]);
2687           if (dimension == 1)
2688             EMIT_ONE_BYTE (code | 0x80);
2689           else
2690             {
                   /* Two code bytes, each with its high bit set.  */
2691               code |= 0x8080;
2692               EMIT_ONE_BYTE (code >> 8);
2693               EMIT_ONE_BYTE (code & 0xFF);
2694             }
2695         }
2696     }
2697   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2698   coding->produced_char += produced_chars;
2699   coding->produced = dst - coding->destination;
2700   return 0;
2701 }
2702
2703 \f
2704 /*** 7. ISO2022 handlers ***/
2705
2706 /* The following note describes the coding system ISO2022 briefly.
2707    Since the intention of this note is to help understand the
2708    functions in this file, some parts are NOT ACCURATE or are OVERLY
2709    SIMPLIFIED.  For thorough understanding, please refer to the
2710    original document of ISO2022.  This is equivalent to the standard
2711    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2712
2713    ISO2022 provides many mechanisms to encode several character sets
2714    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2715    is encoded using bytes less than 128.  This may make the encoded
2716    text a little bit longer, but the text passes more easily through
2717    several types of gateway, some of which strip off the MSB (Most
2718    Significant Bit).
2719
2720    There are two kinds of character sets: control character sets and
2721    graphic character sets.  The former contain control characters such
2722    as `newline' and `escape' to provide control functions (control
2723    functions are also provided by escape sequences).  The latter
2724    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2725    two control character sets and many graphic character sets.
2726
2727    Graphic character sets are classified into one of the following
2728    four classes, according to the number of bytes (DIMENSION) and
2729    number of characters in one dimension (CHARS) of the set:
2730    - DIMENSION1_CHARS94
2731    - DIMENSION1_CHARS96
2732    - DIMENSION2_CHARS94
2733    - DIMENSION2_CHARS96
2734
2735    In addition, each character set is assigned an identification tag,
2736    unique for each set, called the "final character" (denoted as <F>
2737    hereafter).  The <F> of each character set is decided by ECMA(*)
2738    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2739    (0x30..0x3F are for private use only).
2740
2741    Note (*): ECMA = European Computer Manufacturers Association
2742
2743    Here are examples of graphic character sets [NAME(<F>)]:
2744         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2745         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2746         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2747         o DIMENSION2_CHARS96 -- none for the moment
2748
2749    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2750         C0 [0x00..0x1F] -- control character plane 0
2751         GL [0x20..0x7F] -- graphic character plane 0
2752         C1 [0x80..0x9F] -- control character plane 1
2753         GR [0xA0..0xFF] -- graphic character plane 1
2754
2755    A control character set is directly designated and invoked to C0 or
2756    C1 by an escape sequence.  The most common case is that:
2757    - ISO646's  control character set is designated/invoked to C0, and
2758    - ISO6429's control character set is designated/invoked to C1,
2759    and usually these designations/invocations are omitted in encoded
2760    text.  In a 7-bit environment, only C0 can be used, and a control
2761    character for C1 is encoded by an appropriate escape sequence to
2762    fit into the environment.  All control characters for C1 are
2763    defined to have corresponding escape sequences.
2764
2765    A graphic character set is at first designated to one of four
2766    graphic registers (G0 through G3), then these graphic registers are
2767    invoked to GL or GR.  These designations and invocations can be
2768    done independently.  The most common case is that G0 is invoked to
2769    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2770    these invocations and designations are omitted in encoded text.
2771    In a 7-bit environment, only GL can be used.
2772
2773    When a graphic character set of CHARS94 is invoked to GL, codes
2774    0x20 and 0x7F of the GL area work as control characters SPACE and
2775    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2776    be used.
2777
2778    There are two ways of invocation: locking-shift and single-shift.
2779    With locking-shift, the invocation lasts until the next different
2780    invocation, whereas with single-shift, the invocation affects the
2781    following character only and doesn't affect the locking-shift
2782    state.  Invocations are done by the following control characters or
2783    escape sequences:
2784
2785    ----------------------------------------------------------------------
2786    abbrev  function                  cntrl escape seq   description
2787    ----------------------------------------------------------------------
2788    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2789    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2790    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2791    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2792    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2793    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2794    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2795    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2796    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2797    ----------------------------------------------------------------------
2798    (*) These are not used by any known coding system.
2799
2800    Control characters for these functions are defined by macros
2801    ISO_CODE_XXX in `coding.h'.
2802
2803    Designations are done by the following escape sequences:
2804    ----------------------------------------------------------------------
2805    escape sequence      description
2806    ----------------------------------------------------------------------
2807    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2808    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2809    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2810    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2811    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2812    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2813    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2814    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2815    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2816    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2817    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2818    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2819    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2820    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2821    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2822    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2823    ----------------------------------------------------------------------
2824
2825    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2826    of dimension 1, chars 94, and final character <F>, etc...
2827
2828    Note (*): Although these designations are not allowed in ISO2022,
2829    Emacs accepts them on decoding, and produces them on encoding
2830    CHARS96 character sets in a coding system which is characterized as
2831    7-bit environment, non-locking-shift, and non-single-shift.
2832
2833    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2834    '(' must be omitted.  We refer to this as "short-form" hereafter.
2835
2836    Now you may notice that there are a lot of ways of encoding the
2837    same multilingual text in ISO2022.  Actually, there exist many
2838    coding systems such as Compound Text (used in X11's inter client
2839    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2840    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2841    localized platforms), and all of these are variants of ISO2022.
2842
2843    In addition to the above, Emacs handles two more kinds of escape
2844    sequences: ISO6429's direction specification and Emacs' private
2845    sequence for specifying character composition.
2846
2847    ISO6429's direction specification takes the following form:
2848         o CSI ']'      -- end of the current direction
2849         o CSI '0' ']'  -- end of the current direction
2850         o CSI '1' ']'  -- start of left-to-right text
2851         o CSI '2' ']'  -- start of right-to-left text
2852    The control character CSI (0x9B: control sequence introducer) is
2853    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2854
2855    Character composition specification takes the following form:
2856         o ESC '0' -- start relative composition
2857         o ESC '1' -- end composition
2858         o ESC '2' -- start rule-base composition (*)
2859         o ESC '3' -- start relative composition with alternate chars  (**)
2860         o ESC '4' -- start rule-base composition with alternate chars  (**)
2861   Since these are not standard escape sequences of any ISO standard,
2862   the use of them with these meanings is restricted to Emacs only.
2863
2864   (*) This form is used only in Emacs 20.7 and older versions,
2865   but newer versions can safely decode it.
2866   (**) This form is used only in Emacs 21.1 and newer versions,
2867   and older versions can't decode it.
2868
2869   Here's a list of example usages of these composition escape
2870   sequences (categorized by `enum composition_method').
2871
2872   COMPOSITION_RELATIVE:
2873         ESC 0 CHAR [ CHAR ] ESC 1
2874   COMPOSITION_WITH_RULE:
2875         ESC 2 CHAR [ RULE CHAR ] ESC 1
2876   COMPOSITION_WITH_ALTCHARS:
2877         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2878   COMPOSITION_WITH_RULE_ALTCHARS:
2879         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2880
     /* A table of 256 `enum iso_code_class_type' entries.
        NOTE(review): presumably indexed by byte value and giving the
        ISO 2022 code class of each byte; the code that fills and reads
        it is outside this chunk -- confirm.  */
2881 static enum iso_code_class_type iso_code_class[256];
2882
/* Return nonzero if the charset whose id is ID is safe for CODING,
   i.e. ID lies within 0..CODING->max_charset_id and its entry in
   CODING->safe_charsets is not the value 255.

   A negative ID is never safe.  Testing for it here keeps a caller
   that obtained ID from a table lookup that can yield a negative
   value from reading CODING->safe_charsets before its first element.

   ID is evaluated more than once, so it must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2886
2887 static void
2888 setup_iso_safe_charsets (Lisp_Object attrs)
2889 {
2890   Lisp_Object charset_list, safe_charsets;
2891   Lisp_Object request;
2892   Lisp_Object reg_usage;
2893   Lisp_Object tail;
2894   EMACS_INT reg94, reg96;
2895   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2896   int max_charset_id;
2897
2898   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2899   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2900       && ! EQ (charset_list, Viso_2022_charset_list))
2901     {
2902       charset_list = Viso_2022_charset_list;
2903       ASET (attrs, coding_attr_charset_list, charset_list);
2904       ASET (attrs, coding_attr_safe_charsets, Qnil);
2905     }
2906
2907   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2908     return;
2909
2910   max_charset_id = 0;
2911   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2912     {
2913       int id = XINT (XCAR (tail));
2914       if (max_charset_id < id)
2915         max_charset_id = id;
2916     }
2917
2918   safe_charsets = make_uninit_string (max_charset_id + 1);
2919   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2920   request = AREF (attrs, coding_attr_iso_request);
2921   reg_usage = AREF (attrs, coding_attr_iso_usage);
2922   reg94 = XINT (XCAR (reg_usage));
2923   reg96 = XINT (XCDR (reg_usage));
2924
2925   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2926     {
2927       Lisp_Object id;
2928       Lisp_Object reg;
2929       struct charset *charset;
2930
2931       id = XCAR (tail);
2932       charset = CHARSET_FROM_ID (XINT (id));
2933       reg = Fcdr (Fassq (id, request));
2934       if (! NILP (reg))
2935         SSET (safe_charsets, XINT (id), XINT (reg));
2936       else if (charset->iso_chars_96)
2937         {
2938           if (reg96 < 4)
2939             SSET (safe_charsets, XINT (id), reg96);
2940         }
2941       else
2942         {
2943           if (reg94 < 4)
2944             SSET (safe_charsets, XINT (id), reg94);
2945         }
2946     }
2947   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2948 }
2949
2950
2951 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2952    Return true if a text is encoded in one of ISO-2022 based coding
2953    systems.

        The source is CODING->source (CODING->src_bytes bytes); the first
        CODING->head_ascii bytes are skipped.  CATEGORY_MASK_ISO is added
        to DETECT_INFO->checked.  While scanning, category masks are
        collected in the locals FOUND and REJECTED.

        If every ISO category becomes rejected during the scan,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise, at the label no_more_source, REJECTED is
        added to DETECT_INFO->rejected, FOUND minus REJECTED is added to
        DETECT_INFO->found, and 1 is returned.

        NOTE(review): no jump to no_more_source is visible in this
        function; presumably ONE_MORE_BYTE (defined outside this chunk)
        performs it when the source is exhausted, and also uses the
        locals SRC, SRC_END, MULTIBYTEP and CONSUMED_CHARS by name --
        confirm before renaming any of them.  */
2954
2955 static bool
2956 detect_coding_iso_2022 (struct coding_system *coding,
2957                         struct coding_detection_info *detect_info)
2958 {
2959   const unsigned char *src = coding->source, *src_base = src;
2960   const unsigned char *src_end = coding->source + coding->src_bytes;
2961   bool multibytep = coding->src_multibyte;
       /* Set when a single-shift (SS2/SS3 or ESC 'N'/'O') has been seen;
          cleared again by the other control codes handled below and by
          any byte below 0x80.  */
2962   bool single_shifting = 0;
2963   int id;
2964   int c, c1;
2965   ptrdiff_t consumed_chars = 0;
2966   int i;
       /* Category masks accumulated during the scan.  */
2967   int rejected = 0;
2968   int found = 0;
       /* -1 while outside a composition sequence; otherwise the number
          of characters counted since the composition was started.  */
2969   int composition_count = -1;
2970
2971   detect_info->checked |= CATEGORY_MASK_ISO;
2972
       /* For every ISO category that has a coding system assigned (id
          not negative), refresh its cached max_charset_id and
          safe_charsets from the coding system's attributes.  */
2973   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2974     {
2975       struct coding_system *this = &(coding_categories[i]);
2976       Lisp_Object attrs, val;
2977
2978       if (this->id < 0)
2979         continue;
2980       attrs = CODING_ID_ATTRS (this->id);
2981       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2982           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2983         setup_iso_safe_charsets (attrs);
2984       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2985       this->max_charset_id = SCHARS (val) - 1;
2986       this->safe_charsets = SDATA (val);
2987     }
2988
2989   /* A coding system of this category is always ASCII compatible.  */
2990   src += coding->head_ascii;
2991
       /* Scan until every ISO category has been rejected (or the scan
          leaves through no_more_source).  */
2992   while (rejected != CATEGORY_MASK_ISO)
2993     {
2994       src_base = src;
2995       ONE_MORE_BYTE (c);
2996       switch (c)
2997         {
2998         case ISO_CODE_ESC:
2999           if (inhibit_iso_escape_detection)
3000             break;
3001           single_shifting = 0;
3002           ONE_MORE_BYTE (c);
3003           if (c == 'N' || c == 'O')
3004             {
3005               /* ESC <Fe> for SS2 or SS3.  */
3006               single_shifting = 1;
3007               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3008             }
3009           else if (c == '1')
3010             {
3011               /* End of composition.  */
3012               if (composition_count < 0
3013                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3014                 /* Invalid */
3015                 break;
3016               composition_count = -1;
3017               found |= CATEGORY_MASK_ISO;
3018             }
3019           else if (c >= '0' && c <= '4')
3020             {
3021               /* ESC <Fp> for start/end composition.  ('1' was handled
                      by the previous branch, so C is '0', '2', '3' or '4'
                      here.)  Start counting composed characters.  */
3022               composition_count = 0;
3023             }
3024           else
3025             {
3026               if (c >= '(' && c <= '/')
3027                 {
3028                   /* Designation sequence for a charset of dimension 1.  */
3029                   ONE_MORE_BYTE (c1);
3030                   if (c1 < ' ' || c1 >= 0x80
3031                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3032                     {
3033                       /* Invalid designation sequence.  Just ignore.  */
3034                       if (c1 >= 0x80)
3035                         rejected |= (CATEGORY_MASK_ISO_7BIT
3036                                      | CATEGORY_MASK_ISO_7_ELSE);
3037                       break;
3038                     }
3039                 }
3040               else if (c == '$')
3041                 {
3042                   /* Designation sequence for a charset of dimension 2.  */
3043                   ONE_MORE_BYTE (c);
3044                   if (c >= '@' && c <= 'B')
3045                     /* Designation for JISX0208.1978, GB2312, or JISX0208.
                            NOTE(review): unlike the other designation
                            branches, ID is not tested for a negative
                            value here before it reaches SAFE_CHARSET_P
                            below -- confirm these table entries are
                            always defined.  */
3046                     id = iso_charset_table[1][0][c];
3047                   else if (c >= '(' && c <= '/')
3048                     {
3049                       ONE_MORE_BYTE (c1);
3050                       if (c1 < ' ' || c1 >= 0x80
3051                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3052                         {
3053                           /* Invalid designation sequence.  Just ignore.  */
3054                           if (c1 >= 0x80)
3055                             rejected |= (CATEGORY_MASK_ISO_7BIT
3056                                          | CATEGORY_MASK_ISO_7_ELSE);
3057                           break;
3058                         }
3059                     }
3060                   else
3061                     {
3062                       /* Invalid designation sequence.  Just ignore it.  */
3063                       if (c >= 0x80)
3064                         rejected |= (CATEGORY_MASK_ISO_7BIT
3065                                      | CATEGORY_MASK_ISO_7_ELSE);
3066                       break;
3067                     }
3068                 }
3069               else
3070                 {
3071                   /* Invalid escape sequence.  Just ignore it.  */
3072                   if (c >= 0x80)
3073                     rejected |= (CATEGORY_MASK_ISO_7BIT
3074                                  | CATEGORY_MASK_ISO_7_ELSE);
3075                   break;
3076                 }
3077
3078               /* We found a valid designation sequence for CHARSET.
                      Each of the four categories tested below is marked
                      found if the charset is safe for it and rejected
                      otherwise.  */
3079               rejected |= CATEGORY_MASK_ISO_8BIT;
3080               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3081                                   id))
3082                 found |= CATEGORY_MASK_ISO_7;
3083               else
3084                 rejected |= CATEGORY_MASK_ISO_7;
3085               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3086                                   id))
3087                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3088               else
3089                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3090               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3091                                   id))
3092                 found |= CATEGORY_MASK_ISO_7_ELSE;
3093               else
3094                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3095               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3096                                   id))
3097                 found |= CATEGORY_MASK_ISO_8_ELSE;
3098               else
3099                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3100             }
3101           break;
3102
3103         case ISO_CODE_SO:
3104         case ISO_CODE_SI:
3105           /* Locking shift out/in.  */
3106           if (inhibit_iso_escape_detection)
3107             break;
3108           single_shifting = 0;
3109           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3110           break;
3111
3112         case ISO_CODE_CSI:
3113           /* Control sequence introducer.  */
3114           single_shifting = 0;
3115           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3116           found |= CATEGORY_MASK_ISO_8_ELSE;
3117           goto check_extra_latin;
3118
3119         case ISO_CODE_SS2:
3120         case ISO_CODE_SS3:
3121           /* Single shift.   */
3122           if (inhibit_iso_escape_detection)
3123             break;
3124           single_shifting = 0;
3125           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
               /* The byte counts as a single shift only for those of the
                  ISO_8_1 / ISO_8_2 categories whose coding system has
                  CODING_ISO_FLAG_SINGLE_SHIFT.  */
3126           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3127               & CODING_ISO_FLAG_SINGLE_SHIFT)
3128             {
3129               found |= CATEGORY_MASK_ISO_8_1;
3130               single_shifting = 1;
3131             }
3132           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3133               & CODING_ISO_FLAG_SINGLE_SHIFT)
3134             {
3135               found |= CATEGORY_MASK_ISO_8_2;
3136               single_shifting = 1;
3137             }
3138           if (single_shifting)
3139             break;
               /* Neither category accepts it as a single shift; treat the
                  byte as a possible extra Latin code instead.  */
3140           goto check_extra_latin;
3141
3142         default:
3143           if (c < 0)
3144             continue;
3145           if (c < 0x80)
3146             {
3147               if (composition_count >= 0)
3148                 composition_count++;
3149               single_shifting = 0;
3150               break;
3151             }
3152           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3153           if (c >= 0xA0)
3154             {
3155               found |= CATEGORY_MASK_ISO_8_1;
3156               /* Check the length of succeeding codes of the range
3157                  0xA0..0xFF.  If the byte length is even, we include
3158                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3159                  only when we are not single shifting.  */
3160               if (! single_shifting
3161                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3162                 {
3163                   ptrdiff_t len = 1;
3164                   while (src < src_end)
3165                     {
3166                       src_base = src;
3167                       ONE_MORE_BYTE (c);
3168                       if (c < 0xA0)
3169                         {
                               /* Not part of the run: step back so the
                                  outer loop reads this byte again.  */
3170                           src = src_base;
3171                           break;
3172                         }
3173                       len++;
3174                     }
3175
                       /* An odd-length run rejects ISO_8_2 only when more
                          source follows it; a run that reaches the end of
                          the source is accepted whatever its length.  */
3176                   if (len & 1 && src < src_end)
3177                     {
3178                       rejected |= CATEGORY_MASK_ISO_8_2;
3179                       if (composition_count >= 0)
3180                         composition_count += len;
3181                     }
3182                   else
3183                     {
3184                       found |= CATEGORY_MASK_ISO_8_2;
3185                       if (composition_count >= 0)
3186                         composition_count += len / 2;
3187                     }
3188                 }
3189               break;
3190             }
                 /* Reached by falling through for bytes 0x80..0x9F, and
                    by the gotos in the CSI and SS2/SS3 cases above.
                    NOTE(review): C is used as an index into
                    Vlatin_extra_code_table without a check against the
                    vector's length -- confirm the table always covers
                    these byte values.  */
3191         check_extra_latin:
3192           if (! VECTORP (Vlatin_extra_code_table)
3193               || NILP (AREF (Vlatin_extra_code_table, c)))
3194             {
3195               rejected = CATEGORY_MASK_ISO;
3196               break;
3197             }
3198           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199               & CODING_ISO_FLAG_LATIN_EXTRA)
3200             found |= CATEGORY_MASK_ISO_8_1;
3201           else
3202             rejected |= CATEGORY_MASK_ISO_8_1;
3203           rejected |= CATEGORY_MASK_ISO_8_2;
3204           break;
3205         }
3206     }
       /* Every ISO category was rejected before the scan finished.  */
3207   detect_info->rejected |= CATEGORY_MASK_ISO;
3208   return 0;
3209
3210  no_more_source:
       /* Report what was rejected, and what was found and not rejected.  */
3211   detect_info->rejected |= rejected;
3212   detect_info->found |= (found & ~rejected);
3213   return 1;
3214 }
3215
3216
3217 /* Designate to REG of `coding' the charset that DIM, CHARS_96 and FINAL
3218    select.  Set CHARS_96 to -1 if the escape sequence should be kept.  */
3219 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3220   do {                                                                  \
3221     int id, prev;                                                       \
3222     /* Reject a bad FINAL or a charset that is unknown or not safe.  */ \
3223     if (final < '0' || final >= 128                                     \
3224         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3225         || !SAFE_CHARSET_P (coding, id))                                \
3226       {                                                                 \
3227         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3228         chars_96 = -1;                                                  \
3229         break;                                                          \
3230       }                                                                 \
3231     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3232     if (id == charset_jisx0201_roman)                                   \
3233       {                                                                 \
3234         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3235           id = charset_ascii;                                           \
3236       }                                                                 \
3237     else if (id == charset_jisx0208_1978)                               \
3238       {                                                                 \
3239         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3240           id = charset_jisx0208;                                        \
3241       }                                                                 \
3242     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3243     /* If there was an invalid designation to REG previously, and this  \
3244        designation is ASCII to REG, we should keep this designation     \
3245        sequence.  */                                                    \
3246     if (prev == -2 && id == charset_ascii)                              \
3247       chars_96 = -1;                                                    \
3248   } while (0)
3249
3250
3251 /* Handle these composition sequences (ALT: alternate char):
3252
3253    (1) relative composition: ESC 0 CHAR ... ESC 1
3254    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3257
3258    When the start sequence (ESC 0/2/3/4) is found, this annotation
3259    header is produced.
3260
3261         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3262
3263    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264    produced until the end sequence (ESC 1) is found:
3265
3266    (1) CHAR ... CHAR
3267    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3270
3271    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3272    annotation header are updated as below:
3273
3274    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3276    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3277    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3278
3279    If an error is found while composing, the annotation header is
3280    changed to:
3281
3282         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3283
3284    and the sequence [ -2 DECODED-RULE ] is changed to the original
3285    byte sequence as below:
3286         o the original byte sequence is B: [ B -1 ]
3287         o the original byte sequence is B1 B2: [ B1 B2 ]
3288    and the sequence [ -1 -1 ] is changed to the original byte
3289    sequence:
3290         [ ESC '0' ]
3291 */
3292
3293 /* Decode a composition rule whose first byte is the local C1, reading
3294    one more source byte for the two-byte format, and set RULE to the
3295    encoded composition rule (plus 0x100 for the two-byte format).  If
3296    the rule is invalid, goto invalid_code.  */
3297 #define DECODE_COMPOSITION_RULE(rule)                                   \
3298   do {                                                                  \
3299     rule = c1 - 32;                                                     \
3300     if (rule < 0)                                                       \
3301       goto invalid_code;                                                \
3302     if (rule < 81)              /* old format (before ver.21) */        \
3303       {                                                                 \
3304         int gref = (rule) / 9;                                          \
3305         int nref = (rule) % 9;                                          \
3306         if (gref == 4) gref = 10;                                       \
3307         if (nref == 4) nref = 10;                                       \
3308         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3309       }                                                                 \
3310     else                        /* new format (after ver.21) */         \
3311       {                                                                 \
3312         int b;                                                          \
3313                                                                         \
3314         ONE_MORE_BYTE (b);                                              \
3315         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3316           goto invalid_code;                                            \
3317         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3318         rule += 0x100;   /* Distinguish it from the old format.  */     \
3319       }                                                                 \
3320   } while (0)
3321 /* Put RULE's source byte(s) at charbuf[idx] and count them in new_chars.  */
3322 #define ENCODE_COMPOSITION_RULE(rule)                           \
3323   do {                                                          \
3324     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3325     /* % 0x100 drops the new-format marker.  */                 \
3326     if (rule < 0x100)           /* old format */                \
3327       {                                                         \
3328         if (gref == 10) gref = 4;                               \
3329         if (nref == 10) nref = 4;                               \
3330         charbuf[idx] = 32 + gref * 9 + nref;                    \
3331         charbuf[idx + 1] = -1;                                  \
3332         new_chars++;                                            \
3333       }                                                         \
3334     else                                /* new format */        \
3335       {                                                         \
3336         charbuf[idx] = 32 + 81 + gref;                          \
3337         charbuf[idx + 1] = 32 + nref;                           \
3338         new_chars += 2;                                         \
3339       }                                                         \
3340   } while (0)
3341
3342 /* Finish the current composition as invalid, rewriting its annotation
3343    data that ends at CHARBUF back into the original byte sequence.  */
3344 static int
3345 finish_composition (int *charbuf, struct composition_status *cmp_status)
3346 {
3347   int idx = - cmp_status->length;   /* charbuf[idx]: annotation header.  */
3348   int new_chars;   /* Returned: NCHARS plus bytes restored in the loop.  */
3349
3350   /* Recover the original ESC sequence over the five header slots.  */
3351   charbuf[idx++] = ISO_CODE_ESC;
3352   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3353                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3354                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3355                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3356                     : '4');
3357   charbuf[idx++] = -2;
3358   charbuf[idx++] = 0;
3359   charbuf[idx++] = -1;
3360   new_chars = cmp_status->nchars;
3361   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3362     for (; idx < 0; idx++)
3363       {
3364         int elt = charbuf[idx];
3365         /* -2 precedes a decoded rule; -1 starts a [ -1 -1 ] marker.  */
3366         if (elt == -2)
3367           {
3368             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3369             idx++;
3370           }
3371         else if (elt == -1)
3372           {
3373             charbuf[idx++] = ISO_CODE_ESC;
3374             charbuf[idx] = '0';
3375             new_chars += 2;
3376           }
3377       }
3378   cmp_status->state = COMPOSING_NO;
3379   return new_chars;
3380 }
3381
3382 /* If composing, finish the composition as invalid and update char_offset.  */
3383 #define MAYBE_FINISH_COMPOSITION()                              \
3384   do {                                                          \
3385     if (cmp_status->state != COMPOSING_NO)                      \
3386       char_offset += finish_composition (charbuf, cmp_status);  \
3387   } while (0)
3388
3389 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3390
3391    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3392    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3393    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3394    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3395
3396    An ESC 0 that ends the component list of an ESC 3 or ESC 4
3397    composition just stores [ -1 -1 ].  Otherwise produce this header:
3398
3399    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]  */
3400
3401 #define DECODE_COMPOSITION_START(c1)                                       \
3402   do {                                                                     \
3403     if (c1 == '0'                                                          \
3404         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3405              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3406             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3407                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3408       {                                                                    \
3409         *charbuf++ = -1;                                                   \
3410         *charbuf++= -1;                                                    \
3411         cmp_status->state = COMPOSING_CHAR;                                \
3412         cmp_status->length += 2;                                           \
3413       }                                                                    \
3414     else                                                                   \
3415       {                                                                    \
3416         MAYBE_FINISH_COMPOSITION ();                                       \
3417         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3418                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3419                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3420                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3421         cmp_status->state                                                  \
3422           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3423         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3424         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3425         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3426         coding->annotated = 1;                                             \
3427       }                                                                    \
3428   } while (0)
3429
3430
3431 /* Handle composition end sequence ESC 1: update LENGTH and NCHARS in the
3432    annotation header, or finish as invalid and goto invalid_code.  */
3433 #define DECODE_COMPOSITION_END()                                        \
3434   do {                                                                  \
3435     if (cmp_status->nchars == 0                                         \
3436         || ((cmp_status->state == COMPOSING_CHAR)                       \
3437             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3438       {                                                                 \
3439         MAYBE_FINISH_COMPOSITION ();                                    \
3440         goto invalid_code;                                              \
3441       }                                                                 \
3442     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3443       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3444     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3445       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3446     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3447     char_offset += cmp_status->nchars;                                  \
3448     cmp_status->state = COMPOSING_NO;                                   \
3449   } while (0)
3450
3451 /* Store a composition rule RULE in charbuf as [ -2 RULE ], and decrement
3452    cmp_status->state (presumably back to the matching CHAR state).  */
3453 #define STORE_COMPOSITION_RULE(rule)    \
3454   do {                                  \
3455     *charbuf++ = -2;                    \
3456     *charbuf++ = rule;                  \
3457     cmp_status->length += 2;            \
3458     cmp_status->state--;                \
3459   } while (0)
3460
3461 /* Store a composed char or a component char C in charbuf, count it in
3462    nchars (when the state is COMPOSING_CHAR) or else in ncomps, and
3463    increment the state for the methods in which a rule follows.  */
3464 #define STORE_COMPOSITION_CHAR(c)                                       \
3465   do {                                                                  \
3466     *charbuf++ = (c);                                                   \
3467     cmp_status->length++;                                               \
3468     if (cmp_status->state == COMPOSING_CHAR)                            \
3469       cmp_status->nchars++;                                             \
3470     else                                                                \
3471       cmp_status->ncomps++;                                             \
3472     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3473         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3474             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3475       cmp_status->state++;                                              \
3476   } while (0)
3477
3478
3479 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3480
3481 static void
3482 decode_coding_iso_2022 (struct coding_system *coding)
3483 {
3484   const unsigned char *src = coding->source + coding->consumed;
3485   const unsigned char *src_end = coding->source + coding->src_bytes;
3486   const unsigned char *src_base;
3487   int *charbuf = coding->charbuf + coding->charbuf_used;
3488   /* We may produce two annotations (charset and composition) in one
3489      loop and one more charset annotation at the end.  */
3490   int *charbuf_end
3491     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3492   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3493   bool multibytep = coding->src_multibyte;
3494   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3495   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3496   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3497   int charset_id_2, charset_id_3;
3498   struct charset *charset;
3499   int c;
3500   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3501   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3502   ptrdiff_t char_offset = coding->produced_char;
3503   ptrdiff_t last_offset = char_offset;
3504   int last_id = charset_ascii;
3505   bool eol_dos
3506     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3507   int byte_after_cr = -1;
3508   int i;
3509
3510   setup_iso_safe_charsets (attrs);
3511   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3512
3513   if (cmp_status->state != COMPOSING_NO)
3514     {
3515       if (charbuf_end - charbuf < cmp_status->length)
3516         emacs_abort ();
3517       for (i = 0; i < cmp_status->length; i++)
3518         *charbuf++ = cmp_status->carryover[i];
3519       coding->annotated = 1;
3520     }
3521
3522   while (1)
3523     {
3524       int c1, c2, c3;
3525
3526       src_base = src;
3527       consumed_chars_base = consumed_chars;
3528
3529       if (charbuf >= charbuf_end)
3530         {
3531           if (byte_after_cr >= 0)
3532             src_base--;
3533           break;
3534         }
3535
3536       if (byte_after_cr >= 0)
3537         c1 = byte_after_cr, byte_after_cr = -1;
3538       else
3539         ONE_MORE_BYTE (c1);
3540       if (c1 < 0)
3541         goto invalid_code;
3542
3543       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3544         {
3545           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546           char_offset++;
3547           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3548           continue;
3549         }
3550
3551       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3552         {
3553           if (c1 == ISO_CODE_ESC)
3554             {
3555               if (src + 1 >= src_end)
3556                 goto no_more_source;
3557               *charbuf++ = ISO_CODE_ESC;
3558               char_offset++;
3559               if (src[0] == '%' && src[1] == '@')
3560                 {
3561                   src += 2;
3562                   consumed_chars += 2;
3563                   char_offset += 2;
3564                   /* We are sure charbuf can contain two more chars. */
3565                   *charbuf++ = '%';
3566                   *charbuf++ = '@';
3567                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3568                 }
3569             }
3570           else
3571             {
3572               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3573               char_offset++;
3574             }
3575           continue;
3576         }
3577
3578       if ((cmp_status->state == COMPOSING_RULE
3579            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3580           && c1 != ISO_CODE_ESC)
3581         {
3582           int rule;
3583
3584           DECODE_COMPOSITION_RULE (rule);
3585           STORE_COMPOSITION_RULE (rule);
3586           continue;
3587         }
3588
3589       /* We produce at most one character.  */
3590       switch (iso_code_class [c1])
3591         {
3592         case ISO_0x20_or_0x7F:
3593           if (charset_id_0 < 0
3594               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3595             /* This is SPACE or DEL.  */
3596             charset = CHARSET_FROM_ID (charset_ascii);
3597           else
3598             charset = CHARSET_FROM_ID (charset_id_0);
3599           break;
3600
3601         case ISO_graphic_plane_0:
3602           if (charset_id_0 < 0)
3603             charset = CHARSET_FROM_ID (charset_ascii);
3604           else
3605             charset = CHARSET_FROM_ID (charset_id_0);
3606           break;
3607
3608         case ISO_0xA0_or_0xFF:
3609           if (charset_id_1 < 0
3610               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3611               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3612             goto invalid_code;
3613           /* This is a graphic character, so we fall through ... */
3614
3615         case ISO_graphic_plane_1:
3616           if (charset_id_1 < 0)
3617             goto invalid_code;
3618           charset = CHARSET_FROM_ID (charset_id_1);
3619           break;
3620
3621         case ISO_control_0:
3622           if (eol_dos && c1 == '\r')
3623             ONE_MORE_BYTE (byte_after_cr);
3624           MAYBE_FINISH_COMPOSITION ();
3625           charset = CHARSET_FROM_ID (charset_ascii);
3626           break;
3627
3628         case ISO_control_1:
3629           goto invalid_code;
3630
3631         case ISO_shift_out:
3632           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3633               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3634             goto invalid_code;
3635           CODING_ISO_INVOCATION (coding, 0) = 1;
3636           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3637           continue;
3638
3639         case ISO_shift_in:
3640           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3641             goto invalid_code;
3642           CODING_ISO_INVOCATION (coding, 0) = 0;
3643           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644           continue;
3645
3646         case ISO_single_shift_2_7:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3648             goto invalid_code;
3649         case ISO_single_shift_2:
3650           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3651             goto invalid_code;
3652           /* SS2 is handled as an escape sequence of ESC 'N' */
3653           c1 = 'N';
3654           goto label_escape_sequence;
3655
3656         case ISO_single_shift_3:
3657           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3658             goto invalid_code;
3659           /* SS3 is handled as an escape sequence of ESC 'O' */
3660           c1 = 'O';
3661           goto label_escape_sequence;
3662
3663         case ISO_control_sequence_introducer:
3664           /* CSI is handled as an escape sequence of ESC '[' ...  */
3665           c1 = '[';
3666           goto label_escape_sequence;
3667
3668         case ISO_escape:
3669           ONE_MORE_BYTE (c1);
3670         label_escape_sequence:
3671           /* Escape sequences handled here are invocation,
3672              designation, direction specification, and character
3673              composition specification.  */
3674           switch (c1)
3675             {
3676             case '&':           /* revision of following character set */
3677               ONE_MORE_BYTE (c1);
3678               if (!(c1 >= '@' && c1 <= '~'))
3679                 goto invalid_code;
3680               ONE_MORE_BYTE (c1);
3681               if (c1 != ISO_CODE_ESC)
3682                 goto invalid_code;
3683               ONE_MORE_BYTE (c1);
3684               goto label_escape_sequence;
3685
3686             case '$':           /* designation of 2-byte character set */
3687               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3688                 goto invalid_code;
3689               {
3690                 int reg, chars96;
3691
3692                 ONE_MORE_BYTE (c1);
3693                 if (c1 >= '@' && c1 <= 'B')
3694                   {     /* designation of JISX0208.1978, GB2312.1980,
3695                            or JISX0208.1980 */
3696                     reg = 0, chars96 = 0;
3697                   }
3698                 else if (c1 >= 0x28 && c1 <= 0x2B)
3699                   { /* designation of DIMENSION2_CHARS94 character set */
3700                     reg = c1 - 0x28, chars96 = 0;
3701                     ONE_MORE_BYTE (c1);
3702                   }
3703                 else if (c1 >= 0x2C && c1 <= 0x2F)
3704                   { /* designation of DIMENSION2_CHARS96 character set */
3705                     reg = c1 - 0x2C, chars96 = 1;
3706                     ONE_MORE_BYTE (c1);
3707                   }
3708                 else
3709                   goto invalid_code;
3710                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3711                 /* We must update these variables now.  */
3712                 if (reg == 0)
3713                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3714                 else if (reg == 1)
3715                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3716                 if (chars96 < 0)
3717                   goto invalid_code;
3718               }
3719               continue;
3720
3721             case 'n':           /* invocation of locking-shift-2 */
3722               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3723                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3724                 goto invalid_code;
3725               CODING_ISO_INVOCATION (coding, 0) = 2;
3726               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3727               continue;
3728
3729             case 'o':           /* invocation of locking-shift-3 */
3730               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3731                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3732                 goto invalid_code;
3733               CODING_ISO_INVOCATION (coding, 0) = 3;
3734               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3735               continue;
3736
3737             case 'N':           /* invocation of single-shift-2 */
3738               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3739                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3740                 goto invalid_code;
3741               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3742               if (charset_id_2 < 0)
3743                 charset = CHARSET_FROM_ID (charset_ascii);
3744               else
3745                 charset = CHARSET_FROM_ID (charset_id_2);
3746               ONE_MORE_BYTE (c1);
3747               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3748                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3749                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3750                           ? c1 >= 0x80 : c1 < 0x80)))
3751                 goto invalid_code;
3752               break;
3753
3754             case 'O':           /* invocation of single-shift-3 */
3755               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3756                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3757                 goto invalid_code;
3758               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3759               if (charset_id_3 < 0)
3760                 charset = CHARSET_FROM_ID (charset_ascii);
3761               else
3762                 charset = CHARSET_FROM_ID (charset_id_3);
3763               ONE_MORE_BYTE (c1);
3764               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3765                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3766                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3767                           ? c1 >= 0x80 : c1 < 0x80)))
3768                 goto invalid_code;
3769               break;
3770
3771             case '0': case '2': case '3': case '4': /* start composition */
3772               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3773                 goto invalid_code;
3774               if (last_id != charset_ascii)
3775                 {
3776                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3777                   last_id = charset_ascii;
3778                   last_offset = char_offset;
3779                 }
3780               DECODE_COMPOSITION_START (c1);
3781               continue;
3782
3783             case '1':           /* end composition */
3784               if (cmp_status->state == COMPOSING_NO)
3785                 goto invalid_code;
3786               DECODE_COMPOSITION_END ();
3787               continue;
3788
3789             case '[':           /* specification of direction */
3790               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3791                 goto invalid_code;
3792               /* For the moment, nested direction is not supported.
3793                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3794                  left-to-right, and nonzero means right-to-left.  */
3795               ONE_MORE_BYTE (c1);
3796               switch (c1)
3797                 {
3798                 case ']':       /* end of the current direction */
3799                   coding->mode &= ~CODING_MODE_DIRECTION;
3800
3801                 case '0':       /* end of the current direction */
3802                 case '1':       /* start of left-to-right direction */
3803                   ONE_MORE_BYTE (c1);
3804                   if (c1 == ']')
3805                     coding->mode &= ~CODING_MODE_DIRECTION;
3806                   else
3807                     goto invalid_code;
3808                   break;
3809
3810                 case '2':       /* start of right-to-left direction */
3811                   ONE_MORE_BYTE (c1);
3812                   if (c1 == ']')
3813                     coding->mode |= CODING_MODE_DIRECTION;
3814                   else
3815                     goto invalid_code;
3816                   break;
3817
3818                 default:
3819                   goto invalid_code;
3820                 }
3821               continue;
3822
3823             case '%':
3824               ONE_MORE_BYTE (c1);
3825               if (c1 == '/')
3826                 {
3827                   /* CTEXT extended segment:
3828                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3829                      We keep these bytes as is for the moment.
3830                      They may be decoded by post-read-conversion.  */
3831                   int dim, M, L;
3832                   int size;
3833
3834                   ONE_MORE_BYTE (dim);
3835                   if (dim < '0' || dim > '4')
3836                     goto invalid_code;
3837                   ONE_MORE_BYTE (M);
3838                   if (M < 128)
3839                     goto invalid_code;
3840                   ONE_MORE_BYTE (L);
3841                   if (L < 128)
3842                     goto invalid_code;
3843                   size = ((M - 128) * 128) + (L - 128);
3844                   if (charbuf + 6 > charbuf_end)
3845                     goto break_loop;
3846                   *charbuf++ = ISO_CODE_ESC;
3847                   *charbuf++ = '%';
3848                   *charbuf++ = '/';
3849                   *charbuf++ = dim;
3850                   *charbuf++ = BYTE8_TO_CHAR (M);
3851                   *charbuf++ = BYTE8_TO_CHAR (L);
3852                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3853                 }
3854               else if (c1 == 'G')
3855                 {
3856                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3857                      ESC % G --UTF-8-BYTES-- ESC % @
3858                      We keep these bytes as is for the moment.
3859                      They may be decoded by post-read-conversion.  */
3860                   if (charbuf + 3 > charbuf_end)
3861                     goto break_loop;
3862                   *charbuf++ = ISO_CODE_ESC;
3863                   *charbuf++ = '%';
3864                   *charbuf++ = 'G';
3865                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3866                 }
3867               else
3868                 goto invalid_code;
3869               continue;
3870               break;
3871
3872             default:
3873               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3874                 goto invalid_code;
3875               {
3876                 int reg, chars96;
3877
3878                 if (c1 >= 0x28 && c1 <= 0x2B)
3879                   { /* designation of DIMENSION1_CHARS94 character set */
3880                     reg = c1 - 0x28, chars96 = 0;
3881                     ONE_MORE_BYTE (c1);
3882                   }
3883                 else if (c1 >= 0x2C && c1 <= 0x2F)
3884                   { /* designation of DIMENSION1_CHARS96 character set */
3885                     reg = c1 - 0x2C, chars96 = 1;
3886                     ONE_MORE_BYTE (c1);
3887                   }
3888                 else
3889                   goto invalid_code;
3890                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3891                 /* We must update these variables now.  */
3892                 if (reg == 0)
3893                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3894                 else if (reg == 1)
3895                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3896                 if (chars96 < 0)
3897                   goto invalid_code;
3898               }
3899               continue;
3900             }
3901           break;
3902
3903         default:
3904           emacs_abort ();
3905         }
3906
3907       if (cmp_status->state == COMPOSING_NO
3908           && charset->id != charset_ascii
3909           && last_id != charset->id)
3910         {
3911           if (last_id != charset_ascii)
3912             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3913           last_id = charset->id;
3914           last_offset = char_offset;
3915         }
3916
3917       /* Now we know CHARSET and 1st position code C1 of a character.
3918          Produce a decoded character while getting 2nd and 3rd
3919          position codes C2, C3 if necessary.  */
3920       if (CHARSET_DIMENSION (charset) > 1)
3921         {
3922           ONE_MORE_BYTE (c2);
3923           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3924               || ((c1 & 0x80) != (c2 & 0x80)))
3925             /* C2 is not in a valid range.  */
3926             goto invalid_code;
3927           if (CHARSET_DIMENSION (charset) == 2)
3928             c1 = (c1 << 8) | c2;
3929           else
3930             {
3931               ONE_MORE_BYTE (c3);
3932               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3933                   || ((c1 & 0x80) != (c3 & 0x80)))
3934                 /* C3 is not in a valid range.  */
3935                 goto invalid_code;
3936               c1 = (c1 << 16) | (c2 << 8) | c2;
3937             }
3938         }
3939       c1 &= 0x7F7F7F;
3940       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3941       if (c < 0)
3942         {
3943           MAYBE_FINISH_COMPOSITION ();
3944           for (; src_base < src; src_base++, char_offset++)
3945             {
3946               if (ASCII_CHAR_P (*src_base))
3947                 *charbuf++ = *src_base;
3948               else
3949                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3950             }
3951         }
3952       else if (cmp_status->state == COMPOSING_NO)
3953         {
3954           *charbuf++ = c;
3955           char_offset++;
3956         }
3957       else if ((cmp_status->state == COMPOSING_CHAR
3958                 ? cmp_status->nchars
3959                 : cmp_status->ncomps)
3960                >= MAX_COMPOSITION_COMPONENTS)
3961         {
3962           /* Too long composition.  */
3963           MAYBE_FINISH_COMPOSITION ();
3964           *charbuf++ = c;
3965           char_offset++;
3966         }
3967       else
3968         STORE_COMPOSITION_CHAR (c);
3969       continue;
3970
3971     invalid_code:
3972       MAYBE_FINISH_COMPOSITION ();
3973       src = src_base;
3974       consumed_chars = consumed_chars_base;
3975       ONE_MORE_BYTE (c);
3976       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3977       char_offset++;
3978       /* Reset the invocation and designation status to the safest
3979          one; i.e. designate ASCII to the graphic register 0, and
3980          invoke that register to the graphic plane 0.  This typically
3981          helps the case that a designation sequence for ASCII "ESC (
3982          B" is somehow broken (e.g. broken by a newline).  */
3983       CODING_ISO_INVOCATION (coding, 0) = 0;
3984       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3985       charset_id_0 = charset_ascii;
3986       continue;
3987
3988     break_loop:
3989       break;
3990     }
3991
3992  no_more_source:
3993   if (cmp_status->state != COMPOSING_NO)
3994     {
3995       if (coding->mode & CODING_MODE_LAST_BLOCK)
3996         MAYBE_FINISH_COMPOSITION ();
3997       else
3998         {
3999           charbuf -= cmp_status->length;
4000           for (i = 0; i < cmp_status->length; i++)
4001             cmp_status->carryover[i] = charbuf[i];
4002         }
4003     }
4004   else if (last_id != charset_ascii)
4005     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4006   coding->consumed_char += consumed_chars_base;
4007   coding->consumed = src_base - coding->source;
4008   coding->charbuf_used = charbuf - coding->charbuf;
4009 }
4010
4011
4012 /* ISO2022 encoding stuff.  */
4013
4014 /*
4015    It is not enough to say just "ISO2022" on encoding, we have to
4016    specify more details.  In Emacs, each coding system of ISO2022
4017    variant has the following specifications:
4018         1. Initial designation to G0 thru G3.
4019         2. Allows short-form designation?
4020         3. ASCII should be designated to G0 before control characters?
4021         4. ASCII should be designated to G0 at end of line?
4022         5. 7-bit environment or 8-bit environment?
4023         6. Use locking-shift?
4024         7. Use Single-shift?
4025    And the following two are only for Japanese:
4026         8. Use ASCII in place of JIS0201-1976-Roman?
4027         9. Use JISX0208-1983 in place of JISX0208-1978?
4028    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4029    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4030    details.
4031 */
4032
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG (an index 0..3), advancing DST, and record CHARSET as
   the current designation of REG in CODING.  The sequence produced is

       [ESC & <'@' + revision>] ESC [$] [<intermediate>] <final-char>

   The revision prefix appears only when CODING has
   CODING_ISO_FLAG_REVISION set and CHARSET's revision number is
   non-negative.  `$' appears when the dimension of CHARSET is not 1.
   In that case, for a 94-charset designated to register 0 whose
   <final-char> is '@', 'A', or 'B', the intermediate byte is omitted
   (short form) unless CODING has CODING_ISO_FLAG_LONG_FORM set.

   The expansion goes through the EMIT_* macros, so whatever those
   macros reference must be in scope at the point of use.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate byte naming REG: taken from "()*+" for a            \
       94-charset and from ",-./" for a 96-charset.  */                 \
    int intermediate_char                                               \
      = (CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+")[reg];        \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only a 94-charset going to register 0 with a final char      \
           in '@'..'B' may drop the intermediate byte, and only when    \
           the long form is not requested.  */                          \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || final_char < '@' || final_char > 'B')                    \
          EMIT_ONE_ASCII_BYTE (intermediate_char);                      \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (intermediate_char);                          \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4080
4081
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 produce the code
   (a control character or an escape sequence) for the ISO2022
   single-shift functions single-shift-2 and single-shift-3.

   ENCODE_SINGLE_SHIFT_2 emits the single byte ISO_CODE_SS2, or the
   two-byte sequence "ESC N" when CODING has CODING_ISO_FLAG_SEVEN_BITS
   set, and then marks CODING as being in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4094
4095
/* Produce the code for the ISO2022 single-shift-3 function: the single
   byte ISO_CODE_SS3, or the two-byte sequence "ESC O" when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set.  Afterwards CODING is marked as
   being in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4104
4105
/* ENCODE_SHIFT_IN, ENCODE_SHIFT_OUT, ENCODE_LOCKING_SHIFT_2, and
   ENCODE_LOCKING_SHIFT_3 produce the code (a control character or an
   escape sequence) for the ISO2022 locking-shift functions shift-in,
   shift-out, locking-shift-2, and locking-shift-3.  Each one also
   records in CODING which graphic register is now invoked to graphic
   plane 0.

   ENCODE_SHIFT_IN emits ISO_CODE_SI and records register 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4115
4116
/* Produce the code for the ISO2022 shift-out function: emit
   ISO_CODE_SO and record in CODING that graphic register 1 is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4122
4123
/* Produce the code for the ISO2022 locking-shift-2 function: emit the
   escape sequence "ESC n" and record in CODING that graphic register 2
   is now invoked to graphic plane 0.  */

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4129
4130
/* Produce the code for the ISO2022 locking-shift-3 function: emit the
   escape sequence "ESC o" and record in CODING that graphic register 3
   is now invoked to graphic plane 0.

   The final byte must be 'o'.  "ESC n" is locking-shift-2, so emitting
   it here would make a reader of the output invoke G2 while CODING
   believes G3 is invoked.  */

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4136
4137
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset for charset_jisx0201_roman.  C1 may be any integer
   expression; it is parenthesized in the expansion so that operators
   of low precedence in the argument group as the caller wrote them.

   The expansion refers to the variables `coding', `dst', and
   `produced_chars' of the enclosing function.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code was just produced; this one character    \
           completes it.  */                                            \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: emit with the 8th bit clear.  */ \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: emit with the 8th bit set.  */   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4180
4181
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is the one for
   charset_jisx0208, it is replaced by the charset for
   charset_jisx0208_1978.  The expansion refers to the variables
   `coding', `dst', and `produced_chars' of the enclosing function.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* A single-shift code was just produced; this one character        \
       completes it.  */                                                \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Invoked to graphic plane 0: emit with the 8th bits clear.  */    \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Invoked to graphic plane 1: emit with the 8th bits set.  */      \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke      \
       it (designating it to some graphic register first if needed),    \
       then go around the loop again to produce the character.  */      \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4224
4225
/* Produce codes for character C of character set CHARSET.  C is first
   converted to its code in CHARSET with CODING_ENCODE_CHAR.  If the
   dimension of CHARSET is 1, that code is produced as a single
   position-code; otherwise its bits above the low 8 and its low 8
   bits are produced as two position-codes.

   The expansion refers to the variables `coding', `dst', and
   `dst_end' of the enclosing function.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4236
4237
4238 /* Produce designation and invocation codes at a place pointed by DST
4239    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4240    Return new DST.  */
4241
4242 static unsigned char *
4243 encode_invocation_designation (struct charset *charset,
4244                                struct coding_system *coding,
4245                                unsigned char *dst, ptrdiff_t *p_nchars)
4246 {
4247   bool multibytep = coding->dst_multibyte;
4248   ptrdiff_t produced_chars = *p_nchars;
4249   int reg;                      /* graphic register number */
4250   int id = CHARSET_ID (charset);
4251
4252   /* At first, check designations.  */
4253   for (reg = 0; reg < 4; reg++)
4254     if (id == CODING_ISO_DESIGNATION (coding, reg))
4255       break;
4256
4257   if (reg >= 4)
4258     {
4259       /* CHARSET is not yet designated to any graphic registers.  */
4260       /* At first check the requested designation.  */
4261       reg = CODING_ISO_REQUEST (coding, id);
4262       if (reg < 0)
4263         /* Since CHARSET requests no special designation, designate it
4264            to graphic register 0.  */
4265         reg = 0;
4266
4267       ENCODE_DESIGNATION (charset, reg, coding);
4268     }
4269
4270   if (CODING_ISO_INVOCATION (coding, 0) != reg
4271       && CODING_ISO_INVOCATION (coding, 1) != reg)
4272     {
4273       /* Since the graphic register REG is not invoked to any graphic
4274          planes, invoke it to graphic plane 0.  */
4275       switch (reg)
4276         {
4277         case 0:                 /* graphic register 0 */
4278           ENCODE_SHIFT_IN;
4279           break;
4280
4281         case 1:                 /* graphic register 1 */
4282           ENCODE_SHIFT_OUT;
4283           break;
4284
4285         case 2:                 /* graphic register 2 */
4286           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4287             ENCODE_SINGLE_SHIFT_2;
4288           else
4289             ENCODE_LOCKING_SHIFT_2;
4290           break;
4291
4292         case 3:                 /* graphic register 3 */
4293           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4294             ENCODE_SINGLE_SHIFT_3;
4295           else
4296             ENCODE_LOCKING_SHIFT_3;
4297           break;
4298         }
4299     }
4300
4301   *p_nchars = produced_chars;
4302   return dst;
4303 }
4304
4305
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state.  That is: shift in if
   graphic plane 0 does not have register 0 invoked, then re-designate
   every graphic register that has an initial charset (initial id >= 0)
   different from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
                                                                        \
        /* Nothing to do for a register without an initial charset     \
           or one that already holds it.  */                            \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reg) == initial_id)      \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial_id);                         \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4324
4325
4326 /* Produce designation sequences of charsets in the line started from
4327    CHARBUF to a place pointed by DST, and return the number of
4328    produced bytes.  DST should not directly point a buffer text area
4329    which may be relocated by char_charset call.
4330
4331    If the current block ends before any end-of-line, we may fail to
4332    find all the necessary designations.  */
4333
4334 static ptrdiff_t
4335 encode_designation_at_bol (struct coding_system *coding,
4336                            int *charbuf, int *charbuf_end,
4337                            unsigned char *dst)
4338 {
4339   unsigned char *orig = dst;
4340   struct charset *charset;
4341   /* Table of charsets to be designated to each graphic register.  */
4342   int r[4];
4343   int c, found = 0, reg;
4344   ptrdiff_t produced_chars = 0;
4345   bool multibytep = coding->dst_multibyte;
4346   Lisp_Object attrs;
4347   Lisp_Object charset_list;
4348
4349   attrs = CODING_ID_ATTRS (coding->id);
4350   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4351   if (EQ (charset_list, Qiso_2022))
4352     charset_list = Viso_2022_charset_list;
4353
4354   for (reg = 0; reg < 4; reg++)
4355     r[reg] = -1;
4356
4357   while (charbuf < charbuf_end && found < 4)
4358     {
4359       int id;
4360
4361       c = *charbuf++;
4362       if (c == '\n')
4363         break;
4364       charset = char_charset (c, charset_list, NULL);
4365       id = CHARSET_ID (charset);
4366       reg = CODING_ISO_REQUEST (coding, id);
4367       if (reg >= 0 && r[reg] < 0)
4368         {
4369           found++;
4370           r[reg] = id;
4371         }
4372     }
4373
4374   if (found)
4375     {
4376       for (reg = 0; reg < 4; reg++)
4377         if (r[reg] >= 0
4378             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4379           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4380     }
4381
4382   return dst - orig;
4383 }
4384
4385 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4386
4387 static bool
4388 encode_coding_iso_2022 (struct coding_system *coding)
4389 {
4390   bool multibytep = coding->dst_multibyte;
4391   int *charbuf = coding->charbuf;
4392   int *charbuf_end = charbuf + coding->charbuf_used;
4393   unsigned char *dst = coding->destination + coding->produced;
4394   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4395   int safe_room = 16;
4396   bool bol_designation
4397     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4398        && CODING_ISO_BOL (coding));
4399   ptrdiff_t produced_chars = 0;
4400   Lisp_Object attrs, eol_type, charset_list;
4401   bool ascii_compatible;
4402   int c;
4403   int preferred_charset_id = -1;
4404
4405   CODING_GET_INFO (coding, attrs, charset_list);
4406   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4407   if (VECTORP (eol_type))
4408     eol_type = Qunix;
4409
4410   setup_iso_safe_charsets (attrs);
4411   /* Charset list may have been changed.  */
4412   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4413   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4414
4415   ascii_compatible
4416     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4417        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4418                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4419
4420   while (charbuf < charbuf_end)
4421     {
4422       ASSURE_DESTINATION (safe_room);
4423
4424       if (bol_designation)
4425         {
4426           /* We have to produce designation sequences if any now.  */
4427           unsigned char desig_buf[16];
4428           ptrdiff_t nbytes;
4429           ptrdiff_t offset;
4430
4431           charset_map_loaded = 0;
4432           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4433                                               desig_buf);
4434           if (charset_map_loaded
4435               && (offset = coding_change_destination (coding)))
4436             {
4437               dst += offset;
4438               dst_end += offset;
4439             }
4440           memcpy (dst, desig_buf, nbytes);
4441           dst += nbytes;
4442           /* We are sure that designation sequences are all ASCII bytes.  */
4443           produced_chars += nbytes;
4444           bol_designation = 0;
4445           ASSURE_DESTINATION (safe_room);
4446         }
4447
4448       c = *charbuf++;
4449
4450       if (c < 0)
4451         {
4452           /* Handle an annotation.  */
4453           switch (*charbuf)
4454             {
4455             case CODING_ANNOTATE_COMPOSITION_MASK:
4456               /* Not yet implemented.  */
4457               break;
4458             case CODING_ANNOTATE_CHARSET_MASK:
4459               preferred_charset_id = charbuf[2];
4460               if (preferred_charset_id >= 0
4461                   && NILP (Fmemq (make_number (preferred_charset_id),
4462                                   charset_list)))
4463                 preferred_charset_id = -1;
4464               break;
4465             default:
4466               emacs_abort ();
4467             }
4468           charbuf += -c - 1;
4469           continue;
4470         }
4471
4472       /* Now encode the character C.  */
4473       if (c < 0x20 || c == 0x7F)
4474         {
4475           if (c == '\n'
4476               || (c == '\r' && EQ (eol_type, Qmac)))
4477             {
4478               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4479                 ENCODE_RESET_PLANE_AND_REGISTER ();
4480               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4481                 {
4482                   int i;
4483
4484                   for (i = 0; i < 4; i++)
4485                     CODING_ISO_DESIGNATION (coding, i)
4486                       = CODING_ISO_INITIAL (coding, i);
4487                 }
4488               bol_designation = ((CODING_ISO_FLAGS (coding)
4489                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4490                                  != 0);
4491             }
4492           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4493             ENCODE_RESET_PLANE_AND_REGISTER ();
4494           EMIT_ONE_ASCII_BYTE (c);
4495         }
4496       else if (ASCII_CHAR_P (c))
4497         {
4498           if (ascii_compatible)
4499             EMIT_ONE_ASCII_BYTE (c);
4500           else
4501             {
4502               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4503               ENCODE_ISO_CHARACTER (charset, c);
4504             }
4505         }
4506       else if (CHAR_BYTE8_P (c))
4507         {
4508           c = CHAR_TO_BYTE8 (c);
4509           EMIT_ONE_BYTE (c);
4510         }
4511       else
4512         {
4513           struct charset *charset;
4514
4515           if (preferred_charset_id >= 0)
4516             {
4517               bool result;
4518
4519               charset = CHARSET_FROM_ID (preferred_charset_id);
4520               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4521               if (! result)
4522                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4523                                      NULL, charset);
4524             }
4525           else
4526             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4527                                  NULL, charset);
4528           if (!charset)
4529             {
4530               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4531                 {
4532                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4533                   charset = CHARSET_FROM_ID (charset_ascii);
4534                 }
4535               else
4536                 {
4537                   c = coding->default_char;
4538                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4539                                        charset_list, NULL, charset);
4540                 }
4541             }
4542           ENCODE_ISO_CHARACTER (charset, c);
4543         }
4544     }
4545
4546   if (coding->mode & CODING_MODE_LAST_BLOCK
4547       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4548     {
4549       ASSURE_DESTINATION (safe_room);
4550       ENCODE_RESET_PLANE_AND_REGISTER ();
4551     }
4552   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4553   CODING_ISO_BOL (coding) = bol_designation;
4554   coding->produced_char += produced_chars;
4555   coding->produced = dst - coding->destination;
4556   return 0;
4557 }
4558
4559 \f
/*** 8. Shift-JIS and BIG5 handlers ***/
4561
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
4565
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
4572
4573    --- CODE RANGE of SJIS ---
4574    (character set)      (range)
4575    ASCII                0x00 .. 0x7F
4576    KATAKANA-JISX0201    0xA0 .. 0xDF
4577    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4578             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4579    -------------------------------
4580
4581 */
4582
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4586
4587    --- CODE RANGE of BIG5 ---
4588    (character set)      (range)
4589    ASCII                0x00 .. 0x7F
4590    Big5 (1st byte)      0xA1 .. 0xFE
4591         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4592    --------------------------
4593
4594   */
4595
4596 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4597    Return true if a text is encoded in SJIS.  */
4598
4599 static bool
4600 detect_coding_sjis (struct coding_system *coding,
4601                     struct coding_detection_info *detect_info)
4602 {
4603   const unsigned char *src = coding->source, *src_base;
4604   const unsigned char *src_end = coding->source + coding->src_bytes;
4605   bool multibytep = coding->src_multibyte;
4606   ptrdiff_t consumed_chars = 0;
4607   int found = 0;
4608   int c;
4609   Lisp_Object attrs, charset_list;
4610   int max_first_byte_of_2_byte_code;
4611
4612   CODING_GET_INFO (coding, attrs, charset_list);
4613   max_first_byte_of_2_byte_code
4614     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4615
4616   detect_info->checked |= CATEGORY_MASK_SJIS;
4617   /* A coding system of this category is always ASCII compatible.  */
4618   src += coding->head_ascii;
4619
4620   while (1)
4621     {
4622       src_base = src;
4623       ONE_MORE_BYTE (c);
4624       if (c < 0x80)
4625         continue;
4626       if ((c >= 0x81 && c <= 0x9F)
4627           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4628         {
4629           ONE_MORE_BYTE (c);
4630           if (c < 0x40 || c == 0x7F || c > 0xFC)
4631             break;
4632           found = CATEGORY_MASK_SJIS;
4633         }
4634       else if (c >= 0xA0 && c < 0xE0)
4635         found = CATEGORY_MASK_SJIS;
4636       else
4637         break;
4638     }
4639   detect_info->rejected |= CATEGORY_MASK_SJIS;
4640   return 0;
4641
4642  no_more_source:
4643   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4644     {
4645       detect_info->rejected |= CATEGORY_MASK_SJIS;
4646       return 0;
4647     }
4648   detect_info->found |= found;
4649   return 1;
4650 }
4651
4652 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4653    Return true if a text is encoded in BIG5.  */
4654
4655 static bool
4656 detect_coding_big5 (struct coding_system *coding,
4657                     struct coding_detection_info *detect_info)
4658 {
4659   const unsigned char *src = coding->source, *src_base;
4660   const unsigned char *src_end = coding->source + coding->src_bytes;
4661   bool multibytep = coding->src_multibyte;
4662   ptrdiff_t consumed_chars = 0;
4663   int found = 0;
4664   int c;
4665
4666   detect_info->checked |= CATEGORY_MASK_BIG5;
4667   /* A coding system of this category is always ASCII compatible.  */
4668   src += coding->head_ascii;
4669
4670   while (1)
4671     {
4672       src_base = src;
4673       ONE_MORE_BYTE (c);
4674       if (c < 0x80)
4675         continue;
4676       if (c >= 0xA1)
4677         {
4678           ONE_MORE_BYTE (c);
4679           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4680             return 0;
4681           found = CATEGORY_MASK_BIG5;
4682         }
4683       else
4684         break;
4685     }
4686   detect_info->rejected |= CATEGORY_MASK_BIG5;
4687   return 0;
4688
4689  no_more_source:
4690   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4691     {
4692       detect_info->rejected |= CATEGORY_MASK_BIG5;
4693       return 0;
4694     }
4695   detect_info->found |= found;
4696   return 1;
4697 }
4698
4699 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4700
4701 static void
4702 decode_coding_sjis (struct coding_system *coding)
4703 {
4704   const unsigned char *src = coding->source + coding->consumed;
4705   const unsigned char *src_end = coding->source + coding->src_bytes;
4706   const unsigned char *src_base;
4707   int *charbuf = coding->charbuf + coding->charbuf_used;
4708   /* We may produce one charset annotation in one loop and one more at
4709      the end.  */
4710   int *charbuf_end
4711     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4712   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4713   bool multibytep = coding->src_multibyte;
4714   struct charset *charset_roman, *charset_kanji, *charset_kana;
4715   struct charset *charset_kanji2;
4716   Lisp_Object attrs, charset_list, val;
4717   ptrdiff_t char_offset = coding->produced_char;
4718   ptrdiff_t last_offset = char_offset;
4719   int last_id = charset_ascii;
4720   bool eol_dos
4721     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4722   int byte_after_cr = -1;
4723
4724   CODING_GET_INFO (coding, attrs, charset_list);
4725
4726   val = charset_list;
4727   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4728   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4729   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4730   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4731
4732   while (1)
4733     {
4734       int c, c1;
4735       struct charset *charset;
4736
4737       src_base = src;
4738       consumed_chars_base = consumed_chars;
4739
4740       if (charbuf >= charbuf_end)
4741         {
4742           if (byte_after_cr >= 0)
4743             src_base--;
4744           break;
4745         }
4746
4747       if (byte_after_cr >= 0)
4748         c = byte_after_cr, byte_after_cr = -1;
4749       else
4750         ONE_MORE_BYTE (c);
4751       if (c < 0)
4752         goto invalid_code;
4753       if (c < 0x80)
4754         {
4755           if (eol_dos && c == '\r')
4756             ONE_MORE_BYTE (byte_after_cr);
4757           charset = charset_roman;
4758         }
4759       else if (c == 0x80 || c == 0xA0)
4760         goto invalid_code;
4761       else if (c >= 0xA1 && c <= 0xDF)
4762         {
4763           /* SJIS -> JISX0201-Kana */
4764           c &= 0x7F;
4765           charset = charset_kana;
4766         }
4767       else if (c <= 0xEF)
4768         {
4769           /* SJIS -> JISX0208 */
4770           ONE_MORE_BYTE (c1);
4771           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4772             goto invalid_code;
4773           c = (c << 8) | c1;
4774           SJIS_TO_JIS (c);
4775           charset = charset_kanji;
4776         }
4777       else if (c <= 0xFC && charset_kanji2)
4778         {
4779           /* SJIS -> JISX0213-2 */
4780           ONE_MORE_BYTE (c1);
4781           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4782             goto invalid_code;
4783           c = (c << 8) | c1;
4784           SJIS_TO_JIS2 (c);
4785           charset = charset_kanji2;
4786         }
4787       else
4788         goto invalid_code;
4789       if (charset->id != charset_ascii
4790           && last_id != charset->id)
4791         {
4792           if (last_id != charset_ascii)
4793             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4794           last_id = charset->id;
4795           last_offset = char_offset;
4796         }
4797       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4798       *charbuf++ = c;
4799       char_offset++;
4800       continue;
4801
4802     invalid_code:
4803       src = src_base;
4804       consumed_chars = consumed_chars_base;
4805       ONE_MORE_BYTE (c);
4806       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4807       char_offset++;
4808     }
4809
4810  no_more_source:
4811   if (last_id != charset_ascii)
4812     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4813   coding->consumed_char += consumed_chars_base;
4814   coding->consumed = src_base - coding->source;
4815   coding->charbuf_used = charbuf - coding->charbuf;
4816 }
4817
4818 static void
4819 decode_coding_big5 (struct coding_system *coding)
4820 {
4821   const unsigned char *src = coding->source + coding->consumed;
4822   const unsigned char *src_end = coding->source + coding->src_bytes;
4823   const unsigned char *src_base;
4824   int *charbuf = coding->charbuf + coding->charbuf_used;
4825   /* We may produce one charset annotation in one loop and one more at
4826      the end.  */
4827   int *charbuf_end
4828     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4829   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4830   bool multibytep = coding->src_multibyte;
4831   struct charset *charset_roman, *charset_big5;
4832   Lisp_Object attrs, charset_list, val;
4833   ptrdiff_t char_offset = coding->produced_char;
4834   ptrdiff_t last_offset = char_offset;
4835   int last_id = charset_ascii;
4836   bool eol_dos
4837     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4838   int byte_after_cr = -1;
4839
4840   CODING_GET_INFO (coding, attrs, charset_list);
4841   val = charset_list;
4842   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4843   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4844
4845   while (1)
4846     {
4847       int c, c1;
4848       struct charset *charset;
4849
4850       src_base = src;
4851       consumed_chars_base = consumed_chars;
4852
4853       if (charbuf >= charbuf_end)
4854         {
4855           if (byte_after_cr >= 0)
4856             src_base--;
4857           break;
4858         }
4859
4860       if (byte_after_cr >= 0)
4861         c = byte_after_cr, byte_after_cr = -1;
4862       else
4863         ONE_MORE_BYTE (c);
4864
4865       if (c < 0)
4866         goto invalid_code;
4867       if (c < 0x80)
4868         {
4869           if (eol_dos && c == '\r')
4870             ONE_MORE_BYTE (byte_after_cr);
4871           charset = charset_roman;
4872         }
4873       else
4874         {
4875           /* BIG5 -> Big5 */
4876           if (c < 0xA1 || c > 0xFE)
4877             goto invalid_code;
4878           ONE_MORE_BYTE (c1);
4879           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4880             goto invalid_code;
4881           c = c << 8 | c1;
4882           charset = charset_big5;
4883         }
4884       if (charset->id != charset_ascii
4885           && last_id != charset->id)
4886         {
4887           if (last_id != charset_ascii)
4888             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4889           last_id = charset->id;
4890           last_offset = char_offset;
4891         }
4892       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4893       *charbuf++ = c;
4894       char_offset++;
4895       continue;
4896
4897     invalid_code:
4898       src = src_base;
4899       consumed_chars = consumed_chars_base;
4900       ONE_MORE_BYTE (c);
4901       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4902       char_offset++;
4903     }
4904
4905  no_more_source:
4906   if (last_id != charset_ascii)
4907     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4908   coding->consumed_char += consumed_chars_base;
4909   coding->consumed = src_base - coding->source;
4910   coding->charbuf_used = charbuf - coding->charbuf;
4911 }
4912
4913 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4914    This function can encode charsets `ascii', `katakana-jisx0201',
4915    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4916    are sure that all these charsets are registered as official charset
4917    (i.e. do not have extended leading-codes).  Characters of other
4918    charsets are produced without any encoding.  */
4919
4920 static bool
4921 encode_coding_sjis (struct coding_system *coding)
4922 {
4923   bool multibytep = coding->dst_multibyte;
4924   int *charbuf = coding->charbuf;
4925   int *charbuf_end = charbuf + coding->charbuf_used;
4926   unsigned char *dst = coding->destination + coding->produced;
4927   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4928   int safe_room = 4;
4929   ptrdiff_t produced_chars = 0;
4930   Lisp_Object attrs, charset_list, val;
4931   bool ascii_compatible;
4932   struct charset *charset_kanji, *charset_kana;
4933   struct charset *charset_kanji2;
4934   int c;
4935
4936   CODING_GET_INFO (coding, attrs, charset_list);
4937   val = XCDR (charset_list);
4938   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4939   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4940   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4941
4942   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4943
4944   while (charbuf < charbuf_end)
4945     {
4946       ASSURE_DESTINATION (safe_room);
4947       c = *charbuf++;
4948       /* Now encode the character C.  */
4949       if (ASCII_CHAR_P (c) && ascii_compatible)
4950         EMIT_ONE_ASCII_BYTE (c);
4951       else if (CHAR_BYTE8_P (c))
4952         {
4953           c = CHAR_TO_BYTE8 (c);
4954           EMIT_ONE_BYTE (c);
4955         }
4956       else
4957         {
4958           unsigned code;
4959           struct charset *charset;
4960           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4961                                &code, charset);
4962
4963           if (!charset)
4964             {
4965               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4966                 {
4967                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4968                   charset = CHARSET_FROM_ID (charset_ascii);
4969                 }
4970               else
4971                 {
4972                   c = coding->default_char;
4973                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4974                                        charset_list, &code, charset);
4975                 }
4976             }
4977           if (code == CHARSET_INVALID_CODE (charset))
4978             emacs_abort ();
4979           if (charset == charset_kanji)
4980             {
4981               int c1, c2;
4982               JIS_TO_SJIS (code);
4983               c1 = code >> 8, c2 = code & 0xFF;
4984               EMIT_TWO_BYTES (c1, c2);
4985             }
4986           else if (charset == charset_kana)
4987             EMIT_ONE_BYTE (code | 0x80);
4988           else if (charset_kanji2 && charset == charset_kanji2)
4989             {
4990               int c1, c2;
4991
4992               c1 = code >> 8;
4993               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4994                   || c1 == 0x28
4995                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4996                 {
4997                   JIS_TO_SJIS2 (code);
4998                   c1 = code >> 8, c2 = code & 0xFF;
4999                   EMIT_TWO_BYTES (c1, c2);
5000                 }
5001               else
5002                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5003             }
5004           else
5005             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5006         }
5007     }
5008   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5009   coding->produced_char += produced_chars;
5010   coding->produced = dst - coding->destination;
5011   return 0;
5012 }
5013
5014 static bool
5015 encode_coding_big5 (struct coding_system *coding)
5016 {
5017   bool multibytep = coding->dst_multibyte;
5018   int *charbuf = coding->charbuf;
5019   int *charbuf_end = charbuf + coding->charbuf_used;
5020   unsigned char *dst = coding->destination + coding->produced;
5021   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5022   int safe_room = 4;
5023   ptrdiff_t produced_chars = 0;
5024   Lisp_Object attrs, charset_list, val;
5025   bool ascii_compatible;
5026   struct charset *charset_big5;
5027   int c;
5028
5029   CODING_GET_INFO (coding, attrs, charset_list);
5030   val = XCDR (charset_list);
5031   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5032   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5033
5034   while (charbuf < charbuf_end)
5035     {
5036       ASSURE_DESTINATION (safe_room);
5037       c = *charbuf++;
5038       /* Now encode the character C.  */
5039       if (ASCII_CHAR_P (c) && ascii_compatible)
5040         EMIT_ONE_ASCII_BYTE (c);
5041       else if (CHAR_BYTE8_P (c))
5042         {
5043           c = CHAR_TO_BYTE8 (c);
5044           EMIT_ONE_BYTE (c);
5045         }
5046       else
5047         {
5048           unsigned code;
5049           struct charset *charset;
5050           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5051                                &code, charset);
5052
5053           if (! charset)
5054             {
5055               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5056                 {
5057                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5058                   charset = CHARSET_FROM_ID (charset_ascii);
5059                 }
5060               else
5061                 {
5062                   c = coding->default_char;
5063                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5064                                        charset_list, &code, charset);
5065                 }
5066             }
5067           if (code == CHARSET_INVALID_CODE (charset))
5068             emacs_abort ();
5069           if (charset == charset_big5)
5070             {
5071               int c1, c2;
5072
5073               c1 = code >> 8, c2 = code & 0xFF;
5074               EMIT_TWO_BYTES (c1, c2);
5075             }
5076           else
5077             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5078         }
5079     }
5080   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5081   coding->produced_char += produced_chars;
5082   coding->produced = dst - coding->destination;
5083   return 0;
5084 }
5085
5086 \f
/*** 9. CCL handlers ***/
5088
5089 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5090    Return true if a text is encoded in a coding system of which
5091    encoder/decoder are written in CCL program.  */
5092
5093 static bool
5094 detect_coding_ccl (struct coding_system *coding,
5095                    struct coding_detection_info *detect_info)
5096 {
5097   const unsigned char *src = coding->source, *src_base;
5098   const unsigned char *src_end = coding->source + coding->src_bytes;
5099   bool multibytep = coding->src_multibyte;
5100   ptrdiff_t consumed_chars = 0;
5101   int found = 0;
5102   unsigned char *valids;
5103   ptrdiff_t head_ascii = coding->head_ascii;
5104   Lisp_Object attrs;
5105
5106   detect_info->checked |= CATEGORY_MASK_CCL;
5107
5108   coding = &coding_categories[coding_category_ccl];
5109   valids = CODING_CCL_VALIDS (coding);
5110   attrs = CODING_ID_ATTRS (coding->id);
5111   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5112     src += head_ascii;
5113
5114   while (1)
5115     {
5116       int c;
5117
5118       src_base = src;
5119       ONE_MORE_BYTE (c);
5120       if (c < 0 || ! valids[c])
5121         break;
5122       if ((valids[c] > 1))
5123         found = CATEGORY_MASK_CCL;
5124     }
5125   detect_info->rejected |= CATEGORY_MASK_CCL;
5126   return 0;
5127
5128  no_more_source:
5129   detect_info->found |= found;
5130   return 1;
5131 }
5132
5133 static void
5134 decode_coding_ccl (struct coding_system *coding)
5135 {
5136   const unsigned char *src = coding->source + coding->consumed;
5137   const unsigned char *src_end = coding->source + coding->src_bytes;
5138   int *charbuf = coding->charbuf + coding->charbuf_used;
5139   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5140   ptrdiff_t consumed_chars = 0;
5141   bool multibytep = coding->src_multibyte;
5142   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5143   int source_charbuf[1024];
5144   int source_byteidx[1025];
5145   Lisp_Object attrs, charset_list;
5146
5147   CODING_GET_INFO (coding, attrs, charset_list);
5148
5149   while (1)
5150     {
5151       const unsigned char *p = src;
5152       ptrdiff_t offset;
5153       int i = 0;
5154
5155       if (multibytep)
5156         {
5157           while (i < 1024 && p < src_end)
5158             {
5159               source_byteidx[i] = p - src;
5160               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5161             }
5162           source_byteidx[i] = p - src;
5163         }
5164       else
5165         while (i < 1024 && p < src_end)
5166           source_charbuf[i++] = *p++;
5167
5168       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5169         ccl->last_block = true;
5170       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5171       charset_map_loaded = 0;
5172       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5173                   charset_list);
5174       if (charset_map_loaded
5175           && (offset = coding_change_source (coding)))
5176         {
5177           p += offset;
5178           src += offset;
5179           src_end += offset;
5180         }
5181       charbuf += ccl->produced;
5182       if (multibytep)
5183         src += source_byteidx[ccl->consumed];
5184       else
5185         src += ccl->consumed;
5186       consumed_chars += ccl->consumed;
5187       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5188         break;
5189     }
5190
5191   switch (ccl->status)
5192     {
5193     case CCL_STAT_SUSPEND_BY_SRC:
5194       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5195       break;
5196     case CCL_STAT_SUSPEND_BY_DST:
5197       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5198       break;
5199     case CCL_STAT_QUIT:
5200     case CCL_STAT_INVALID_CMD:
5201       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5202       break;
5203     default:
5204       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5205       break;
5206     }
5207   coding->consumed_char += consumed_chars;
5208   coding->consumed = src - coding->source;
5209   coding->charbuf_used = charbuf - coding->charbuf;
5210 }
5211
5212 static bool
5213 encode_coding_ccl (struct coding_system *coding)
5214 {
5215   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5216   bool multibytep = coding->dst_multibyte;
5217   int *charbuf = coding->charbuf;
5218   int *charbuf_end = charbuf + coding->charbuf_used;
5219   unsigned char *dst = coding->destination + coding->produced;
5220   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5221   int destination_charbuf[1024];
5222   ptrdiff_t produced_chars = 0;
5223   int i;
5224   Lisp_Object attrs, charset_list;
5225
5226   CODING_GET_INFO (coding, attrs, charset_list);
5227   if (coding->consumed_char == coding->src_chars
5228       && coding->mode & CODING_MODE_LAST_BLOCK)
5229     ccl->last_block = true;
5230
5231   do
5232     {
5233       ptrdiff_t offset;
5234
5235       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5236       charset_map_loaded = 0;
5237       ccl_driver (ccl, charbuf, destination_charbuf,
5238                   charbuf_end - charbuf, 1024, charset_list);
5239       if (charset_map_loaded
5240           && (offset = coding_change_destination (coding)))
5241         dst += offset;
5242       if (multibytep)
5243         {
5244           ASSURE_DESTINATION (ccl->produced * 2);
5245           for (i = 0; i < ccl->produced; i++)
5246             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5247         }
5248       else
5249         {
5250           ASSURE_DESTINATION (ccl->produced);
5251           for (i = 0; i < ccl->produced; i++)
5252             *dst++ = destination_charbuf[i] & 0xFF;
5253           produced_chars += ccl->produced;
5254         }
5255       charbuf += ccl->consumed;
5256       if (ccl->status == CCL_STAT_QUIT
5257           || ccl->status == CCL_STAT_INVALID_CMD)
5258         break;
5259     }
5260   while (charbuf < charbuf_end);
5261
5262   switch (ccl->status)
5263     {
5264     case CCL_STAT_SUSPEND_BY_SRC:
5265       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5266       break;
5267     case CCL_STAT_SUSPEND_BY_DST:
5268       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5269       break;
5270     case CCL_STAT_QUIT:
5271     case CCL_STAT_INVALID_CMD:
5272       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5273       break;
5274     default:
5275       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5276       break;
5277     }
5278
5279   coding->produced_char += produced_chars;
5280   coding->produced = dst - coding->destination;
5281   return 0;
5282 }
5283
5284 \f
5285 /*** 10, 11. no-conversion handlers ***/
5286
5287 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5288
5289 static void
5290 decode_coding_raw_text (struct coding_system *coding)
5291 {
5292   bool eol_dos
5293     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5294
5295   coding->chars_at_source = 1;
5296   coding->consumed_char = coding->src_chars;
5297   coding->consumed = coding->src_bytes;
5298   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5299     {
5300       coding->consumed_char--;
5301       coding->consumed--;
5302       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5303     }
5304   else
5305     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5306 }
5307
5308 static bool
5309 encode_coding_raw_text (struct coding_system *coding)
5310 {
5311   bool multibytep = coding->dst_multibyte;
5312   int *charbuf = coding->charbuf;
5313   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5314   unsigned char *dst = coding->destination + coding->produced;
5315   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5316   ptrdiff_t produced_chars = 0;
5317   int c;
5318
5319   if (multibytep)
5320     {
5321       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5322
5323       if (coding->src_multibyte)
5324         while (charbuf < charbuf_end)
5325           {
5326             ASSURE_DESTINATION (safe_room);
5327             c = *charbuf++;
5328             if (ASCII_CHAR_P (c))
5329               EMIT_ONE_ASCII_BYTE (c);
5330             else if (CHAR_BYTE8_P (c))
5331               {
5332                 c = CHAR_TO_BYTE8 (c);
5333                 EMIT_ONE_BYTE (c);
5334               }
5335             else
5336               {
5337                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5338
5339                 CHAR_STRING_ADVANCE (c, p1);
5340                 do
5341                   {
5342                     EMIT_ONE_BYTE (*p0);
5343                     p0++;
5344                   }
5345                 while (p0 < p1);
5346               }
5347           }
5348       else
5349         while (charbuf < charbuf_end)
5350           {
5351             ASSURE_DESTINATION (safe_room);
5352             c = *charbuf++;
5353             EMIT_ONE_BYTE (c);
5354           }
5355     }
5356   else
5357     {
5358       if (coding->src_multibyte)
5359         {
5360           int safe_room = MAX_MULTIBYTE_LENGTH;
5361
5362           while (charbuf < charbuf_end)
5363             {
5364               ASSURE_DESTINATION (safe_room);
5365               c = *charbuf++;
5366               if (ASCII_CHAR_P (c))
5367                 *dst++ = c;
5368               else if (CHAR_BYTE8_P (c))
5369                 *dst++ = CHAR_TO_BYTE8 (c);
5370               else
5371                 CHAR_STRING_ADVANCE (c, dst);
5372             }
5373         }
5374       else
5375         {
5376           ASSURE_DESTINATION (charbuf_end - charbuf);
5377           while (charbuf < charbuf_end && dst < dst_end)
5378             *dst++ = *charbuf++;
5379         }
5380       produced_chars = dst - (coding->destination + coding->produced);
5381     }
5382   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5383   coding->produced_char += produced_chars;
5384   coding->produced = dst - coding->destination;
5385   return 0;
5386 }
5387
5388 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5389    Return true if a text is encoded in a charset-based coding system.  */
5390
5391 static bool
5392 detect_coding_charset (struct coding_system *coding,
5393                        struct coding_detection_info *detect_info)
5394 {
5395   const unsigned char *src = coding->source, *src_base;
5396   const unsigned char *src_end = coding->source + coding->src_bytes;
5397   bool multibytep = coding->src_multibyte;
5398   ptrdiff_t consumed_chars = 0;
5399   Lisp_Object attrs, valids, name;
5400   int found = 0;
5401   ptrdiff_t head_ascii = coding->head_ascii;
5402   bool check_latin_extra = 0;
5403
5404   detect_info->checked |= CATEGORY_MASK_CHARSET;
5405
5406   coding = &coding_categories[coding_category_charset];
5407   attrs = CODING_ID_ATTRS (coding->id);
5408   valids = AREF (attrs, coding_attr_charset_valids);
5409   name = CODING_ID_NAME (coding->id);
5410   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5411                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5412       || strncmp (SSDATA (SYMBOL_NAME (name)),
5413                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5414     check_latin_extra = 1;
5415
5416   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5417     src += head_ascii;
5418
5419   while (1)
5420     {
5421       int c;
5422       Lisp_Object val;
5423       struct charset *charset;
5424       int dim, idx;
5425
5426       src_base = src;
5427       ONE_MORE_BYTE (c);
5428       if (c < 0)
5429         continue;
5430       val = AREF (valids, c);
5431       if (NILP (val))
5432         break;
5433       if (c >= 0x80)
5434         {
5435           if (c < 0xA0
5436               && check_latin_extra
5437               && (!VECTORP (Vlatin_extra_code_table)
5438                   || NILP (AREF (Vlatin_extra_code_table, c))))
5439             break;
5440           found = CATEGORY_MASK_CHARSET;
5441         }
5442       if (INTEGERP (val))
5443         {
5444           charset = CHARSET_FROM_ID (XFASTINT (val));
5445           dim = CHARSET_DIMENSION (charset);
5446           for (idx = 1; idx < dim; idx++)
5447             {
5448               if (src == src_end)
5449                 goto too_short;
5450               ONE_MORE_BYTE (c);
5451               if (c < charset->code_space[(dim - 1 - idx) * 4]
5452                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5453                 break;
5454             }
5455           if (idx < dim)
5456             break;
5457         }
5458       else
5459         {
5460           idx = 1;
5461           for (; CONSP (val); val = XCDR (val))
5462             {
5463               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5464               dim = CHARSET_DIMENSION (charset);
5465               while (idx < dim)
5466                 {
5467                   if (src == src_end)
5468                     goto too_short;
5469                   ONE_MORE_BYTE (c);
5470                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5471                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5472                     break;
5473                   idx++;
5474                 }
5475               if (idx == dim)
5476                 {
5477                   val = Qnil;
5478                   break;
5479                 }
5480             }
5481           if (CONSP (val))
5482             break;
5483         }
5484     }
5485  too_short:
5486   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5487   return 0;
5488
5489  no_more_source:
5490   detect_info->found |= found;
5491   return 1;
5492 }
5493
/* Decode the source bytes of CODING, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf
   at index CODING->charbuf_used.  The charset (or list of candidate
   charsets) for each leading byte is looked up in the
   coding_attr_charset_valids attribute of the coding system.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated to reflect what was processed.  */
static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the character currently being decoded.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  /* Not referenced directly below; presumably read by the
     ONE_MORE_BYTE macro -- confirm against its definition.  */
  bool multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  ptrdiff_t char_offset = coding->produced_char;
  /* Character offset at which the current run of LAST_ID started.  */
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte already read after a CR (DOS EOL only), or -1 if none.  */
  int byte_after_cr = -1;

  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      /* Remember where this character starts so that the position
         can be restored or reported as the consumed amount.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The output buffer is full.  If a byte following CR was
             already read ahead, step back so that it is not counted
             as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Hand out the byte that was read ahead last time.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          /* With DOS end-of-line, read ahead the byte after CR; it is
             processed on the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      /* A negative C cannot be used as an index into VALIDS.  */
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* VAL is a single charset ID: read the remaining bytes of
             the code point, most significant byte first.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE carry over between candidates, so only
                 the additional bytes a larger charset needs are
                 read.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* A non-negative C means this charset decoded CODE.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      /* When switching to a different non-ASCII charset, emit the
         annotation for the previous non-ASCII run and start a new
         run here.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the character and store just its first
         byte: negated if the re-read value is negative, unchanged if
         ASCII, otherwise converted with BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
    }

  /* No goto in this function targets this label; presumably
     ONE_MORE_BYTE jumps here when the source is exhausted -- confirm
     against the macro.  The loop's `break' also falls through to
     here.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Report consumption only up to the start of the last character
     that was begun.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5620
5621 static bool
5622 encode_coding_charset (struct coding_system *coding)
5623 {
5624   bool multibytep = coding->dst_multibyte;
5625   int *charbuf = coding->charbuf;
5626   int *charbuf_end = charbuf + coding->charbuf_used;
5627   unsigned char *dst = coding->destination + coding->produced;
5628   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5629   int safe_room = MAX_MULTIBYTE_LENGTH;
5630   ptrdiff_t produced_chars = 0;
5631   Lisp_Object attrs, charset_list;
5632   bool ascii_compatible;
5633   int c;
5634
5635   CODING_GET_INFO (coding, attrs, charset_list);
5636   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5637
5638   while (charbuf < charbuf_end)
5639     {
5640       struct charset *charset;
5641       unsigned code;
5642
5643       ASSURE_DESTINATION (safe_room);
5644       c = *charbuf++;
5645       if (ascii_compatible && ASCII_CHAR_P (c))
5646         EMIT_ONE_ASCII_BYTE (c);
5647       else if (CHAR_BYTE8_P (c))
5648         {
5649           c = CHAR_TO_BYTE8 (c);
5650           EMIT_ONE_BYTE (c);
5651         }
5652       else
5653         {
5654           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5655                                &code, charset);
5656
5657           if (charset)
5658             {
5659               if (CHARSET_DIMENSION (charset) == 1)
5660                 EMIT_ONE_BYTE (code);
5661               else if (CHARSET_DIMENSION (charset) == 2)
5662                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5663               else if (CHARSET_DIMENSION (charset) == 3)
5664                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5665               else
5666                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5667                                  (code >> 8) & 0xFF, code & 0xFF);
5668             }
5669           else
5670             {
5671               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5672                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5673               else
5674                 c = coding->default_char;
5675               EMIT_ONE_BYTE (c);
5676             }
5677         }
5678     }
5679
5680   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5681   coding->produced_char += produced_chars;
5682   coding->produced = dst - coding->destination;
5683   return 0;
5684 }
5685
5686 \f
/*** 10. C library functions ***/
5688
/* Set up the coding context CODING from information about
   CODING_SYSTEM: its ID, mode and common flags, safe charsets,
   default character, and the detector/decoder/encoder functions that
   match the coding system's type.

   If CODING_SYSTEM is nil, Qundecided is used in its place.
   CODING_SYSTEM is validated with CHECK_CODING_SYSTEM_GET_ID, which
   is expected to signal an error if it is invalid.  */

void
setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object eol_type;
  Lisp_Object coding_type;
  Lisp_Object val;

  if (NILP (coding_system))
    coding_system = Qundecided;

  CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);

  attrs = CODING_ID_ATTRS (coding->id);
  /* When EOL conversion is inhibited, behave as for Qunix.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);

  coding->mode = 0;
  /* Initial flags depend only on the EOL type: a vector requires
     decoding and detection, any other non-Qunix value requires
     decoding and encoding, and Qunix requires nothing yet.  */
  if (VECTORP (eol_type))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_DETECTION_MASK);
  else if (! EQ (eol_type, Qunix))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_ENCODING_MASK);
  else
    coding->common_flags = 0;
  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
    coding->common_flags |= CODING_FOR_UNIBYTE_MASK;

  val = CODING_ATTR_SAFE_CHARSETS (attrs);
  coding->max_charset_id = SCHARS (val) - 1;
  coding->safe_charsets = SDATA (val);
  coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
  coding->carryover_bytes = 0;
  coding->raw_destination = 0;

  /* Select the handlers and type-specific state.  */
  coding_type = CODING_ATTR_TYPE (attrs);
  if (EQ (coding_type, Qundecided))
    {
      /* No detector function; the raw-text decoder and encoder are
         installed and detection is requested via the flag.  */
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
      coding->spec.undecided.inhibit_nbd
        = (encode_inhibit_flag
           (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
      coding->spec.undecided.inhibit_ied
        = (encode_inhibit_flag
           (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
      coding->spec.undecided.prefer_utf_8
        = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      int i;
      int flags = XINT (AREF (attrs, coding_attr_iso_flags));

      /* Invoke graphic register 0 to plane 0.  */
      CODING_ISO_INVOCATION (coding, 0) = 0;
      /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
      CODING_ISO_INVOCATION (coding, 1)
        = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
      /* Setup the initial status of designation.  */
      for (i = 0; i < 4; i++)
        CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
      /* Not single shifting initially.  */
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;
      /* Beginning of buffer should also be regarded as bol. */
      CODING_ISO_BOL (coding) = 1;
      coding->detector = detect_coding_iso_2022;
      coding->decoder = decode_coding_iso_2022;
      coding->encoder = encode_coding_iso_2022;
      if (flags & CODING_ISO_FLAG_SAFE)
        coding->mode |= CODING_MODE_SAFE_ENCODING;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
      if (flags & CODING_ISO_FLAG_COMPOSITION)
        coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
      if (flags & CODING_ISO_FLAG_DESIGNATION)
        coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
      if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
        {
          /* Re-read the safe charsets after setup_iso_safe_charsets,
             replacing the values stored above.  */
          setup_iso_safe_charsets (attrs);
          val = CODING_ATTR_SAFE_CHARSETS (attrs);
          coding->max_charset_id = SCHARS (val) - 1;
          coding->safe_charsets = SDATA (val);
        }
      CODING_ISO_FLAGS (coding) = flags;
      CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
      CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
      CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
      CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
    }
  else if (EQ (coding_type, Qcharset))
    {
      coding->detector = detect_coding_charset;
      coding->decoder = decode_coding_charset;
      coding->encoder = encode_coding_charset;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_8))
    {
      /* BOM handling: a cons requests detection, t means a BOM is
         used, and anything else means no BOM.  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                   : EQ (val, Qt) ? utf_with_bom
                                   : utf_without_bom);
      coding->detector = detect_coding_utf_8;
      coding->decoder = decode_coding_utf_8;
      coding->encoder = encode_coding_utf_8;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      /* Same BOM mapping as for UTF-8.  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                    : EQ (val, Qt) ? utf_with_bom
                                    : utf_without_bom);
      /* Any endian value other than Qbig selects little endian.  */
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                       : utf_16_little_endian);
      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qccl))
    {
      coding->detector = detect_coding_ccl;
      coding->decoder = decode_coding_ccl;
      coding->encoder = encode_coding_ccl;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      coding->detector = detect_coding_emacs_mule;
      coding->decoder = decode_coding_emacs_mule;
      coding->encoder = encode_coding_emacs_mule;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
        {
          Lisp_Object tail, safe_charsets;
          int max_charset_id = 0;

          /* Build a fresh safe-charsets string covering every ID in
             Vemacs_mule_charset_list: all bytes are 255 except those
             indexed by a listed charset ID, which are 0.  */
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            if (max_charset_id < XFASTINT (XCAR (tail)))
              max_charset_id = XFASTINT (XCAR (tail));
          safe_charsets = make_uninit_string (max_charset_id + 1);
          memset (SDATA (safe_charsets), 255, max_charset_id + 1);
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
          coding->max_charset_id = max_charset_id;
          /* NOTE(review): this stores a pointer into the data of a
             string that is referenced only by the local
             SAFE_CHARSETS -- confirm that its lifetime is long
             enough for all users of CODING.  */
          coding->safe_charsets = SDATA (safe_charsets);
        }
      coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
      coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
    }
  else if (EQ (coding_type, Qshift_jis))
    {
      coding->detector = detect_coding_sjis;
      coding->decoder = decode_coding_sjis;
      coding->encoder = encode_coding_sjis;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qbig5))
    {
      coding->detector = detect_coding_big5;
      coding->decoder = decode_coding_big5;
      coding->encoder = encode_coding_big5;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else                          /* EQ (coding_type, Qraw_text) */
    {
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      /* Raw text needs conversion only for EOL handling: decoding
         for any non-Qunix EOL type, and encoding as well unless the
         EOL type is a vector.  */
      if (! EQ (eol_type, Qunix))
        {
          coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
          if (! VECTORP (eol_type))
            coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
        }

    }

  return;
}
5899
5900 /* Return a list of charsets supported by CODING.  */
5901
5902 Lisp_Object
5903 coding_charset_list (struct coding_system *coding)
5904 {
5905   Lisp_Object attrs, charset_list;
5906
5907   CODING_GET_INFO (coding, attrs, charset_list);
5908   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5909     {
5910       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5911
5912       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5913         charset_list = Viso_2022_charset_list;
5914     }
5915   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5916     {
5917       charset_list = Vemacs_mule_charset_list;
5918     }
5919   return charset_list;
5920 }
5921
5922
5923 /* Return a list of charsets supported by CODING-SYSTEM.  */
5924
5925 Lisp_Object
5926 coding_system_charset_list (Lisp_Object coding_system)
5927 {
5928   ptrdiff_t id;
5929   Lisp_Object attrs, charset_list;
5930
5931   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5932   attrs = CODING_ID_ATTRS (id);
5933
5934   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5935     {
5936       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5937
5938       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5939         charset_list = Viso_2022_charset_list;
5940       else
5941         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5942     }
5943   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5944     {
5945       charset_list = Vemacs_mule_charset_list;
5946     }
5947   else
5948     {
5949       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5950     }
5951   return charset_list;
5952 }
5953
5954
5955 /* Return raw-text or one of its subsidiaries that has the same
5956    eol_type as CODING-SYSTEM.  */
5957
5958 Lisp_Object
5959 raw_text_coding_system (Lisp_Object coding_system)
5960 {
5961   Lisp_Object spec, attrs;
5962   Lisp_Object eol_type, raw_text_eol_type;
5963
5964   if (NILP (coding_system))
5965     return Qraw_text;
5966   spec = CODING_SYSTEM_SPEC (coding_system);
5967   attrs = AREF (spec, 0);
5968
5969   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5970     return coding_system;
5971
5972   eol_type = AREF (spec, 2);
5973   if (VECTORP (eol_type))
5974     return Qraw_text;
5975   spec = CODING_SYSTEM_SPEC (Qraw_text);
5976   raw_text_eol_type = AREF (spec, 2);
5977   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5978           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5979           : AREF (raw_text_eol_type, 2));
5980 }
5981
5982
5983 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5984    the subsidiary that has the same eol-spec as PARENT (if it is not
5985    nil and specifies end-of-line format) or the system's setting
5986    (system_eol_type).  */
5987
5988 Lisp_Object
5989 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5990 {
5991   Lisp_Object spec, eol_type;
5992
5993   if (NILP (coding_system))
5994     coding_system = Qraw_text;
5995   spec = CODING_SYSTEM_SPEC (coding_system);
5996   eol_type = AREF (spec, 2);
5997   if (VECTORP (eol_type))
5998     {
5999       Lisp_Object parent_eol_type;
6000
6001       if (! NILP (parent))
6002         {
6003           Lisp_Object parent_spec;
6004
6005           parent_spec = CODING_SYSTEM_SPEC (parent);
6006           parent_eol_type = AREF (parent_spec, 2);
6007           if (VECTORP (parent_eol_type))
6008             parent_eol_type = system_eol_type;
6009         }
6010       else
6011         parent_eol_type = system_eol_type;
6012       if (EQ (parent_eol_type, Qunix))
6013         coding_system = AREF (eol_type, 0);
6014       else if (EQ (parent_eol_type, Qdos))
6015         coding_system = AREF (eol_type, 1);
6016       else if (EQ (parent_eol_type, Qmac))
6017         coding_system = AREF (eol_type, 2);
6018     }
6019   return coding_system;
6020 }
6021
6022
6023 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6024    decided for writing to a process.  If not, complement them, and
6025    return a new coding system.  */
6026
6027 Lisp_Object
6028 complement_process_encoding_system (Lisp_Object coding_system)
6029 {
6030   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6031   Lisp_Object spec, attrs;
6032   int i;
6033
6034   for (i = 0; i < 3; i++)
6035     {
6036       if (i == 1)
6037         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6038       else if (i == 2)
6039         coding_system = preferred_coding_system ();
6040       spec = CODING_SYSTEM_SPEC (coding_system);
6041       if (NILP (spec))
6042         continue;
6043       attrs = AREF (spec, 0);
6044       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6045         coding_base = CODING_ATTR_BASE_NAME (attrs);
6046       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6047         eol_base = coding_system;
6048       if (! NILP (coding_base) && ! NILP (eol_base))
6049         break;
6050     }
6051
6052   if (i > 0)
6053     /* The original CODING_SYSTEM didn't specify text-conversion or
6054        eol-conversion.  Be sure that we return a fully complemented
6055        coding system.  */
6056     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6057   return coding_system;
6058 }
6059
6060
6061 /* Emacs has a mechanism to automatically detect a coding system if it
6062    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6063    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
6066
6067    o coding-category-emacs-mule
6068
6069         The category for a coding system which has the same code range
6070         as Emacs' internal format.  Assigned the coding-system (Lisp
6071         symbol) `emacs-mule' by default.
6072
6073    o coding-category-sjis
6074
6075         The category for a coding system which has the same code range
6076         as SJIS.  Assigned the coding-system (Lisp
6077         symbol) `japanese-shift-jis' by default.
6078
6079    o coding-category-iso-7
6080
6081         The category for a coding system which has the same code range
6082         as ISO2022 of 7-bit environment.  This doesn't use any locking
6083         shift and single shift functions.  This can encode/decode all
6084         charsets.  Assigned the coding-system (Lisp symbol)
6085         `iso-2022-7bit' by default.
6086
6087    o coding-category-iso-7-tight
6088
6089         Same as coding-category-iso-7 except that this can
6090         encode/decode only the specified charsets.
6091
6092    o coding-category-iso-8-1
6093
6094         The category for a coding system which has the same code range
6095         as ISO2022 of 8-bit environment and graphic plane 1 used only
6096         for DIMENSION1 charset.  This doesn't use any locking shift
6097         and single shift functions.  Assigned the coding-system (Lisp
6098         symbol) `iso-latin-1' by default.
6099
6100    o coding-category-iso-8-2
6101
6102         The category for a coding system which has the same code range
6103         as ISO2022 of 8-bit environment and graphic plane 1 used only
6104         for DIMENSION2 charset.  This doesn't use any locking shift
6105         and single shift functions.  Assigned the coding-system (Lisp
6106         symbol) `japanese-iso-8bit' by default.
6107
6108    o coding-category-iso-7-else
6109
6110         The category for a coding system which has the same code range
6111         as ISO2022 of 7-bit environment but uses locking shift or
6112         single shift functions.  Assigned the coding-system (Lisp
6113         symbol) `iso-2022-7bit-lock' by default.
6114
6115    o coding-category-iso-8-else
6116
6117         The category for a coding system which has the same code range
6118         as ISO2022 of 8-bit environment but uses locking shift or
6119         single shift functions.  Assigned the coding-system (Lisp
6120         symbol) `iso-2022-8bit-ss2' by default.
6121
6122    o coding-category-big5
6123
6124         The category for a coding system which has the same code range
6125         as BIG5.  Assigned the coding-system (Lisp symbol)
6126         `cn-big5' by default.
6127
6128    o coding-category-utf-8
6129
6130         The category for a coding system which has the same code range
6131         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6132         symbol) `utf-8' by default.
6133
6134    o coding-category-utf-16-be
6135
6136         The category for a coding system in which a text has an
6137         Unicode signature (cf. Unicode Standard) in the order of BIG
6138         endian at the head.  Assigned the coding-system (Lisp symbol)
6139         `utf-16-be' by default.
6140
6141    o coding-category-utf-16-le
6142
6143         The category for a coding system in which a text has an
6144         Unicode signature (cf. Unicode Standard) in the order of
6145         LITTLE endian at the head.  Assigned the coding-system (Lisp
6146         symbol) `utf-16-le' by default.
6147
6148    o coding-category-ccl
6149
6150         The category for a coding system of which encoder/decoder is
6151         written in CCL programs.  The default value is nil, i.e., no
6152         coding system is assigned.
6153
6154    o coding-category-binary
6155
6156         The category for a coding system not categorized in any of the
6157         above.  Assigned the coding-system (Lisp symbol)
6158         `no-conversion' by default.
6159
6160    Each of them is a Lisp symbol and the value is an actual
6161    `coding-system's (this is also a Lisp symbol) assigned by a user.
6162    What Emacs does actually is to detect a category of coding system.
6163    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6164    decide only one possible category, it selects a category of the
6165    highest priority.  Priorities of categories are also specified by a
6166    user in a Lisp variable `coding-category-list'.
6167
6168 */
6169
/* Prototype only; as a static function, adjust_coding_eol_type is
   defined elsewhere in this file.  */
static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
                                           int eol_seen);
6172
6173
6174 /* Return the number of ASCII characters at the head of the source.
6175    By side effects, set coding->head_ascii and update
6176    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6177    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6178    reliable only when all the source bytes are ASCII.  */
6179
6180 static ptrdiff_t
6181 check_ascii (struct coding_system *coding)
6182 {
6183   const unsigned char *src, *end;
6184   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6185   int eol_seen = coding->eol_seen;
6186
6187   coding_set_source (coding);
6188   src = coding->source;
6189   end = src + coding->src_bytes;
6190
6191   if (inhibit_eol_conversion
6192       || SYMBOLP (eol_type))
6193     {
6194       /* We don't have to check EOL format.  */
6195       while (src < end && !( *src & 0x80))
6196         {
6197           if (*src++ == '\n')
6198             eol_seen |= EOL_SEEN_LF;
6199         }
6200     }
6201   else
6202     {
6203       end--;                /* We look ahead one byte for "CR LF".  */
6204       while (src < end)
6205         {
6206           int c = *src;
6207
6208           if (c & 0x80)
6209             break;
6210           src++;
6211           if (c == '\r')
6212             {
6213               if (*src == '\n')
6214                 {
6215                   eol_seen |= EOL_SEEN_CRLF;
6216                   src++;
6217                 }
6218               else
6219                 eol_seen |= EOL_SEEN_CR;
6220             }
6221           else if (c == '\n')
6222             eol_seen |= EOL_SEEN_LF;
6223         }
6224       if (src == end)
6225         {
6226           int c = *src;
6227
6228           /* All bytes but the last one C are ASCII.  */
6229           if (! (c & 0x80))
6230             {
6231               if (c == '\r')
6232                 eol_seen |= EOL_SEEN_CR;
6233               else if (c  == '\n')
6234                 eol_seen |= EOL_SEEN_LF;
6235               src++;
6236             }
6237         }
6238     }
6239   coding->head_ascii = src - coding->source;
6240   coding->eol_seen = eol_seen;
6241   return (coding->head_ascii);
6242 }
6243
6244
6245 /* Return the number of characters at the source if all the bytes are
6246    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6247    effects, update coding->eol_seen.  The value of coding->eol_seen is
6248    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6249    the value is reliable only when all the source bytes are valid
6250    UTF-8.  */
6251
6252 static ptrdiff_t
6253 check_utf_8 (struct coding_system *coding)
6254 {
6255   const unsigned char *src, *end;
6256   int eol_seen;
6257   ptrdiff_t nchars = coding->head_ascii;
6258
6259   if (coding->head_ascii < 0)
6260     check_ascii (coding);
6261   else
6262     coding_set_source (coding);
6263   src = coding->source + coding->head_ascii;
6264   /* We look ahead one byte for CR LF.  */
6265   end = coding->source + coding->src_bytes - 1;
6266   eol_seen = coding->eol_seen;
6267   while (src < end)
6268     {
6269       int c = *src;
6270
6271       if (UTF_8_1_OCTET_P (*src))
6272         {
6273           src++;
6274           if (c < 0x20)
6275             {
6276               if (c == '\r')
6277                 {
6278                   if (*src == '\n')
6279                     {
6280                       eol_seen |= EOL_SEEN_CRLF;
6281                       src++;
6282                       nchars++;
6283                     }
6284                   else
6285                     eol_seen |= EOL_SEEN_CR;
6286                 }
6287               else if (c == '\n')
6288                 eol_seen |= EOL_SEEN_LF;
6289             }
6290         }
6291       else if (UTF_8_2_OCTET_LEADING_P (c))
6292         {
6293           if (c < 0xC2          /* overlong sequence */
6294               || src + 1 >= end
6295               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6296             return -1;
6297           src += 2;
6298         }
6299       else if (UTF_8_3_OCTET_LEADING_P (c))
6300         {
6301           if (src + 2 >= end
6302               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6303                     && UTF_8_EXTRA_OCTET_P (src[2])))
6304             return -1;
6305           c = (((c & 0xF) << 12)
6306                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6307           if (c < 0x800                       /* overlong sequence */
6308               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6309             return -1;
6310           src += 3;
6311         }
6312       else if (UTF_8_4_OCTET_LEADING_P (c))
6313         {
6314           if (src + 3 >= end
6315               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6316                     && UTF_8_EXTRA_OCTET_P (src[2])
6317                     && UTF_8_EXTRA_OCTET_P (src[3])))
6318             return -1;
6319           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6320                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6321           if (c < 0x10000       /* overlong sequence */
6322               || c >= 0x110000) /* non-Unicode character  */
6323             return -1;
6324           src += 4;
6325         }
6326       else
6327         return -1;
6328       nchars++;
6329     }
6330
6331   if (src == end)
6332     {
6333       if (! UTF_8_1_OCTET_P (*src))
6334         return -1;
6335       nchars++;
6336       if (*src == '\r')
6337         eol_seen |= EOL_SEEN_CR;
6338       else if (*src  == '\n')
6339         eol_seen |= EOL_SEEN_LF;
6340     }
6341   coding->eol_seen = eol_seen;
6342   return nchars;
6343 }
6344
6345
6346 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6347    SOURCE is encoded.  If CATEGORY is one of
6348    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6349    two-byte, else they are encoded by one-byte.
6350
6351    Return one of EOL_SEEN_XXX.  */
6352
6353 #define MAX_EOL_CHECK_COUNT 3
6354
6355 static int
6356 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6357             enum coding_category category)
6358 {
6359   const unsigned char *src = source, *src_end = src + src_bytes;
6360   unsigned char c;
6361   int total  = 0;
6362   int eol_seen = EOL_SEEN_NONE;
6363
6364   if ((1 << category) & CATEGORY_MASK_UTF_16)
6365     {
6366       bool msb = category == (coding_category_utf_16_le
6367                               | coding_category_utf_16_le_nosig);
6368       bool lsb = !msb;
6369
6370       while (src + 1 < src_end)
6371         {
6372           c = src[lsb];
6373           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6374             {
6375               int this_eol;
6376
6377               if (c == '\n')
6378                 this_eol = EOL_SEEN_LF;
6379               else if (src + 3 >= src_end
6380                        || src[msb + 2] != 0
6381                        || src[lsb + 2] != '\n')
6382                 this_eol = EOL_SEEN_CR;
6383               else
6384                 {
6385                   this_eol = EOL_SEEN_CRLF;
6386                   src += 2;
6387                 }
6388
6389               if (eol_seen == EOL_SEEN_NONE)
6390                 /* This is the first end-of-line.  */
6391                 eol_seen = this_eol;
6392               else if (eol_seen != this_eol)
6393                 {
6394                   /* The found type is different from what found before.
6395                      Allow for stray ^M characters in DOS EOL files.  */
6396                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6397                       || (eol_seen == EOL_SEEN_CRLF
6398                           && this_eol == EOL_SEEN_CR))
6399                     eol_seen = EOL_SEEN_CRLF;
6400                   else
6401                     {
6402                       eol_seen = EOL_SEEN_LF;
6403                       break;
6404                     }
6405                 }
6406               if (++total == MAX_EOL_CHECK_COUNT)
6407                 break;
6408             }
6409           src += 2;
6410         }
6411     }
6412   else
6413     while (src < src_end)
6414       {
6415         c = *src++;
6416         if (c == '\n' || c == '\r')
6417           {
6418             int this_eol;
6419
6420             if (c == '\n')
6421               this_eol = EOL_SEEN_LF;
6422             else if (src >= src_end || *src != '\n')
6423               this_eol = EOL_SEEN_CR;
6424             else
6425               this_eol = EOL_SEEN_CRLF, src++;
6426
6427             if (eol_seen == EOL_SEEN_NONE)
6428               /* This is the first end-of-line.  */
6429               eol_seen = this_eol;
6430             else if (eol_seen != this_eol)
6431               {
6432                 /* The found type is different from what found before.
6433                    Allow for stray ^M characters in DOS EOL files.  */
6434                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6435                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6436                   eol_seen = EOL_SEEN_CRLF;
6437                 else
6438                   {
6439                     eol_seen = EOL_SEEN_LF;
6440                     break;
6441                   }
6442               }
6443             if (++total == MAX_EOL_CHECK_COUNT)
6444               break;
6445           }
6446       }
6447   return eol_seen;
6448 }
6449
6450
6451 static Lisp_Object
6452 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6453 {
6454   Lisp_Object eol_type;
6455
6456   eol_type = CODING_ID_EOL_TYPE (coding->id);
6457   if (! VECTORP (eol_type))
6458     /* Already adjusted.  */
6459     return eol_type;
6460   if (eol_seen & EOL_SEEN_LF)
6461     {
6462       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6463       eol_type = Qunix;
6464     }
6465   else if (eol_seen & EOL_SEEN_CRLF)
6466     {
6467       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6468       eol_type = Qdos;
6469     }
6470   else if (eol_seen & EOL_SEEN_CR)
6471     {
6472       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6473       eol_type = Qmac;
6474     }
6475   return eol_type;
6476 }
6477
6478 /* Detect how a text specified in CODING is encoded.  If a coding
6479    system is detected, update fields of CODING by the detected coding
6480    system.  */
6481
6482 static void
6483 detect_coding (struct coding_system *coding)
6484 {
6485   const unsigned char *src, *src_end;
6486   unsigned int saved_mode = coding->mode;
6487   Lisp_Object found = Qnil;
6488   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6489
6490   coding->consumed = coding->consumed_char = 0;
6491   coding->produced = coding->produced_char = 0;
6492   coding_set_source (coding);
6493
6494   src_end = coding->source + coding->src_bytes;
6495
6496   coding->eol_seen = EOL_SEEN_NONE;
6497   /* If we have not yet decided the text encoding type, detect it
6498      now.  */
6499   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6500     {
6501       int c, i;
6502       struct coding_detection_info detect_info;
6503       bool null_byte_found = 0, eight_bit_found = 0;
6504       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6505                                        inhibit_null_byte_detection);
6506       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6507                                        inhibit_iso_escape_detection);
6508       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6509
6510       coding->head_ascii = 0;
6511       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6512       for (src = coding->source; src < src_end; src++)
6513         {
6514           c = *src;
6515           if (c & 0x80)
6516             {
6517               eight_bit_found = 1;
6518               if (null_byte_found)
6519                 break;
6520             }
6521           else if (c < 0x20)
6522             {
6523               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6524                   && ! inhibit_ied
6525                   && ! detect_info.checked)
6526                 {
6527                   if (detect_coding_iso_2022 (coding, &detect_info))
6528                     {
6529                       /* We have scanned the whole data.  */
6530                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6531                         {
6532                           /* We didn't find an 8-bit code.  We may
6533                              have found a null-byte, but it's very
6534                              rare that a binary file conforms to
6535                              ISO-2022.  */
6536                           src = src_end;
6537                           coding->head_ascii = src - coding->source;
6538                         }
6539                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6540                       break;
6541                     }
6542                 }
6543               else if (! c && !inhibit_nbd)
6544                 {
6545                   null_byte_found = 1;
6546                   if (eight_bit_found)
6547                     break;
6548                 }
6549               else if (! disable_ascii_optimization
6550                        && ! inhibit_eol_conversion)
6551                 {
6552                   if (c == '\r')
6553                     {
6554                       if (src < src_end && src[1] == '\n')
6555                         {
6556                           coding->eol_seen |= EOL_SEEN_CRLF;
6557                           src++;
6558                           if (! eight_bit_found)
6559                             coding->head_ascii++;
6560                         }
6561                       else
6562                         coding->eol_seen |= EOL_SEEN_CR;
6563                     }
6564                   else if (c == '\n')
6565                     {
6566                       coding->eol_seen |= EOL_SEEN_LF;
6567                     }
6568                 }
6569
6570               if (! eight_bit_found)
6571                 coding->head_ascii++;
6572             }
6573           else if (! eight_bit_found)
6574             coding->head_ascii++;
6575         }
6576
6577       if (null_byte_found || eight_bit_found
6578           || coding->head_ascii < coding->src_bytes
6579           || detect_info.found)
6580         {
6581           enum coding_category category;
6582           struct coding_system *this;
6583
6584           if (coding->head_ascii == coding->src_bytes)
6585             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6586             for (i = 0; i < coding_category_raw_text; i++)
6587               {
6588                 category = coding_priorities[i];
6589                 this = coding_categories + category;
6590                 if (detect_info.found & (1 << category))
6591                   break;
6592               }
6593           else
6594             {
6595               if (null_byte_found)
6596                 {
6597                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6598                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6599                 }
6600               else if (prefer_utf_8
6601                        && detect_coding_utf_8 (coding, &detect_info))
6602                 {
6603                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6604                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6605                 }
6606               for (i = 0; i < coding_category_raw_text; i++)
6607                 {
6608                   category = coding_priorities[i];
6609                   this = coding_categories + category;
6610                   /* Some of this->detector (e.g. detect_coding_sjis)
6611                      require this information.  */
6612                   coding->id = this->id;
6613                   if (this->id < 0)
6614                     {
6615                       /* No coding system of this category is defined.  */
6616                       detect_info.rejected |= (1 << category);
6617                     }
6618                   else if (category >= coding_category_raw_text)
6619                     continue;
6620                   else if (detect_info.checked & (1 << category))
6621                     {
6622                       if (detect_info.found & (1 << category))
6623                         break;
6624                     }
6625                   else if ((*(this->detector)) (coding, &detect_info)
6626                            && detect_info.found & (1 << category))
6627                     break;
6628                 }
6629             }
6630
6631           if (i < coding_category_raw_text)
6632             {
6633               if (category == coding_category_utf_8_auto)
6634                 {
6635                   Lisp_Object coding_systems;
6636
6637                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6638                                          coding_attr_utf_bom);
6639                   if (CONSP (coding_systems))
6640                     {
6641                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6642                         found = XCAR (coding_systems);
6643                       else
6644                         found = XCDR (coding_systems);
6645                     }
6646                   else
6647                     found = CODING_ID_NAME (this->id);
6648                 }
6649               else if (category == coding_category_utf_16_auto)
6650                 {
6651                   Lisp_Object coding_systems;
6652
6653                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6654                                          coding_attr_utf_bom);
6655                   if (CONSP (coding_systems))
6656                     {
6657                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6658                         found = XCAR (coding_systems);
6659                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6660                         found = XCDR (coding_systems);
6661                     }
6662                   else
6663                     found = CODING_ID_NAME (this->id);
6664                 }
6665               else
6666                 found = CODING_ID_NAME (this->id);
6667             }
6668           else if (null_byte_found)
6669             found = Qno_conversion;
6670           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6671                    == CATEGORY_MASK_ANY)
6672             found = Qraw_text;
6673           else if (detect_info.rejected)
6674             for (i = 0; i < coding_category_raw_text; i++)
6675               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6676                 {
6677                   this = coding_categories + coding_priorities[i];
6678                   found = CODING_ID_NAME (this->id);
6679                   break;
6680                 }
6681         }
6682     }
6683   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6684            == coding_category_utf_8_auto)
6685     {
6686       Lisp_Object coding_systems;
6687       struct coding_detection_info detect_info;
6688
6689       coding_systems
6690         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6691       detect_info.found = detect_info.rejected = 0;
6692       if (check_ascii (coding) == coding->src_bytes)
6693         {
6694           if (CONSP (coding_systems))
6695             found = XCDR (coding_systems);
6696         }
6697       else
6698         {
6699           if (CONSP (coding_systems)
6700               && detect_coding_utf_8 (coding, &detect_info))
6701             {
6702               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6703                 found = XCAR (coding_systems);
6704               else
6705                 found = XCDR (coding_systems);
6706             }
6707         }
6708     }
6709   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6710            == coding_category_utf_16_auto)
6711     {
6712       Lisp_Object coding_systems;
6713       struct coding_detection_info detect_info;
6714
6715       coding_systems
6716         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6717       detect_info.found = detect_info.rejected = 0;
6718       coding->head_ascii = 0;
6719       if (CONSP (coding_systems)
6720           && detect_coding_utf_16 (coding, &detect_info))
6721         {
6722           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6723             found = XCAR (coding_systems);
6724           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6725             found = XCDR (coding_systems);
6726         }
6727     }
6728
6729   if (! NILP (found))
6730     {
6731       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6732                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6733                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6734                            : EOL_SEEN_LF);
6735
6736       setup_coding_system (found, coding);
6737       if (specified_eol != EOL_SEEN_NONE)
6738         adjust_coding_eol_type (coding, specified_eol);
6739     }
6740
6741   coding->mode = saved_mode;
6742 }
6743
6744
6745 static void
6746 decode_eol (struct coding_system *coding)
6747 {
6748   Lisp_Object eol_type;
6749   unsigned char *p, *pbeg, *pend;
6750
6751   eol_type = CODING_ID_EOL_TYPE (coding->id);
6752   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6753     return;
6754
6755   if (NILP (coding->dst_object))
6756     pbeg = coding->destination;
6757   else
6758     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6759   pend = pbeg + coding->produced;
6760
6761   if (VECTORP (eol_type))
6762     {
6763       int eol_seen = EOL_SEEN_NONE;
6764
6765       for (p = pbeg; p < pend; p++)
6766         {
6767           if (*p == '\n')
6768             eol_seen |= EOL_SEEN_LF;
6769           else if (*p == '\r')
6770             {
6771               if (p + 1 < pend && *(p + 1) == '\n')
6772                 {
6773                   eol_seen |= EOL_SEEN_CRLF;
6774                   p++;
6775                 }
6776               else
6777                 eol_seen |= EOL_SEEN_CR;
6778             }
6779         }
6780       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6781       if ((eol_seen & EOL_SEEN_CRLF) != 0
6782           && (eol_seen & EOL_SEEN_CR) != 0
6783           && (eol_seen & EOL_SEEN_LF) == 0)
6784         eol_seen = EOL_SEEN_CRLF;
6785       else if (eol_seen != EOL_SEEN_NONE
6786           && eol_seen != EOL_SEEN_LF
6787           && eol_seen != EOL_SEEN_CRLF
6788           && eol_seen != EOL_SEEN_CR)
6789         eol_seen = EOL_SEEN_LF;
6790       if (eol_seen != EOL_SEEN_NONE)
6791         eol_type = adjust_coding_eol_type (coding, eol_seen);
6792     }
6793
6794   if (EQ (eol_type, Qmac))
6795     {
6796       for (p = pbeg; p < pend; p++)
6797         if (*p == '\r')
6798           *p = '\n';
6799     }
6800   else if (EQ (eol_type, Qdos))
6801     {
6802       ptrdiff_t n = 0;
6803
6804       if (NILP (coding->dst_object))
6805         {
6806           /* Start deleting '\r' from the tail to minimize the memory
6807              movement.  */
6808           for (p = pend - 2; p >= pbeg; p--)
6809             if (*p == '\r')
6810               {
6811                 memmove (p, p + 1, pend-- - p - 1);
6812                 n++;
6813               }
6814         }
6815       else
6816         {
6817           ptrdiff_t pos_byte = coding->dst_pos_byte;
6818           ptrdiff_t pos = coding->dst_pos;
6819           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6820
6821           while (pos < pos_end)
6822             {
6823               p = BYTE_POS_ADDR (pos_byte);
6824               if (*p == '\r' && p[1] == '\n')
6825                 {
6826                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6827                   n++;
6828                   pos_end--;
6829                 }
6830               pos++;
6831               if (coding->dst_multibyte)
6832                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6833               else
6834                 pos_byte++;
6835             }
6836         }
6837       coding->produced -= n;
6838       coding->produced_char -= n;
6839     }
6840 }
6841
6842
6843 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6844    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6845    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6846 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6847
6848 /* Return a translation table (or list of them) from coding system
6849    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6850    not ENCODEP). */
6851
6852 static Lisp_Object
6853 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6854 {
6855   Lisp_Object standard, translation_table;
6856   Lisp_Object val;
6857
6858   if (NILP (Venable_character_translation))
6859     {
6860       if (max_lookup)
6861         *max_lookup = 0;
6862       return Qnil;
6863     }
6864   if (encodep)
6865     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6866       standard = Vstandard_translation_table_for_encode;
6867   else
6868     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6869       standard = Vstandard_translation_table_for_decode;
6870   if (NILP (translation_table))
6871     translation_table = standard;
6872   else
6873     {
6874       if (SYMBOLP (translation_table))
6875         translation_table = Fget (translation_table, Qtranslation_table);
6876       else if (CONSP (translation_table))
6877         {
6878           translation_table = Fcopy_sequence (translation_table);
6879           for (val = translation_table; CONSP (val); val = XCDR (val))
6880             if (SYMBOLP (XCAR (val)))
6881               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6882         }
6883       if (CHAR_TABLE_P (standard))
6884         {
6885           if (CONSP (translation_table))
6886             translation_table = nconc2 (translation_table, list1 (standard));
6887           else
6888             translation_table = list2 (translation_table, standard);
6889         }
6890     }
6891
6892   if (max_lookup)
6893     {
6894       *max_lookup = 1;
6895       if (CHAR_TABLE_P (translation_table)
6896           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6897         {
6898           val = XCHAR_TABLE (translation_table)->extras[1];
6899           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6900             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6901         }
6902       else if (CONSP (translation_table))
6903         {
6904           Lisp_Object tail;
6905
6906           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6907             if (CHAR_TABLE_P (XCAR (tail))
6908                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6909               {
6910                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6911                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6912                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6913               }
6914         }
6915     }
6916   return translation_table;
6917 }
6918
/* Look up character C in TABLE, which is a char-table or a list
   whose char-table elements are consulted in order.

   Whenever a table maps C to a character, C is replaced by that
   character and the lookup goes on with the next table of the list.
   On exit TRANS is Qnil, unless a table gave a non-nil value that is
   not a character; that value is then left in TRANS and, for a list,
   the remaining tables are not consulted.

   C and TRANS must be lvalues.  TABLE, C and TRANS are evaluated
   more than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (table, c);                            \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        /* Named so as not to capture a `tail' of the caller.  */      \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);              \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), c);        \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6943
6944
6945 /* Return a translation of character(s) at BUF according to TRANS.
6946    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6947    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6948    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6949    translation is found, and Qnil if not found..
6950    If BUF is too short to lookup characters in FROM, return Qt.  */
6951
6952 static Lisp_Object
6953 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6954 {
6955
6956   if (INTEGERP (trans))
6957     return trans;
6958   for (; CONSP (trans); trans = XCDR (trans))
6959     {
6960       Lisp_Object val = XCAR (trans);
6961       Lisp_Object from = XCAR (val);
6962       ptrdiff_t len = ASIZE (from);
6963       ptrdiff_t i;
6964
6965       for (i = 0; i < len; i++)
6966         {
6967           if (buf + i == buf_end)
6968             return Qt;
6969           if (XINT (AREF (from, i)) != buf[i])
6970             break;
6971         }
6972       if (i == len)
6973         return val;
6974     }
6975   return Qnil;
6976 }
6977
6978
6979 static int
6980 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6981                bool last_block)
6982 {
6983   unsigned char *dst = coding->destination + coding->produced;
6984   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6985   ptrdiff_t produced;
6986   ptrdiff_t produced_chars = 0;
6987   int carryover = 0;
6988
6989   if (! coding->chars_at_source)
6990     {
6991       /* Source characters are in coding->charbuf.  */
6992       int *buf = coding->charbuf;
6993       int *buf_end = buf + coding->charbuf_used;
6994
6995       if (EQ (coding->src_object, coding->dst_object)
6996           && ! NILP (coding->dst_object))
6997         {
6998           eassert (growable_destination (coding));
6999           coding_set_source (coding);
7000           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7001         }
7002
7003       while (buf < buf_end)
7004         {
7005           int c = *buf;
7006           ptrdiff_t i;
7007
7008           if (c >= 0)
7009             {
7010               ptrdiff_t from_nchars = 1, to_nchars = 1;
7011               Lisp_Object trans = Qnil;
7012
7013               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7014               if (! NILP (trans))
7015                 {
7016                   trans = get_translation (trans, buf, buf_end);
7017                   if (INTEGERP (trans))
7018                     c = XINT (trans);
7019                   else if (CONSP (trans))
7020                     {
7021                       from_nchars = ASIZE (XCAR (trans));
7022                       trans = XCDR (trans);
7023                       if (INTEGERP (trans))
7024                         c = XINT (trans);
7025                       else
7026                         {
7027                           to_nchars = ASIZE (trans);
7028                           c = XINT (AREF (trans, 0));
7029                         }
7030                     }
7031                   else if (EQ (trans, Qt) && ! last_block)
7032                     break;
7033                 }
7034
7035               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7036                 {
7037                   eassert (growable_destination (coding));
7038                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7039                        / MAX_MULTIBYTE_LENGTH)
7040                       < to_nchars)
7041                     memory_full (SIZE_MAX);
7042                   dst = alloc_destination (coding,
7043                                            buf_end - buf
7044                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7045                                            dst);
7046                   if (EQ (coding->src_object, coding->dst_object))
7047                     {
7048                       coding_set_source (coding);
7049                       dst_end = (((unsigned char *) coding->source)
7050                                  + coding->consumed);
7051                     }
7052                   else
7053                     dst_end = coding->destination + coding->dst_bytes;
7054                 }
7055
7056               for (i = 0; i < to_nchars; i++)
7057                 {
7058                   if (i > 0)
7059                     c = XINT (AREF (trans, i));
7060                   if (coding->dst_multibyte
7061                       || ! CHAR_BYTE8_P (c))
7062                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7063                   else
7064                     *dst++ = CHAR_TO_BYTE8 (c);
7065                 }
7066               produced_chars += to_nchars;
7067               buf += from_nchars;
7068             }
7069           else
7070             /* This is an annotation datum.  (-C) is the length.  */
7071             buf += -c;
7072         }
7073       carryover = buf_end - buf;
7074     }
7075   else
7076     {
7077       /* Source characters are at coding->source.  */
7078       const unsigned char *src = coding->source;
7079       const unsigned char *src_end = src + coding->consumed;
7080
7081       if (EQ (coding->dst_object, coding->src_object))
7082         {
7083           eassert (growable_destination (coding));
7084           dst_end = (unsigned char *) src;
7085         }
7086       if (coding->src_multibyte != coding->dst_multibyte)
7087         {
7088           if (coding->src_multibyte)
7089             {
7090               bool multibytep = 1;
7091               ptrdiff_t consumed_chars = 0;
7092
7093               while (1)
7094                 {
7095                   const unsigned char *src_base = src;
7096                   int c;
7097
7098                   ONE_MORE_BYTE (c);
7099                   if (dst == dst_end)
7100                     {
7101                       eassert (growable_destination (coding));
7102                       if (EQ (coding->src_object, coding->dst_object))
7103                         dst_end = (unsigned char *) src;
7104                       if (dst == dst_end)
7105                         {
7106                           ptrdiff_t offset = src - coding->source;
7107
7108                           dst = alloc_destination (coding, src_end - src + 1,
7109                                                    dst);
7110                           dst_end = coding->destination + coding->dst_bytes;
7111                           coding_set_source (coding);
7112                           src = coding->source + offset;
7113                           src_end = coding->source + coding->consumed;
7114                           if (EQ (coding->src_object, coding->dst_object))
7115                             dst_end = (unsigned char *) src;
7116                         }
7117                     }
7118                   *dst++ = c;
7119                   produced_chars++;
7120                 }
7121             no_more_source:
7122               ;
7123             }
7124           else
7125             while (src < src_end)
7126               {
7127                 bool multibytep = 1;
7128                 int c = *src++;
7129
7130                 if (dst >= dst_end - 1)
7131                   {
7132                     eassert (growable_destination (coding));
7133                     if (EQ (coding->src_object, coding->dst_object))
7134                       dst_end = (unsigned char *) src;
7135                     if (dst >= dst_end - 1)
7136                       {
7137                         ptrdiff_t offset = src - coding->source;
7138                         ptrdiff_t more_bytes;
7139
7140                         if (EQ (coding->src_object, coding->dst_object))
7141                           more_bytes = ((src_end - src) / 2) + 2;
7142                         else
7143                           more_bytes = src_end - src + 2;
7144                         dst = alloc_destination (coding, more_bytes, dst);
7145                         dst_end = coding->destination + coding->dst_bytes;
7146                         coding_set_source (coding);
7147                         src = coding->source + offset;
7148                         src_end = coding->source + coding->consumed;
7149                         if (EQ (coding->src_object, coding->dst_object))
7150                           dst_end = (unsigned char *) src;
7151                       }
7152                   }
7153                 EMIT_ONE_BYTE (c);
7154               }
7155         }
7156       else
7157         {
7158           if (!EQ (coding->src_object, coding->dst_object))
7159             {
7160               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7161
7162               if (require > 0)
7163                 {
7164                   ptrdiff_t offset = src - coding->source;
7165
7166                   dst = alloc_destination (coding, require, dst);
7167                   coding_set_source (coding);
7168                   src = coding->source + offset;
7169                   src_end = coding->source + coding->consumed;
7170                 }
7171             }
7172           produced_chars = coding->consumed_char;
7173           while (src < src_end)
7174             *dst++ = *src++;
7175         }
7176     }
7177
7178   produced = dst - (coding->destination + coding->produced);
7179   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7180     insert_from_gap (produced_chars, produced, 0);
7181   coding->produced += produced;
7182   coding->produced_char += produced_chars;
7183   return carryover;
7184 }
7185
7186 /* Compose text in CODING->dst_object according to the annotation data
7187    at CHARBUF.  CHARBUF is an array:
7188      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
        POS is the position where the composed text starts; it extends
        for NCHARS (CHARBUF[2]) characters.  When METHOD is
        COMPOSITION_RELATIVE, nil is handed to compose_text as the
        components.  Otherwise the COMPONENTS are gathered into a string
        when every entry read was non-negative, or into a vector when at
        least one negative marker entry was skipped (the entry that
        follows a marker is stored modulo 0x100).
7189  */
7190
7191 static void
7192 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7193 {
7194   int len;
7195   ptrdiff_t to;
7196   enum composition_method method;
7197   Lisp_Object components;
7198
       /* Entries remaining after the first MAX_ANNOTATION_LENGTH ones,
          which are skipped below before the components are read.  */
7199   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7200   to = pos + charbuf[2];
7201   method = (enum composition_method) (charbuf[4]);
7202
7203   if (method == COMPOSITION_RELATIVE)
7204     components = Qnil;
7205   else
7206     {
7207       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7208       int i, j;
7209
           /* With rules the length is recomputed from NCHARS; presumably
              NCHARS characters with a (marker, RULE) pair between each
              adjacent two, i.e. 3 * NCHARS - 2 entries -- TODO confirm.  */
7210       if (method == COMPOSITION_WITH_RULE)
7211         len = charbuf[2] * 3 - 2;
7212       charbuf += MAX_ANNOTATION_LENGTH;
7213       /* charbuf = [ CHAR ... CHAR ] or [ CHAR -2 RULE ... CHAR ] */
           /* I indexes CHARBUF and J indexes ARGS; they diverge as soon as
              a negative marker entry is skipped.  The scan also stops at
              an entry equal to -1.  */
7214       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7215         {
7216           if (charbuf[i] >= 0)
7217             args[j] = make_number (charbuf[i]);
7218           else
7219             {
7220               i++;
7221               args[j] = make_number (charbuf[i] % 0x100);
7222             }
7223         }
           /* I == J means no marker was skipped, so ARGS holds only
              non-negative entries and a string is built from them.  */
7224       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7225     }
7226   compose_text (pos, to, components, Qnil, coding->dst_object);
7227 }
7228
7229
7230 /* Put `charset' property on text in CODING->dst_object according to
7231    the annotation data at CHARBUF.  CHARBUF is an array:
7232      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
        The property is put on the NCHARS (CHARBUF[2]) characters that
        end at POS, and its value is the name of the charset whose ID is
        CHARSET-ID (CHARBUF[3]).
7233  */
7234
7235 static void
7236 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7237 {
       /* The annotated text ends at POS, so its start is NCHARS earlier.  */
7238   ptrdiff_t from = pos - charbuf[2];
7239   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7240
7241   Fput_text_property (make_number (from), make_number (pos),
7242                       Qcharset, CHARSET_NAME (charset),
7243                       coding->dst_object);
7244 }
7245
7246 #define MAX_CHARBUF_SIZE 0x4000
7247 /* How many units decoding functions expect in coding->charbuf at
7248    most.  Currently, decode_coding_emacs_mule expects the following
7249    size, and that is the largest value.  */
7250 #define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)
7251
     /* Set CODING->charbuf to a work area of `int' units obtained with
        SAFE_ALLOCA, and record the number of units in
        CODING->charbuf_size.  The unit count is SIZE plus
        MAX_CHARBUF_EXTRA_SIZE, capped at MAX_CHARBUF_SIZE.  Because the
        expansion uses SAFE_ALLOCA, the callers visible in this part of
        the file (decode_coding and encode_coding) declare
        USE_SAFE_ALLOCA beforehand and call SAFE_FREE when done.  */
7252 #define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
7253   do {                                                          \
7254     ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
7255                            MAX_CHARBUF_SIZE);                   \
7256     coding->charbuf = SAFE_ALLOCA (units * sizeof (int));       \
7257     coding->charbuf_size = units;                               \
7258   } while (0)
7259
7260 static void
7261 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7262 {
       /* Walk the first CODING->charbuf_used entries of CODING->charbuf
          and apply each composition or charset annotation found there.
          POS is the position corresponding to the first entry; it
          advances by one for every non-negative entry.  Nothing is done
          when CODING->dst_object is nil.  */
7263   int *charbuf = coding->charbuf;
7264   int *charbuf_end = charbuf + coding->charbuf_used;
7265
7266   if (NILP (coding->dst_object))
7267     return;
7268
7269   while (charbuf < charbuf_end)
7270     {
7271       if (*charbuf >= 0)
7272         pos++, charbuf++;
7273       else
7274         {
               /* A negative entry starts an annotation.  Its magnitude is
                  the number of entries to step over, and the entry after
                  it selects the handler.  Annotations of two entries or
                  fewer, and unknown kinds, are stepped over without
                  being dispatched.  */
7275           int len = -*charbuf;
7276
7277           if (len > 2)
7278             switch (charbuf[1])
7279               {
7280               case CODING_ANNOTATE_COMPOSITION_MASK:
7281                 produce_composition (coding, charbuf, pos);
7282                 break;
7283               case CODING_ANNOTATE_CHARSET_MASK:
7284                 produce_charset (coding, charbuf, pos);
7285                 break;
7286               }
7287           charbuf += len;
7288         }
7289     }
7290 }
7291
7292 /* Decode the data at CODING->src_object into CODING->dst_object.
7293    CODING->src_object is a buffer, a string, or nil.
7294    CODING->dst_object is a buffer.
7295
7296    If CODING->src_object is a buffer, it must be the current buffer.
7297    In this case, if CODING->src_pos is positive, it is a position of
7298    the source text in the buffer, otherwise, the source text is in the
7299    gap area of the buffer, and CODING->src_pos specifies the offset of
7300    the text from GPT (which must be the same as PT).  If this is the
7301    same buffer as CODING->dst_object, CODING->src_pos must be
7302    negative.
7303
7304    If CODING->src_object is a string, CODING->src_pos is an index to
7305    that string.
7306
7307    If CODING->src_object is nil, CODING->source must already point to
7308    the non-relocatable memory area.  In this case, CODING->src_pos is
7309    an offset from CODING->source.
7310
7311    The decoded data is inserted at the current point of the buffer
7312    CODING->dst_object.
        NOTE(review): the body tests BUFFERP (CODING->dst_object) before
        each buffer-specific step, so a non-buffer destination appears to
        be tolerated -- confirm against the callers.
7313 */
7314
7315 static void
7316 decode_coding (struct coding_system *coding)
7317 {
7318   Lisp_Object attrs;
7319   Lisp_Object undo_list;
7320   Lisp_Object translation_table;
7321   struct ccl_spec cclspec;
7322   int carryover;
7323   int i;
7324
7325   USE_SAFE_ALLOCA;
7326
       /* When the source is buffer text that starts before the gap and
          ends after it, move the gap to the start of the source
          (presumably so that the source bytes are contiguous -- TODO
          confirm).  */
7327   if (BUFFERP (coding->src_object)
7328       && coding->src_pos > 0
7329       && coding->src_pos < GPT
7330       && coding->src_pos + coding->src_chars > GPT)
7331     move_gap_both (coding->src_pos, coding->src_pos_byte);
7332
       /* UNDO_LIST keeps the destination buffer's undo list while
          recording is turned off below; it is put back at the end.  */
7333   undo_list = Qt;
7334   if (BUFFERP (coding->dst_object))
7335     {
7336       set_buffer_internal (XBUFFER (coding->dst_object));
7337       if (GPT != PT)
7338         move_gap_both (PT, PT_BYTE);
7339
7340       /* We must disable undo_list in order to record the whole insert
7341          transaction via record_insert at the end.  But doing so also
7342          disables the recording of the first change to the undo_list.
7343          Therefore we check for first change here and record it via
7344          record_first_change if needed.  */
7345       if (MODIFF <= SAVE_MODIFF)
7346         record_first_change ();
7347
7348       undo_list = BVAR (current_buffer, undo_list);
7349       bset_undo_list (current_buffer, Qt);
7350     }
7351
7352   coding->consumed = coding->consumed_char = 0;
7353   coding->produced = coding->produced_char = 0;
7354   coding->chars_at_source = 0;
7355   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7356
       /* Work area for decoded characters, sized from the number of
          source bytes (capped inside the macro).  */
7357   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7358
7359   attrs = CODING_ID_ATTRS (coding->id);
7360   translation_table = get_translation_table (attrs, 0, NULL);
7361
7362   carryover = 0;
       /* The CCL decoder is given CCLSPEC, a local of this function, as
          its spec, and the CCL program is set up from the coding system's
          decoder.  */
7363   if (coding->decoder == decode_coding_ccl)
7364     {
7365       coding->spec.ccl = &cclspec;
7366       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7367     }
       /* Each pass: the decoder fills CODING->charbuf after any
          carried-over entries, produce_chars writes them to the
          destination and returns a carryover count, and that many
          trailing entries are moved to the front of the charbuf for the
          next pass.  */
7368   do
7369     {
           /* Destination position at which this pass's output starts;
              used as the base position for annotations.  */
7370       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7371
7372       coding_set_source (coding);
7373       coding->annotated = 0;
7374       coding->charbuf_used = carryover;
7375       (*(coding->decoder)) (coding);
7376       coding_set_destination (coding);
7377       carryover = produce_chars (coding, translation_table, 0);
7378       if (coding->annotated)
7379         produce_annotation (coding, pos);
7380       for (i = 0; i < carryover; i++)
7381         coding->charbuf[i]
7382           = coding->charbuf[coding->charbuf_used - carryover + i];
7383     }
       /* Repeat while the destination was insufficient, or while source
          bytes remain and the result is success or invalid-source.  */
7384   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7385          || (coding->consumed < coding->src_bytes
7386              && (coding->result == CODING_RESULT_SUCCESS
7387                  || coding->result == CODING_RESULT_INVALID_SRC)));
7388
       /* Flush entries still carried over, this time passing 1 as the
          last argument of produce_chars (presumably a `last block' flag;
          the parameter's meaning is not visible here).  */
7389   if (carryover > 0)
7390     {
7391       coding_set_destination (coding);
7392       coding->charbuf_used = carryover;
7393       produce_chars (coding, translation_table, 1);
7394     }
7395
7396   coding->carryover_bytes = 0;
       /* Source bytes that the decoder left unconsumed.  */
7397   if (coding->consumed < coding->src_bytes)
7398     {
7399       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7400       const unsigned char *src;
7401
7402       coding_set_source (coding);
7403       coding_set_destination (coding);
7404       src = coding->source + coding->consumed;
7405
7406       if (coding->mode & CODING_MODE_LAST_BLOCK)
7407         {
7408           /* Flush out unprocessed data as binary chars.  We are sure
7409              that the number of data is less than the size of
7410              coding->charbuf.  */
7411           coding->charbuf_used = 0;
7412           coding->chars_at_source = 0;
7413
7414           while (nbytes-- > 0)
7415             {
7416               int c = *src++;
7417
                   /* Bytes with the high bit set become BYTE8 chars.  */
7418               if (c & 0x80)
7419                 c = BYTE8_TO_CHAR (c);
7420               coding->charbuf[coding->charbuf_used++] = c;
7421             }
7422           produce_chars (coding, Qnil, 1);
7423         }
7424       else
7425         {
7426           /* Record unprocessed bytes in coding->carryover.  We are
7427              sure that the number of data is less than the size of
7428              coding->carryover.  */
7429           unsigned char *p = coding->carryover;
7430
               /* Still clamp to the capacity of coding->carryover.  */
7431           if (nbytes > sizeof coding->carryover)
7432             nbytes = sizeof coding->carryover;
7433           coding->carryover_bytes = nbytes;
7434           while (nbytes-- > 0)
7435             *p++ = *src++;
7436         }
           /* In both cases the whole source is reported as consumed.  */
7437       coding->consumed = coding->src_bytes;
7438     }
7439
       /* Run end-of-line decoding unless the coding system's eol type is
          Qunix or eol conversion is inhibited.  */
7440   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7441       && !inhibit_eol_conversion)
7442     decode_eol (coding);
       /* Put the saved undo list back and record the whole insertion.  */
7443   if (BUFFERP (coding->dst_object))
7444     {
7445       bset_undo_list (current_buffer, undo_list);
7446       record_insert (coding->dst_pos, coding->produced_char);
7447     }
7448
7449   SAFE_FREE ();
7450 }
7451
7452
7453 /* Extract an annotation datum from a composition starting at POS and
7454    ending before LIMIT of CODING->src_object (buffer or string), store
7455    the data in BUF, set *STOP to a starting position of the next
7456    composition (if any) or to LIMIT, and return the address of the
7457    next element of BUF.
7458
7459    If such an annotation is not found, set *STOP to a starting
7460    position of a composition after POS (if any) or to LIMIT, and
7461    return BUF.  */
7462
7463 static int *
7464 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7465                                struct coding_system *coding, int *buf,
7466                                ptrdiff_t *stop)
7467 {
7468   ptrdiff_t start, end;
7469   Lisp_Object prop;
7470
       /* find_composition failed, or the composition it found ends
          beyond LIMIT: nothing to annotate up to LIMIT.  */
7471   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7472       || end > limit)
7473     *stop = limit;
       /* The composition found starts later; come back when we get there.  */
7474   else if (start > pos)
7475     *stop = start;
7476   else
7477     {
           /* START <= POS here.  Annotation data is stored only when the
              composition starts exactly at POS; otherwise just look for
              the following composition.  */
7478       if (start == pos)
7479         {
7480           /* We found a composition.  Store the corresponding
7481              annotation data in BUF.  */
               /* HEAD remembers where the annotation begins so that its
                  first entry can be adjusted after the components have
                  been appended.  */
7482           int *head = buf;
7483           enum composition_method method = composition_method (prop);
7484           int nchars = COMPOSITION_LENGTH (prop);
7485
7486           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7487           if (method != COMPOSITION_RELATIVE)
7488             {
7489               Lisp_Object components;
7490               ptrdiff_t i, len, i_byte;
7491
                   /* Append one entry per component; the components may be
                      a vector, a string, a single integer or a list.  Any
                      other type aborts.  */
7492               components = COMPOSITION_COMPONENTS (prop);
7493               if (VECTORP (components))
7494                 {
7495                   len = ASIZE (components);
7496                   for (i = 0; i < len; i++)
7497                     *buf++ = XINT (AREF (components, i));
7498                 }
7499               else if (STRINGP (components))
7500                 {
7501                   len = SCHARS (components);
7502                   i = i_byte = 0;
7503                   while (i < len)
7504                     {
7505                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7506                       buf++;
7507                     }
7508                 }
7509               else if (INTEGERP (components))
7510                 {
7511                   len = 1;
7512                   *buf++ = XINT (components);
7513                 }
7514               else if (CONSP (components))
7515                 {
7516                   for (len = 0; CONSP (components);
7517                        len++, components = XCDR (components))
7518                     *buf++ = XINT (XCAR (components));
7519                 }
7520               else
7521                 emacs_abort ();
                   /* Account for the LEN appended entries in the entry at
                      HEAD (presumably the negated annotation length --
                      TODO confirm against ADD_COMPOSITION_DATA).  */
7522               *head -= len;
7523             }
7524         }
7525
           /* Search again from END to find where the next composition
              starts; that, or LIMIT, becomes *STOP.  */
7526       if (find_composition (end, limit, &start, &end, &prop,
7527                             coding->src_object)
7528           && end <= limit)
7529         *stop = start;
7530       else
7531         *stop = limit;
7532     }
7533   return buf;
7534 }
7535
7536
7537 /* Extract an annotation datum from a text property `charset' at POS of
7538    CODING->src_object (buffer or string), store the data in BUF, set
7539    *STOP to the position where the value of `charset' property changes
7540    (limiting by LIMIT), and return the address of the next element of
7541    BUF.
7542
7543    If the property value is nil, set *STOP to the position where the
7544    property value is non-nil (limiting by LIMIT), and return BUF.
        NOTE(review): the body invokes ADD_CHARSET_DATA unconditionally,
        passing id -1 when the property is nil or is not a charset, and
        always sets *STOP from Fnext_single_property_change; confirm
        whether the paragraph above still describes the nil case.  */
7545
7546 static int *
7547 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7548                            struct coding_system *coding, int *buf,
7549                            ptrdiff_t *stop)
7550 {
7551   Lisp_Object val, next;
7552   int id;
7553
7554   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
       /* ID is the charset's numeric id, or -1 when the property value is
          nil or does not name a charset.  */
7555   if (! NILP (val) && CHARSETP (val))
7556     id = XINT (CHARSET_SYMBOL_ID (val));
7557   else
7558     id = -1;
7559   ADD_CHARSET_DATA (buf, 0, id);
       /* The next annotation check happens where the property next
          changes, but no later than LIMIT.  */
7560   next = Fnext_single_property_change (make_number (pos), Qcharset,
7561                                        coding->src_object,
7562                                        make_number (limit));
7563   *stop = XINT (next);
7564   return buf;
7565 }
7566
7567
7568 static void
7569 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7570                int max_lookup)
7571 {
       /* Fill CODING->charbuf with characters read from the source,
          starting at byte offset CODING->consumed, applying end-of-line
          conversion and TRANSLATION_TABLE (when non-nil) on the way, and
          inserting charset annotations when they are enabled in
          CODING->common_flags.  MAX_LOOKUP is the number of ints in the
          temporary buffer used for translation lookups.  On return
          CODING->consumed, CODING->consumed_char and CODING->charbuf_used
          describe what was read and stored, and CODING->chars_at_source
          is 0.  */
7572   int *buf = coding->charbuf;
7573   int *buf_end = coding->charbuf + coding->charbuf_size;
7574   const unsigned char *src = coding->source + coding->consumed;
7575   const unsigned char *src_end = coding->source + coding->src_bytes;
7576   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7577   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7578   bool multibytep = coding->src_multibyte;
7579   Lisp_Object eol_type;
7580   int c;
7581   ptrdiff_t stop, stop_composition, stop_charset;
7582   int *lookup_buf = NULL;
7583
       /* LOOKUP_BUF is allocated only when there is a translation table;
          it is written only in the branch below that handles a non-nil
          TRANS.  */
7584   if (! NILP (translation_table))
7585     lookup_buf = alloca (sizeof (int) * max_lookup);
7586
       /* An eol type that is still a vector is treated as Qunix here
          (presumably meaning `not yet decided' -- TODO confirm).  */
7587   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7588   if (VECTORP (eol_type))
7589     eol_type = Qunix;
7590
7591   /* Note: composition handling is not yet implemented.  */
7592   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7593
       /* STOP is the next position at which an annotation handler has to
          run: the smaller of STOP_COMPOSITION and STOP_CHARSET.  With no
          source object there is nothing to annotate.  */
7594   if (NILP (coding->src_object))
7595     stop = stop_composition = stop_charset = end_pos;
7596   else
7597     {
7598       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7599         stop = stop_composition = pos;
7600       else
7601         stop = stop_composition = end_pos;
7602       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7603         stop = stop_charset = pos;
7604       else
7605         stop_charset = end_pos;
7606     }
7607
7608   /* Compensate for CRLF and conversion.  */
       /* That is, keep 1 + MAX_ANNOTATION_LENGTH entries in reserve.  */
7609   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7610   while (buf < buf_end)
7611     {
7612       Lisp_Object trans;
7613
7614       if (pos == stop)
7615         {
7616           if (pos == end_pos)
7617             break;
7618           if (pos == stop_composition)
7619             buf = handle_composition_annotation (pos, end_pos, coding,
7620                                                  buf, &stop_composition);
7621           if (pos == stop_charset)
7622             buf = handle_charset_annotation (pos, end_pos, coding,
7623                                              buf, &stop_charset);
7624           stop = (stop_composition < stop_charset
7625                   ? stop_composition : stop_charset);
7626         }
7627
           /* Read one character into C.  For a unibyte source, the
              raw-text and CCL encoders take each byte as is; otherwise a
              valid multibyte sequence is read as one character (POS then
              advances by its byte length) and any other byte becomes a
              BYTE8 character.  */
7628       if (! multibytep)
7629         {
7630           int bytes;
7631
7632           if (coding->encoder == encode_coding_raw_text
7633               || coding->encoder == encode_coding_ccl)
7634             c = *src++, pos++;
7635           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7636             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7637           else
7638             c = BYTE8_TO_CHAR (*src), src++, pos++;
7639         }
7640       else
7641         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
           /* In selective-display mode a carriage return is treated as a
              newline.  */
7642       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7643         c = '\n';
           /* Newline conversion: for Qdos a '\r' is stored in front of
              the '\n'; for any other non-unix eol type the '\n' itself
              becomes '\r'.  */
7644       if (! EQ (eol_type, Qunix))
7645         {
7646           if (c == '\n')
7647             {
7648               if (EQ (eol_type, Qdos))
7649                 *buf++ = '\r';
7650               else
7651                 c = '\r';
7652             }
7653         }
7654
7655       trans = Qnil;
7656       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7657       if (NILP (trans))
7658         *buf++ = c;
7659       else
7660         {
7661           ptrdiff_t from_nchars = 1, to_nchars = 1;
7662           int *lookup_buf_end;
7663           const unsigned char *p = src;
7664           int i;
7665
               /* Gather C plus up to MAX_LOOKUP - 1 following characters
                  (read through P, so SRC is not advanced) for
                  get_translation.  */
7666           lookup_buf[0] = c;
7667           for (i = 1; i < max_lookup && p < src_end; i++)
7668             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7669           lookup_buf_end = lookup_buf + i;
7670           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* The result is either a single character, or a cons whose
                  car is a vector of FROM_NCHARS source characters and
                  whose cdr is a character or a vector of TO_NCHARS
                  characters.  Any other result ends the loop.  */
7671           if (INTEGERP (trans))
7672             c = XINT (trans);
7673           else if (CONSP (trans))
7674             {
7675               from_nchars = ASIZE (XCAR (trans));
7676               trans = XCDR (trans);
7677               if (INTEGERP (trans))
7678                 c = XINT (trans);
7679               else
7680                 {
7681                   to_nchars = ASIZE (trans);
                       /* Stop if the replacement does not fit.  */
7682                   if (buf_end - buf < to_nchars)
7683                     break;
7684                   c = XINT (AREF (trans, 0));
7685                 }
7686             }
7687           else
7688             break;
7689           *buf++ = c;
7690           for (i = 1; i < to_nchars; i++)
7691             *buf++ = XINT (AREF (trans, i));
               /* Step over the other FROM_NCHARS - 1 source characters
                  covered by the translation.  */
7692           for (i = 1; i < from_nchars; i++, pos++)
7693             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7694         }
7695     }
7696
7697   coding->consumed = src - coding->source;
7698   coding->consumed_char = pos - coding->src_pos;
7699   coding->charbuf_used = buf - coding->charbuf;
7700   coding->chars_at_source = 0;
7701 }
7702
7703
7704 /* Encode the text at CODING->src_object into CODING->dst_object.
7705    CODING->src_object is a buffer or a string.
7706    CODING->dst_object is a buffer or nil.
7707
7708    If CODING->src_object is a buffer, it must be the current buffer.
7709    In this case, if CODING->src_pos is positive, it is a position of
7710    the source text in the buffer, otherwise, the source text is in the
7711    gap area of the buffer, and coding->src_pos specifies the offset of
7712    the text from GPT (which must be the same as PT).  If this is the
7713    same buffer as CODING->dst_object, CODING->src_pos must be
7714    negative and CODING should not have `pre-write-conversion'.
7715
7716    If CODING->src_object is a string, CODING should not have
7717    `pre-write-conversion'.
7718
7719    If CODING->dst_object is a buffer, the encoded data is inserted at
7720    the current point of that buffer.
7721
7722    If CODING->dst_object is nil, the encoded data is placed at the
7723    memory area specified by CODING->destination.  */
7724
7725 static void
7726 encode_coding (struct coding_system *coding)
7727 {
7728   Lisp_Object attrs;
7729   Lisp_Object translation_table;
7730   int max_lookup;
7731   struct ccl_spec cclspec;
7732
7733   USE_SAFE_ALLOCA;
7734
       /* Raw-text encoding uses no translation table; otherwise fetch the
          coding system's table together with its maximum lookup length.  */
7735   attrs = CODING_ID_ATTRS (coding->id);
7736   if (coding->encoder == encode_coding_raw_text)
7737     translation_table = Qnil, max_lookup = 0;
7738   else
7739     translation_table = get_translation_table (attrs, 1, &max_lookup);
7740
       /* A buffer destination becomes the current buffer, and its
          multibyteness decides CODING->dst_multibyte.  */
7741   if (BUFFERP (coding->dst_object))
7742     {
7743       set_buffer_internal (XBUFFER (coding->dst_object));
7744       coding->dst_multibyte
7745         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7746     }
7747
7748   coding->consumed = coding->consumed_char = 0;
7749   coding->produced = coding->produced_char = 0;
7750   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7751
       /* Work area for source characters, sized from the number of
          source characters (capped inside the macro).  */
7752   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7753
       /* The CCL encoder is given CCLSPEC, a local of this function, as
          its spec.  */
7754   if (coding->encoder == encode_coding_ccl)
7755     {
7756       coding->spec.ccl = &cclspec;
7757       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7758     }
       /* Each pass loads characters into CODING->charbuf with
          consume_chars and then runs the encoder on them, until every
          source character has been consumed.  */
7759   do {
7760     coding_set_source (coding);
7761     consume_chars (coding, translation_table, max_lookup);
7762     coding_set_destination (coding);
7763     (*(coding->encoder)) (coding);
7764   } while (coding->consumed_char < coding->src_chars);
7765
       /* For a buffer destination, turn the produced bytes into buffer
          text.  */
7766   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7767     insert_from_gap (coding->produced_char, coding->produced, 0);
7768
7769   SAFE_FREE ();
7770 }
7771
7772
7773 /* Name (or base name) of work buffer for code conversion.  */
7774 static Lisp_Object Vcode_conversion_workbuf_name;
7775
7776 /* A working buffer used by the top level conversion.  Once it is
7777    created, it is never destroyed.  It has the name
7778    Vcode_conversion_workbuf_name.  The other working buffers are
7779    destroyed after the use is finished, and their names are modified
7780    versions of Vcode_conversion_workbuf_name.  */
7781 static Lisp_Object Vcode_conversion_reused_workbuf;
7782
7783 /* True if and only if Vcode_conversion_reused_workbuf is already in
        use.  Set by make_conversion_work_buffer when it hands out that
        buffer, and cleared by code_conversion_restore.  */
7784 static bool reused_workbuf_in_use;
7785
7786
7787 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7788    multibyteness of returning buffer.
        The shared buffer Vcode_conversion_reused_workbuf is returned when
        it is free (and is created, or re-created if it is no longer live,
        on demand); when it is already in use a buffer with a freshly
        generated name is created instead.  The returned buffer is erased,
        has undo recording turned off and has modification hooks
        inhibited.  The current buffer is unchanged on return.  */
7789
7790 static Lisp_Object
7791 make_conversion_work_buffer (bool multibyte)
7792 {
7793   Lisp_Object name, workbuf;
7794   struct buffer *current;
7795
7796   if (reused_workbuf_in_use)
7797     {
7798       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7799       workbuf = Fget_buffer_create (name);
7800     }
7801   else
7802     {
           /* Claim the shared buffer before touching it.  */
7803       reused_workbuf_in_use = 1;
7804       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7805         Vcode_conversion_reused_workbuf
7806           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7807       workbuf = Vcode_conversion_reused_workbuf;
7808     }
       /* Switch to the work buffer just long enough to initialise it.  */
7809   current = current_buffer;
7810   set_buffer_internal (XBUFFER (workbuf));
7811   /* We can't allow modification hooks to run in the work buffer.  For
7812      instance, directory_files_internal assumes that file decoding
7813      doesn't compile new regexps.  */
7814   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7815   Ferase_buffer ();
7816   bset_undo_list (current_buffer, Qt);
7817   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7818   set_buffer_internal (current);
7819   return workbuf;
7820 }
7821
7822
7823 static void
7824 code_conversion_restore (Lisp_Object arg)
7825 {
       /* Unwind function registered by code_conversion_save.  ARG is the
          cons (CURRENT . WORKBUF) built there.  Unless WORKBUF is nil it
          is released: the shared reusable buffer is merely marked free,
          any other work buffer is killed.  Then CURRENT is made the
          current buffer again.  */
7826   Lisp_Object current, workbuf;
7827   struct gcpro gcpro1;
7828
7829   GCPRO1 (arg);
7830   current = XCAR (arg);
7831   workbuf = XCDR (arg);
7832   if (! NILP (workbuf))
7833     {
7834       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7835         reused_workbuf_in_use = 0;
7836       else
7837         Fkill_buffer (workbuf);
7838     }
7839   set_buffer_internal (XBUFFER (current));
7840   UNGCPRO;
7841 }
7842
7843 Lisp_Object
7844 code_conversion_save (bool with_work_buf, bool multibyte)
7845 {
       /* Register code_conversion_restore with record_unwind_protect so
          that the buffer that is current now is made current again on
          unwinding.  If WITH_WORK_BUF, first obtain a work buffer of the
          multibyteness given by MULTIBYTE; the unwind function releases
          it.  Return the work buffer, or Qnil when none was requested.  */
7846   Lisp_Object workbuf = Qnil;
7847
7848   if (with_work_buf)
7849     workbuf = make_conversion_work_buffer (multibyte);
7850   record_unwind_protect (code_conversion_restore,
7851                          Fcons (Fcurrent_buffer (), workbuf));
7852   return workbuf;
7853 }
7854
7855 void
7856 decode_coding_gap (struct coding_system *coding,
7857                    ptrdiff_t chars, ptrdiff_t bytes)
7858 {
       /* Decode source text of CHARS characters and BYTES bytes with the
          current buffer as both source and destination.  CODING->src_pos
          is set to -CHARS and the end-of-line code below addresses the
          source as the BYTES bytes that end at GAP_END_ADDR; the result
          is placed at PT.  When the text qualifies for the ASCII (or
          already-validated UTF-8) fast path, only end-of-line conversion
          is done in place and the bytes are inserted directly; otherwise
          decode_coding does the work and the coding system's post-read
          function, if any, is called afterwards.  CODING->produced and
          CODING->produced_char report the result.  */
7859   ptrdiff_t count = SPECPDL_INDEX ();
7860   Lisp_Object attrs;
7861
7862   coding->src_object = Fcurrent_buffer ();
7863   coding->src_chars = chars;
7864   coding->src_bytes = bytes;
7865   coding->src_pos = -chars;
7866   coding->src_pos_byte = -bytes;
7867   coding->src_multibyte = chars < bytes;
7868   coding->dst_object = coding->src_object;
7869   coding->dst_pos = PT;
7870   coding->dst_pos_byte = PT_BYTE;
7871   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7872
       /* Reset the detection results; negative values mean `not known'
          to the checks below.  */
7873   coding->head_ascii = -1;
7874   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7875   coding->eol_seen = EOL_SEEN_NONE;
7876   if (CODING_REQUIRE_DETECTION (coding))
7877     detect_coding (coding);
7878   attrs = CODING_ID_ATTRS (coding->id);
       /* Fast path: ASCII optimization not disabled, unibyte source,
          ASCII-compatible coding system, no post-read function and no
          decoding translation table.  */
7879   if (! disable_ascii_optimization
7880       && ! coding->src_multibyte
7881       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7882       && NILP (CODING_ATTR_POST_READ (attrs))
7883       && NILP (get_translation_table (attrs, 0, NULL)))
7884     {
           /* From here on CHARS is the character count for the fast path,
              or -1 when the fast path cannot be used.  */
7885       chars = coding->head_ascii;
7886       if (chars < 0)
7887         chars = check_ascii (coding);
7888       if (chars != bytes)
7889         {
7890           /* There exists a non-ASCII byte.  */
7891           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7892               && coding->detected_utf8_bytes == coding->src_bytes)
7893             {
7894               if (coding->detected_utf8_chars >= 0)
7895                 chars = coding->detected_utf8_chars;
7896               else
7897                 chars = check_utf_8 (coding);
                   /* Unless the coding system is `without BOM', a leading
                      UTF-8 BOM is excluded from the counts: one character
                      and three bytes.  */
7898               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7899                   && coding->head_ascii == 0
7900                   && coding->source[0] == UTF_8_BOM_1
7901                   && coding->source[1] == UTF_8_BOM_2
7902                   && coding->source[2] == UTF_8_BOM_3)
7903                 {
7904                   chars--;
7905                   bytes -= 3;
7906                   coding->src_bytes -= 3;
7907                 }
7908             }
7909           else
7910             chars = -1;
7911         }
7912       if (chars >= 0)
7913         {
7914           Lisp_Object eol_type;
7915
               /* An eol type that is still a vector is settled from what
                  was seen in the text, if anything was seen.  */
7916           eol_type = CODING_ID_EOL_TYPE (coding->id);
7917           if (VECTORP (eol_type))
7918             {
7919               if (coding->eol_seen != EOL_SEEN_NONE)
7920                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7921             }
7922           if (EQ (eol_type, Qmac))
7923             {
                   /* Replace every '\r' with '\n' in place.  */
7924               unsigned char *src_end = GAP_END_ADDR;
7925               unsigned char *src = src_end - coding->src_bytes;
7926
7927               while (src < src_end)
7928                 {
7929                   if (*src++ == '\r')
7930                     src[-1] = '\n';
7931                 }
7932             }
7933           else if (EQ (eol_type, Qdos))
7934             {
                   /* Copy the bytes backwards from the end onto
                      themselves, dropping the '\r' of every '\r' '\n'
                      pair.  DIFF ends up as the number of bytes dropped,
                      which is subtracted from both counts.  */
7935               unsigned char *src = GAP_END_ADDR;
7936               unsigned char *src_beg = src - coding->src_bytes;
7937               unsigned char *dst = src;
7938               ptrdiff_t diff;
7939
7940               while (src_beg < src)
7941                 {
7942                   *--dst = *--src;
7943                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7944                     src--;
7945                 }
7946               diff = dst - src;
7947               bytes -= diff;
7948               chars -= diff;
7949             }
7950           coding->produced = bytes;
7951           coding->produced_char = chars;
7952           insert_from_gap (chars, bytes, 1);
               /* Nothing has been pushed on the specpdl in this function
                  yet, so no unbind_to is needed on this path.  */
7953           return;
7954         }
7955     }
       /* General path: arrange for the current buffer to be restored on
          unwinding (no work buffer is requested).  */
7956   code_conversion_save (0, 0);
7957
       /* The whole source is decoded in one call, so mark it as the last
          block.  text->inhibit_shrinking is set around the call
          (presumably to keep the gap, which holds the source, from being
          shrunk -- TODO confirm).  */
7958   coding->mode |= CODING_MODE_LAST_BLOCK;
7959   current_buffer->text->inhibit_shrinking = 1;
7960   decode_coding (coding);
7961   current_buffer->text->inhibit_shrinking = 0;
7962
       /* Call the post-read function with point at the start of the
          decoded text and the number of decoded characters as argument,
          then adjust the produced counts by the change in buffer size.  */
7963   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7964     {
7965       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7966       Lisp_Object val;
7967
7968       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7969       val = call1 (CODING_ATTR_POST_READ (attrs),
7970                    make_number (coding->produced_char));
7971       CHECK_NATNUM (val);
7972       coding->produced_char += Z - prev_Z;
7973       coding->produced += Z_BYTE - prev_Z_BYTE;
7974     }
7975
7976   unbind_to (count, Qnil);
7977 }
7978
7979
7980 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7981    SRC_OBJECT into DST_OBJECT by coding context CODING.
7982
7983    SRC_OBJECT is a buffer, a string, or Qnil.
7984
7985    If it is a buffer, the text is at point of the buffer.  FROM and TO
7986    are positions in the buffer.
7987
7988    If it is a string, the text is at the beginning of the string.
7989    FROM and TO are indices to the string.
7990
7991    If it is nil, the text is at coding->source.  FROM and TO are
7992    indices to coding->source.
7993
7994    DST_OBJECT is a buffer, Qt, or Qnil.
7995
7996    If it is a buffer, the decoded text is inserted at point of the
7997    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7998    is deleted.
7999
8000    If it is Qt, a string is made from the decoded text, and
8001    set in CODING->dst_object.
8002
8003    If it is Qnil, the decoded text is stored at CODING->destination.
8004    The caller must allocate CODING->dst_bytes bytes at
8005    CODING->destination by xmalloc.  If the decoded text is longer than
8006    CODING->dst_bytes, CODING->destination is relocated by xrealloc.

        If coding-system detection is still required, it is performed
        before decoding.  If the resulting coding system has a post-read
        function, that function is called (through safe_call1) with the
        number of decoded characters, with point at the start of the
        decoded text; CODING->produced and CODING->produced_char are then
        adjusted by the change in buffer size it caused.

        Vdeactivate_mark is saved on entry and restored before returning.
        When SRC_OBJECT and DST_OBJECT are the same buffer, point and the
        markers that sat on the boundary of the replaced region are
        repositioned after decoding.
8007  */
8008
8009 void
8010 decode_coding_object (struct coding_system *coding,
8011                       Lisp_Object src_object,
8012                       ptrdiff_t from, ptrdiff_t from_byte,
8013                       ptrdiff_t to, ptrdiff_t to_byte,
8014                       Lisp_Object dst_object)
8015 {
8016   ptrdiff_t count = SPECPDL_INDEX ();
8017   unsigned char *destination IF_LINT (= NULL);
8018   ptrdiff_t dst_bytes IF_LINT (= 0);
8019   ptrdiff_t chars = to - from;
8020   ptrdiff_t bytes = to_byte - from_byte;
8021   Lisp_Object attrs;
       /* saved_pt stays negative unless point is moved in the same-buffer
          case below; the recovery code at the end keys off that.  */
8022   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8023   bool need_marker_adjustment = 0;
8024   Lisp_Object old_deactivate_mark;
8025
8026   old_deactivate_mark = Vdeactivate_mark;
8027
       /* Raw-memory destination: remember the caller's area and its size
          so that a result produced in a work buffer can be copied back.  */
8028   if (NILP (dst_object))
8029     {
8030       destination = coding->destination;
8031       dst_bytes = coding->dst_bytes;
8032     }
8033
8034   coding->src_object = src_object;
8035   coding->src_chars = chars;
8036   coding->src_bytes = bytes;
       /* More bytes than characters: treat the source as multibyte.  */
8037   coding->src_multibyte = chars < bytes;
8038
8039   if (STRINGP (src_object))
8040     {
8041       coding->src_pos = from;
8042       coding->src_pos_byte = from_byte;
8043     }
8044   else if (BUFFERP (src_object))
8045     {
8046       set_buffer_internal (XBUFFER (src_object));
8047       if (from != GPT)
8048         move_gap_both (from, from_byte);
8049       if (EQ (src_object, dst_object))
8050         {
8051           struct Lisp_Marker *tail;
8052
               /* Flag the markers that sit exactly on the boundary of the
                  region being replaced (FROM for insertion_type markers,
                  TO for the others) so they can be repositioned once the
                  decoded length is known.  */
8053           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8054             {
8055               tail->need_adjustment
8056                 = tail->charpos == (tail->insertion_type ? from : to);
8057               need_marker_adjustment |= tail->need_adjustment;
8058             }
8059           saved_pt = PT, saved_pt_byte = PT_BYTE;
8060           TEMP_SET_PT_BOTH (from, from_byte);
               /* The source text is deleted with buffer shrinking inhibited,
                  and the source position is recorded as the negated length.
                  NOTE(review): presumably this tells the decoder that the
                  deleted bytes are still readable in the gap -- confirm
                  against the code that sets up the source pointer.  */
8061           current_buffer->text->inhibit_shrinking = 1;
8062           del_range_both (from, from_byte, to, to_byte, 1);
8063           coding->src_pos = -chars;
8064           coding->src_pos_byte = -bytes;
8065         }
8066       else
8067         {
8068           coding->src_pos = from;
8069           coding->src_pos_byte = from_byte;
8070         }
8071     }
8072
8073   if (CODING_REQUIRE_DETECTION (coding))
8074     detect_coding (coding);
       /* Read the attributes only now: detection above may have changed
          coding->id.  (NOTE(review): inferred from the ordering; confirm.)  */
8075   attrs = CODING_ID_ATTRS (coding->id);
8076
       /* Choose where the decoded text goes.  A work buffer obtained from
          code_conversion_save is used when a string result is requested,
          or when the result goes to raw memory but a post-read function
          has to run on it first.  */
8077   if (EQ (dst_object, Qt)
8078       || (! NILP (CODING_ATTR_POST_READ (attrs))
8079           && NILP (dst_object)))
8080     {
8081       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8082       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8083       coding->dst_pos = BEG;
8084       coding->dst_pos_byte = BEG_BYTE;
8085     }
8086   else if (BUFFERP (dst_object))
8087     {
           /* Decode straight into DST_OBJECT at its point, following that
              buffer's own multibyte setting.  */
8088       code_conversion_save (0, 0);
8089       coding->dst_object = dst_object;
8090       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8091       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8092       coding->dst_multibyte
8093         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8094     }
8095   else
8096     {
8097       code_conversion_save (0, 0);
8098       coding->dst_object = Qnil;
8099       /* Most callers presume this will return a multibyte result, and they
8100          won't use `binary' or `raw-text' anyway, so let's not worry about
8101          CODING_FOR_UNIBYTE.  */
8102       coding->dst_multibyte = 1;
8103     }
8104
8105   decode_coding (coding);
8106
8107   if (BUFFERP (coding->dst_object))
8108     set_buffer_internal (XBUFFER (coding->dst_object));
8109
8110   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8111     {
8112       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
           /* Buffer end before the call, to measure how much the post-read
              function grows or shrinks the text.  */
8113       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8114       Lisp_Object val;
8115
8116       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8117       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8118               old_deactivate_mark);
8119       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8120                         make_number (coding->produced_char));
8121       UNGCPRO;
8122       CHECK_NATNUM (val);
8123       coding->produced_char += Z - prev_Z;
8124       coding->produced += Z_BYTE - prev_Z_BYTE;
8125     }
8126
8127   if (EQ (dst_object, Qt))
8128     {
           /* String result: the contents of the (now current) work buffer.  */
8129       coding->dst_object = Fbuffer_string ();
8130     }
8131   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8132     {
           /* The result was produced in a work buffer although the caller
              asked for raw memory (the post-read case above).  */
8133       set_buffer_internal (XBUFFER (coding->dst_object));
           /* NOTE(review): the copy below happens only when the caller's
              area was too small; when dst_bytes >= coding->produced
              nothing is copied here.  Confirm that this is intended.  */
8134       if (dst_bytes < coding->produced)
8135         {
8136           eassert (coding->produced > 0);
8137           destination = xrealloc (destination, coding->produced);
               /* Move the gap out of the text to be copied so that it is
                  contiguous starting at BEGV_ADDR.  */
8138           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8139             move_gap_both (BEGV, BEGV_BYTE);
8140           memcpy (destination, BEGV_ADDR, coding->produced);
8141           coding->destination = destination;
8142         }
8143     }
8144
8145   if (saved_pt >= 0)
8146     {
8147       /* This is the case of:
8148          (BUFFERP (src_object) && EQ (src_object, dst_object))
8149          As we have moved PT while replacing the original buffer
8150          contents, we must recover it now.  */
8151       set_buffer_internal (XBUFFER (src_object));
8152       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region is unchanged; point inside the old
              region goes to FROM; point after it is shifted by the change
              in length (counted in bytes for a unibyte buffer).  */
8153       if (saved_pt < from)
8154         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8155       else if (saved_pt < from + chars)
8156         TEMP_SET_PT_BOTH (from, from_byte);
8157       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8158         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8159                           saved_pt_byte + (coding->produced - bytes));
8160       else
8161         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8162                           saved_pt_byte + (coding->produced - bytes));
8163
8164       if (need_marker_adjustment)
8165         {
8166           struct Lisp_Marker *tail;
8167
               /* Put each flagged marker back on the boundary it was on:
                  the start of the new text for insertion_type markers,
                  its end for the others.  */
8168           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8169             if (tail->need_adjustment)
8170               {
8171                 tail->need_adjustment = 0;
8172                 if (tail->insertion_type)
8173                   {
8174                     tail->bytepos = from_byte;
8175                     tail->charpos = from;
8176                   }
8177                 else
8178                   {
8179                     tail->bytepos = from_byte + coding->produced;
8180                     tail->charpos
8181                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8182                          ? tail->bytepos : from + coding->produced_char);
8183                   }
8184               }
8185         }
8186     }
8187
8188   Vdeactivate_mark = old_deactivate_mark;
       /* Unwind everything pushed on the specpdl since entry.  */
8189   unbind_to (count, coding->dst_object);
8190 }
8191
8192
8193 void
8194 encode_coding_object (struct coding_system *coding,
8195                       Lisp_Object src_object,
8196                       ptrdiff_t from, ptrdiff_t from_byte,
8197                       ptrdiff_t to, ptrdiff_t to_byte,
8198                       Lisp_Object dst_object)
8199 {
8200   ptrdiff_t count = SPECPDL_INDEX ();
8201   ptrdiff_t chars = to - from;
8202   ptrdiff_t bytes = to_byte - from_byte;
8203   Lisp_Object attrs;
8204   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8205   bool need_marker_adjustment = 0;
8206   bool kill_src_buffer = 0;
8207   Lisp_Object old_deactivate_mark;
8208
8209   old_deactivate_mark = Vdeactivate_mark;
8210
8211   coding->src_object = src_object;
8212   coding->src_chars = chars;
8213   coding->src_bytes = bytes;
8214   coding->src_multibyte = chars < bytes;
8215
8216   attrs = CODING_ID_ATTRS (coding->id);
8217
8218   if (EQ (src_object, dst_object))
8219     {
8220       struct Lisp_Marker *tail;
8221
8222       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8223         {
8224           tail->need_adjustment
8225             = tail->charpos == (tail->insertion_type ? from : to);
8226           need_marker_adjustment |= tail->need_adjustment;
8227         }
8228     }
8229
8230   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8231     {
8232       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8233       set_buffer_internal (XBUFFER (coding->src_object));
8234       if (STRINGP (src_object))
8235         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8236       else if (BUFFERP (src_object))
8237         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8238       else
8239         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8240
8241       if (EQ (src_object, dst_object))
8242         {
8243           set_buffer_internal (XBUFFER (src_object));
8244           saved_pt = PT, saved_pt_byte = PT_BYTE;
8245           del_range_both (from, from_byte, to, to_byte, 1);
8246           set_buffer_internal (XBUFFER (coding->src_object));
8247         }
8248
8249       {
8250         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8251
8252         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8253                 old_deactivate_mark);
8254         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8255                     make_number (BEG), make_number (Z));
8256         UNGCPRO;
8257       }
8258       if (XBUFFER (coding->src_object) != current_buffer)
8259         kill_src_buffer = 1;
8260       coding->src_object = Fcurrent_buffer ();
8261       if (BEG != GPT)
8262         move_gap_both (BEG, BEG_BYTE);
8263       coding->src_chars = Z - BEG;
8264       coding->src_bytes = Z_BYTE - BEG_BYTE;
8265       coding->src_pos = BEG;
8266       coding->src_pos_byte = BEG_BYTE;
8267       coding->src_multibyte = Z < Z_BYTE;
8268     }
8269   else if (STRINGP (src_object))
8270     {
8271       code_conversion_save (0, 0);
8272       coding->src_pos = from;
8273       coding->src_pos_byte = from_byte;
8274     }
8275   else if (BUFFERP (src_object))
8276     {
8277       code_conversion_save (0, 0);
8278       set_buffer_internal (XBUFFER (src_object));
8279       if (EQ (src_object, dst_object))
8280         {
8281           saved_pt = PT, saved_pt_byte = PT_BYTE;
8282           coding->src_object = del_range_1 (from, to, 1, 1);
8283           coding->src_pos = 0;
8284           coding->src_pos_byte = 0;
8285         }
8286       else
8287         {
8288           if (from < GPT && to >= GPT)
8289             move_gap_both (from, from_byte);
8290           coding->src_pos = from;
8291           coding->src_pos_byte = from_byte;
8292         }
8293     }
8294   else
8295     code_conversion_save (0, 0);
8296
8297   if (BUFFERP (dst_object))
8298     {
8299       coding->dst_object = dst_object;
8300       if (EQ (src_object, dst_object))
8301         {
8302           coding->dst_pos = from;
8303           coding->dst_pos_byte = from_byte;
8304         }
8305       else
8306         {
8307           struct buffer *current = current_buffer;
8308
8309           set_buffer_temp (XBUFFER (dst_object));
8310           coding->dst_pos = PT;
8311           coding->dst_pos_byte = PT_BYTE;
8312           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8313           set_buffer_temp (current);
8314         }
8315       coding->dst_multibyte
8316         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8317     }
8318   else if (EQ (dst_object, Qt))
8319     {
8320       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8321       coding->dst_object = Qnil;
8322       coding->destination = xmalloc (dst_bytes);
8323       coding->dst_bytes = dst_bytes;
8324       coding->dst_multibyte = 0;
8325     }
8326   else
8327     {
8328       coding->dst_object = Qnil;
8329       coding->dst_multibyte = 0;
8330     }
8331
8332   encode_coding (coding);
8333
8334   if (EQ (dst_object, Qt))
8335     {
8336       if (BUFFERP (coding->dst_object))
8337         coding->dst_object = Fbuffer_string ();
8338       else if (coding->raw_destination)
8339         /* This is used to avoid creating huge Lisp string.
8340            NOTE: caller who sets `raw_destination' is also
8341            responsible for freeing `destination' buffer.  */
8342         coding->dst_object = Qnil;
8343       else
8344         {
8345           coding->dst_object
8346             = make_unibyte_string ((char *) coding->destination,
8347                                    coding->produced);
8348           xfree (coding->destination);
8349         }
8350     }
8351
8352   if (saved_pt >= 0)
8353     {
8354       /* This is the case of:
8355          (BUFFERP (src_object) && EQ (src_object, dst_object))
8356          As we have moved PT while replacing the original buffer
8357          contents, we must recover it now.  */
8358       set_buffer_internal (XBUFFER (src_object));
8359       if (saved_pt < from)
8360         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8361       else if (saved_pt < from + chars)
8362         TEMP_SET_PT_BOTH (from, from_byte);
8363       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8364         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8365                           saved_pt_byte + (coding->produced - bytes));
8366       else
8367         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8368                           saved_pt_byte + (coding->produced - bytes));
8369
8370       if (need_marker_adjustment)
8371         {
8372           struct Lisp_Marker *tail;
8373
8374           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8375             if (tail->need_adjustment)
8376               {
8377                 tail->need_adjustment = 0;
8378                 if (tail->insertion_type)
8379                   {
8380                     tail->bytepos = from_byte;
8381                     tail->charpos = from;
8382                   }
8383                 else
8384                   {
8385                     tail->bytepos = from_byte + coding->produced;
8386                     tail->charpos
8387                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8388                          ? tail->bytepos : from + coding->produced_char);
8389                   }
8390               }
8391         }
8392     }
8393
8394   if (kill_src_buffer)
8395     Fkill_buffer (coding->src_object);
8396
8397   Vdeactivate_mark = old_deactivate_mark;
8398   unbind_to (count, Qnil);
8399 }
8400
8401
8402 Lisp_Object
8403 preferred_coding_system (void)
8404 {
8405   int id = coding_categories[coding_priorities[0]].id;
8406
8407   return CODING_ID_NAME (id);
8408 }
8409
8410 #if defined (WINDOWSNT) || defined (CYGWIN)
8411
8412 Lisp_Object
8413 from_unicode (Lisp_Object str)
8414 {
8415   CHECK_STRING (str);
8416   if (!STRING_MULTIBYTE (str) &&
8417       SBYTES (str) & 1)
8418     {
8419       str = Fsubstring (str, make_number (0), make_number (-1));
8420     }
8421
8422   return code_convert_string_norecord (str, Qutf_16le, 0);
8423 }
8424
8425 Lisp_Object
8426 from_unicode_buffer (const wchar_t *wstr)
8427 {
8428     return from_unicode (
8429         make_unibyte_string (
8430             (char *) wstr,
8431             /* we get one of the two final 0 bytes for free. */
8432             1 + sizeof (wchar_t) * wcslen (wstr)));
8433 }
8434
8435 wchar_t *
8436 to_unicode (Lisp_Object str, Lisp_Object *buf)
8437 {
8438   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8439   /* We need to make another copy (in addition to the one made by
8440      code_convert_string_norecord) to ensure that the final string is
8441      _doubly_ zero terminated --- that is, that the string is
8442      terminated by two zero bytes and one utf-16le null character.
8443      Because strings are already terminated with a single zero byte,
8444      we just add one additional zero. */
8445   str = make_uninit_string (SBYTES (*buf) + 1);
8446   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8447   SDATA (str) [SBYTES (*buf)] = '\0';
8448   *buf = str;
8449   return WCSDATA (*buf);
8450 }
8451
8452 #endif /* WINDOWSNT || CYGWIN */
8453
8454 \f
8455 #ifdef emacs
8456 /*** 11. Emacs Lisp library functions ***/
8457
8458 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8459        doc: /* Return t if OBJECT is nil or a coding-system.
8460 See the documentation of `define-coding-system' for information
8461 about coding-system objects.  */)
8462   (Lisp_Object object)
8463 {
8464   if (NILP (object)
8465       || CODING_SYSTEM_ID (object) >= 0)
8466     return Qt;
8467   if (! SYMBOLP (object)
8468       || NILP (Fget (object, Qcoding_system_define_form)))
8469     return Qnil;
8470   return Qt;
8471 }
8472
8473 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8474        Sread_non_nil_coding_system, 1, 1, 0,
8475        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8476   (Lisp_Object prompt)
8477 {
8478   Lisp_Object val;
8479   do
8480     {
8481       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8482                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8483     }
8484   while (SCHARS (val) == 0);
8485   return (Fintern (val, Qnil));
8486 }
8487
8488 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8489        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8490 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8491 Ignores case when completing coding systems (all Emacs coding systems
8492 are lower-case).  */)
8493   (Lisp_Object prompt, Lisp_Object default_coding_system)
8494 {
8495   Lisp_Object val;
8496   ptrdiff_t count = SPECPDL_INDEX ();
8497
8498   if (SYMBOLP (default_coding_system))
8499     default_coding_system = SYMBOL_NAME (default_coding_system);
8500   specbind (Qcompletion_ignore_case, Qt);
8501   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8502                           Qt, Qnil, Qcoding_system_history,
8503                           default_coding_system, Qnil);
8504   unbind_to (count, Qnil);
8505   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8506 }
8507
8508 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8509        1, 1, 0,
8510        doc: /* Check validity of CODING-SYSTEM.
8511 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8512 It is valid if it is nil or a symbol defined as a coding system by the
8513 function `define-coding-system'.  */)
8514   (Lisp_Object coding_system)
8515 {
8516   Lisp_Object define_form;
8517
8518   define_form = Fget (coding_system, Qcoding_system_define_form);
8519   if (! NILP (define_form))
8520     {
8521       Fput (coding_system, Qcoding_system_define_form, Qnil);
8522       safe_eval (define_form);
8523     }
8524   if (!NILP (Fcoding_system_p (coding_system)))
8525     return coding_system;
8526   xsignal1 (Qcoding_system_error, coding_system);
8527 }
8528
8529 \f
8530 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8531    HIGHEST, return the coding system of the highest
8532    priority among the detected coding systems.  Otherwise return a
8533    list of detected coding systems sorted by their priorities.  If
8534    MULTIBYTEP, it is assumed that the bytes are in correct
8535    multibyte form but contain only ASCII and eight-bit chars.
8536    Otherwise, the bytes are raw bytes.
8537
8538    CODING-SYSTEM controls the detection as below:
8539
8540    If it is nil, detect both text-format and eol-format.  If the
8541    text-format part of CODING-SYSTEM is already specified
8542    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8543    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8544    detect only text-format.

        SRC_CHARS is recorded in the detection context as the number of
        characters in the source.  With HIGHEST, Qnil is returned when no
        candidate was produced.  */
8545
8546 Lisp_Object
8547 detect_coding_system (const unsigned char *src,
8548                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8549                       bool highest, bool multibytep,
8550                       Lisp_Object coding_system)
8551 {
8552   const unsigned char *src_end = src + src_bytes;
8553   Lisp_Object attrs, eol_type;
       /* VAL first collects coding system ids; the eol pass at the end
          replaces each id with a coding system symbol.  */
8554   Lisp_Object val = Qnil;
8555   struct coding_system coding;
8556   ptrdiff_t id;
8557   struct coding_detection_info detect_info;
8558   enum coding_category base_category;
8559   bool null_byte_found = 0, eight_bit_found = 0;
8560
       /* A nil CODING_SYSTEM is treated as `undecided'.  */
8561   if (NILP (coding_system))
8562     coding_system = Qundecided;
8563   setup_coding_system (coding_system, &coding);
8564   attrs = CODING_ID_ATTRS (coding.id);
8565   eol_type = CODING_ID_EOL_TYPE (coding.id);
8566   coding_system = CODING_ATTR_BASE_NAME (attrs);
8567
       /* Describe the whole input to the detectors as a single, final
          block.  */
8568   coding.source = src;
8569   coding.src_chars = src_chars;
8570   coding.src_bytes = src_bytes;
8571   coding.src_multibyte = multibytep;
8572   coding.consumed = 0;
8573   coding.mode |= CODING_MODE_LAST_BLOCK;
8574   coding.head_ascii = 0;
8575
8576   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8577
8578   /* At first, detect text-format if necessary.  */
8579   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8580   if (base_category == coding_category_undecided)
8581     {
8582       enum coding_category category IF_LINT (= 0);
8583       struct coding_system *this IF_LINT (= NULL);
8584       int c, i;
8585       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8586                                        inhibit_null_byte_detection);
8587       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8588                                        inhibit_iso_escape_detection);
8589       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8590
8591       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8592       for (; src < src_end; src++)
8593         {
8594           c = *src;
8595           if (c & 0x80)
8596             {
                   /* An 8-bit byte.  Once both an 8-bit byte and a null
                      byte have been seen, stop scanning.  */
8597               eight_bit_found = 1;
8598               if (null_byte_found)
8599                 break;
8600             }
8601           else if (c < 0x20)
8602             {
                   /* The first ESC, SI or SO seen while nothing has been
                      checked yet (and ISO escape detection is not
                      inhibited) hands the data to the ISO-2022 detector.  */
8603               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8604                   && ! inhibit_ied
8605                   && ! detect_info.checked)
8606                 {
8607                   if (detect_coding_iso_2022 (&coding, &detect_info))
8608                     {
8609                       /* We have scanned the whole data.  */
8610                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8611                         {
8612                           /* We didn't find an 8-bit code.  We may
8613                              have found a null-byte, but it's very
8614                              rare that a binary file conforms to
8615                              ISO-2022.  */
8616                           src = src_end;
8617                           coding.head_ascii = src - coding.source;
8618                         }
8619                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8620                       break;
8621                     }
8622                 }
8623               else if (! c && !inhibit_nbd)
8624                 {
8625                   null_byte_found = 1;
8626                   if (eight_bit_found)
8627                     break;
8628                 }
                   /* head_ascii counts the bytes that precede the first
                      8-bit byte.  */
8629               if (! eight_bit_found)
8630                 coding.head_ascii++;
8631             }
8632           else if (! eight_bit_found)
8633             coding.head_ascii++;
8634         }
8635
           /* Consult the per-category results only when the scan saw
              something other than a complete run of plain 7-bit bytes, or
              a detector has already reported a match.  */
8636       if (null_byte_found || eight_bit_found
8637           || coding.head_ascii < coding.src_bytes
8638           || detect_info.found)
8639         {
8640           if (coding.head_ascii == coding.src_bytes)
8641             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8642             for (i = 0; i < coding_category_raw_text; i++)
8643               {
8644                 category = coding_priorities[i];
8645                 this = coding_categories + category;
8646                 if (detect_info.found & (1 << category))
8647                   break;
8648               }
8649           else
8650             {
8651               if (null_byte_found)
8652                 {
                       /* A null byte leaves only the UTF-16 categories as
                          candidates: everything else is marked as checked
                          and rejected.  */
8653                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8654                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8655                 }
8656               else if (prefer_utf_8
8657                        && detect_coding_utf_8 (&coding, &detect_info))
8658                 {
                       /* UTF-8 is preferred and its detector returned
                          nonzero: exclude every other category.  */
8659                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8660                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8661                 }
                   /* Visit the categories in priority order, running the
                      detector of each one that has not been checked yet.
                      With HIGHEST, stop at the first category found.  */
8662               for (i = 0; i < coding_category_raw_text; i++)
8663                 {
8664                   category = coding_priorities[i];
8665                   this = coding_categories + category;
8666
8667                   if (this->id < 0)
8668                     {
8669                       /* No coding system of this category is defined.  */
8670                       detect_info.rejected |= (1 << category);
8671                     }
8672                   else if (category >= coding_category_raw_text)
8673                     continue;
8674                   else if (detect_info.checked & (1 << category))
8675                     {
8676                       if (highest
8677                           && (detect_info.found & (1 << category)))
8678                         break;
8679                     }
8680                   else if ((*(this->detector)) (&coding, &detect_info)
8681                            && highest
8682                            && (detect_info.found & (1 << category)))
8683                     {
                           /* Narrow the auto-detecting UTF-16 category to
                              the byte order that was actually found.  */
8684                       if (category == coding_category_utf_16_auto)
8685                         {
8686                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8687                             category = coding_category_utf_16_le;
8688                           else
8689                             category = coding_category_utf_16_be;
8690                         }
8691                       break;
8692                     }
8693                 }
8694             }
8695         }
8696
           /* Turn the detection masks into the list of candidate ids.  */
8697       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8698           || null_byte_found)
8699         {
               /* Every category was rejected, or a null byte was seen:
                  report `no-conversion'.  */
8700           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8701           id = CODING_SYSTEM_ID (Qno_conversion);
8702           val = list1 (make_number (id));
8703         }
8704       else if (! detect_info.rejected && ! detect_info.found)
8705         {
               /* Nothing was rejected and nothing was found: report the
                  coding system of the `undecided' category.  */
8706           detect_info.found = CATEGORY_MASK_ANY;
8707           id = coding_categories[coding_category_undecided].id;
8708           val = list1 (make_number (id));
8709         }
8710       else if (highest)
8711         {
8712           if (detect_info.found)
8713             {
                   /* CATEGORY and THIS are the values left by the loops
                      above.  */
8714               detect_info.found = 1 << category;
8715               val = list1 (make_number (this->id));
8716             }
8717           else
                 /* Nothing was positively found: take the first category,
                    in priority order, that was not rejected.  */
8718             for (i = 0; i < coding_category_raw_text; i++)
8719               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8720                 {
8721                   detect_info.found = 1 << coding_priorities[i];
8722                   id = coding_categories[coding_priorities[i]].id;
8723                   val = list1 (make_number (id));
8724                   break;
8725                 }
8726         }
8727       else
8728         {
8729           int mask = detect_info.rejected | detect_info.found;
8730           int found = 0;
8731
               /* Categories that were neither rejected nor found.
                  NOTE(review): each hit replaces VAL with a fresh
                  one-element list instead of consing onto it, so only the
                  last hit of this loop (the smallest index in
                  coding_priorities) stays in VAL, although all hits are
                  recorded in FOUND.  Confirm that this is intended.  */
8732           for (i = coding_category_raw_text - 1; i >= 0; i--)
8733             {
8734               category = coding_priorities[i];
8735               if (! (mask & (1 << category)))
8736                 {
8737                   found |= 1 << category;
8738                   id = coding_categories[category].id;
8739                   if (id >= 0)
8740                     val = list1 (make_number (id));
8741                 }
8742             }
               /* Then push the categories that were found, walking from
                  the end of coding_priorities to its start so that the
                  entry for index 0 ends up at the head of VAL.  */
8743           for (i = coding_category_raw_text - 1; i >= 0; i--)
8744             {
8745               category = coding_priorities[i];
8746               if (detect_info.found & (1 << category))
8747                 {
8748                   id = coding_categories[category].id;
8749                   val = Fcons (make_number (id), val);
8750                 }
8751             }
8752           detect_info.found |= found;
8753         }
8754     }
8755   else if (base_category == coding_category_utf_8_auto)
8756     {
           /* Only the presence of a signature has to be decided.  VAL
              stays nil if the UTF-8 detector returns zero.  */
8757       if (detect_coding_utf_8 (&coding, &detect_info))
8758         {
8759           struct coding_system *this;
8760
8761           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8762             this = coding_categories + coding_category_utf_8_sig;
8763           else
8764             this = coding_categories + coding_category_utf_8_nosig;
8765           val = list1 (make_number (this->id));
8766         }
8767     }
8768   else if (base_category == coding_category_utf_16_auto)
8769     {
           /* Pick the UTF-16 variant from what the detector found and
              rejected.  VAL stays nil if the detector returns zero.  */
8770       if (detect_coding_utf_16 (&coding, &detect_info))
8771         {
8772           struct coding_system *this;
8773
8774           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8775             this = coding_categories + coding_category_utf_16_le;
8776           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8777             this = coding_categories + coding_category_utf_16_be;
8778           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8779             this = coding_categories + coding_category_utf_16_be_nosig;
8780           else
8781             this = coding_categories + coding_category_utf_16_le_nosig;
8782           val = list1 (make_number (this->id));
8783         }
8784     }
8785   else
8786     {
           /* The text format is already specified: report the given
              coding system itself.  */
8787       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8788       val = list1 (make_number (coding.id));
8789     }
8790
8791   /* Then, detect eol-format if necessary.  */
8792   {
         /* -1 means that the eol format was not examined for that
            family of coding systems.  */
8793     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8794     Lisp_Object tail;
8795
8796     if (VECTORP (eol_type))
8797       {
             /* The requested coding system leaves the eol format open:
                examine the data, separately for non-UTF-16 text and for
                each UTF-16 byte order among the found categories.  */
8798         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8799           {
8800             if (null_byte_found)
8801               normal_eol = EOL_SEEN_LF;
8802             else
8803               normal_eol = detect_eol (coding.source, src_bytes,
8804                                        coding_category_raw_text);
8805           }
8806         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8807                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8808           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8809                                       coding_category_utf_16_be);
8810         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8811                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8812           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8813                                       coding_category_utf_16_le);
8814       }
8815     else
8816       {
             /* The eol format is already specified: use it for every
                candidate.  */
8817         if (EQ (eol_type, Qunix))
8818           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8819         else if (EQ (eol_type, Qdos))
8820           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8821         else
8822           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8823       }
8824
         /* Replace each id in VAL with a coding system symbol: the eol
            variant matching the detected eol format when the candidate
            has a vector of variants and the format is known, otherwise
            the candidate's own name.  */
8825     for (tail = val; CONSP (tail); tail = XCDR (tail))
8826       {
8827         enum coding_category category;
8828         int this_eol;
8829
8830         id = XINT (XCAR (tail));
8831         attrs = CODING_ID_ATTRS (id);
8832         category = XINT (CODING_ATTR_CATEGORY (attrs));
8833         eol_type = CODING_ID_EOL_TYPE (id);
8834         if (VECTORP (eol_type))
8835           {
8836             if (category == coding_category_utf_16_be
8837                 || category == coding_category_utf_16_be_nosig)
8838               this_eol = utf_16_be_eol;
8839             else if (category == coding_category_utf_16_le
8840                      || category == coding_category_utf_16_le_nosig)
8841               this_eol = utf_16_le_eol;
8842             else
8843               this_eol = normal_eol;
8844
8845             if (this_eol == EOL_SEEN_LF)
8846               XSETCAR (tail, AREF (eol_type, 0));
8847             else if (this_eol == EOL_SEEN_CRLF)
8848               XSETCAR (tail, AREF (eol_type, 1));
8849             else if (this_eol == EOL_SEEN_CR)
8850               XSETCAR (tail, AREF (eol_type, 2));
8851             else
8852               XSETCAR (tail, CODING_ID_NAME (id));
8853           }
8854         else
8855           XSETCAR (tail, CODING_ID_NAME (id));
8856       }
8857   }
8858
       /* With HIGHEST only the first candidate (or nil) is returned.  */
8859   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8860 }
8861
8862
8863 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8864        2, 3, 0,
8865        doc: /* Detect coding system of the text in the region between START and END.
8866 Return a list of possible coding systems ordered by priority.
8867 The coding systems to try and their priorities follows what
8868 the function `coding-system-priority-list' (which see) returns.
8869
8870 If only ASCII characters are found (except for such ISO-2022 control
8871 characters as ESC), it returns a list of single element `undecided'
8872 or its subsidiary coding system according to a detected end-of-line
8873 format.
8874
8875 If optional argument HIGHEST is non-nil, return the coding system of
8876 highest priority.  */)
8877   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8878 {
8879   ptrdiff_t from, to;
8880   ptrdiff_t from_byte, to_byte;
8881
8882   validate_region (&start, &end);
8883   from = XINT (start), to = XINT (end);
8884   from_byte = CHAR_TO_BYTE (from);
8885   to_byte = CHAR_TO_BYTE (to);
8886
8887   if (from < GPT && to >= GPT)
8888     move_gap_both (to, to_byte);
8889
8890   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8891                                to - from, to_byte - from_byte,
8892                                !NILP (highest),
8893                                !NILP (BVAR (current_buffer
8894                                       , enable_multibyte_characters)),
8895                                Qnil);
8896 }
8897
8898 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8899        1, 2, 0,
8900        doc: /* Detect coding system of the text in STRING.
8901 Return a list of possible coding systems ordered by priority.
8902 The coding systems to try and their priorities follows what
8903 the function `coding-system-priority-list' (which see) returns.
8904
8905 If only ASCII characters are found (except for such ISO-2022 control
8906 characters as ESC), it returns a list of single element `undecided'
8907 or its subsidiary coding system according to a detected end-of-line
8908 format.
8909
8910 If optional argument HIGHEST is non-nil, return the coding system of
8911 highest priority.  */)
8912   (Lisp_Object string, Lisp_Object highest)
8913 {
8914   CHECK_STRING (string);
8915
8916   return detect_coding_system (SDATA (string),
8917                                SCHARS (string), SBYTES (string),
8918                                !NILP (highest), STRING_MULTIBYTE (string),
8919                                Qnil);
8920 }
8921
8922
8923 static bool
8924 char_encodable_p (int c, Lisp_Object attrs)
8925 {
8926   Lisp_Object tail;
8927   struct charset *charset;
8928   Lisp_Object translation_table;
8929
8930   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8931   if (! NILP (translation_table))
8932     c = translate_char (translation_table, c);
8933   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8934        CONSP (tail); tail = XCDR (tail))
8935     {
8936       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8937       if (CHAR_CHARSET_P (c, charset))
8938         break;
8939     }
8940   return (! NILP (tail));
8941 }
8942
8943
8944 /* Return a list of coding systems that safely encode the text between
8945    START and END.  If EXCLUDE is non-nil, it is a list of coding
8946    systems not to check.  The returned list doesn't contain any such
8947    coding systems.  In any case, if the text contains only ASCII or is
8948    unibyte, return t.  */
8949
8950 DEFUN ("find-coding-systems-region-internal",
8951        Ffind_coding_systems_region_internal,
8952        Sfind_coding_systems_region_internal, 2, 3, 0,
8953        doc: /* Internal use only.  */)
8954   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8955 {
8956   Lisp_Object coding_attrs_list, safe_codings;
8957   ptrdiff_t start_byte, end_byte;
8958   const unsigned char *p, *pbeg, *pend;
8959   int c;
8960   Lisp_Object tail, elt, work_table;
8961
8962   if (STRINGP (start))
8963     {
8964       if (!STRING_MULTIBYTE (start)
8965           || SCHARS (start) == SBYTES (start))
8966         return Qt;
8967       start_byte = 0;
8968       end_byte = SBYTES (start);
8969     }
8970   else
8971     {
8972       CHECK_NUMBER_COERCE_MARKER (start);
8973       CHECK_NUMBER_COERCE_MARKER (end);
8974       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8975         args_out_of_range (start, end);
8976       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8977         return Qt;
8978       start_byte = CHAR_TO_BYTE (XINT (start));
8979       end_byte = CHAR_TO_BYTE (XINT (end));
8980       if (XINT (end) - XINT (start) == end_byte - start_byte)
8981         return Qt;
8982
8983       if (XINT (start) < GPT && XINT (end) > GPT)
8984         {
8985           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8986             move_gap_both (XINT (start), start_byte);
8987           else
8988             move_gap_both (XINT (end), end_byte);
8989         }
8990     }
8991
8992   coding_attrs_list = Qnil;
8993   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8994     if (NILP (exclude)
8995         || NILP (Fmemq (XCAR (tail), exclude)))
8996       {
8997         Lisp_Object attrs;
8998
8999         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9000         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9001           {
9002             ASET (attrs, coding_attr_trans_tbl,
9003                   get_translation_table (attrs, 1, NULL));
9004             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9005           }
9006       }
9007
9008   if (STRINGP (start))
9009     p = pbeg = SDATA (start);
9010   else
9011     p = pbeg = BYTE_POS_ADDR (start_byte);
9012   pend = p + (end_byte - start_byte);
9013
9014   while (p < pend && ASCII_CHAR_P (*p)) p++;
9015   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9016
9017   work_table = Fmake_char_table (Qnil, Qnil);
9018   while (p < pend)
9019     {
9020       if (ASCII_CHAR_P (*p))
9021         p++;
9022       else
9023         {
9024           c = STRING_CHAR_ADVANCE (p);
9025           if (!NILP (char_table_ref (work_table, c)))
9026             /* This character was already checked.  Ignore it.  */
9027             continue;
9028
9029           charset_map_loaded = 0;
9030           for (tail = coding_attrs_list; CONSP (tail);)
9031             {
9032               elt = XCAR (tail);
9033               if (NILP (elt))
9034                 tail = XCDR (tail);
9035               else if (char_encodable_p (c, elt))
9036                 tail = XCDR (tail);
9037               else if (CONSP (XCDR (tail)))
9038                 {
9039                   XSETCAR (tail, XCAR (XCDR (tail)));
9040                   XSETCDR (tail, XCDR (XCDR (tail)));
9041                 }
9042               else
9043                 {
9044                   XSETCAR (tail, Qnil);
9045                   tail = XCDR (tail);
9046                 }
9047             }
9048           if (charset_map_loaded)
9049             {
9050               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9051
9052               if (STRINGP (start))
9053                 pbeg = SDATA (start);
9054               else
9055                 pbeg = BYTE_POS_ADDR (start_byte);
9056               p = pbeg + p_offset;
9057               pend = pbeg + pend_offset;
9058             }
9059           char_table_set (work_table, c, Qt);
9060         }
9061     }
9062
9063   safe_codings = list2 (Qraw_text, Qno_conversion);
9064   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9065     if (! NILP (XCAR (tail)))
9066       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9067
9068   return safe_codings;
9069 }
9070
9071
9072 DEFUN ("unencodable-char-position", Funencodable_char_position,
9073        Sunencodable_char_position, 3, 5, 0,
9074        doc: /* Return position of first un-encodable character in a region.
9075 START and END specify the region and CODING-SYSTEM specifies the
9076 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9077
9078 If optional 4th argument COUNT is non-nil, it specifies at most how
9079 many un-encodable characters to search.  In this case, the value is a
9080 list of positions.
9081
9082 If optional 5th argument STRING is non-nil, it is a string to search
9083 for un-encodable characters.  In that case, START and END are indexes
9084 to the string and treated as in `substring'.  */)
9085   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9086    Lisp_Object count, Lisp_Object string)
9087 {
9088   EMACS_INT n;
9089   struct coding_system coding;
9090   Lisp_Object attrs, charset_list, translation_table;
9091   Lisp_Object positions;
9092   ptrdiff_t from, to;
9093   const unsigned char *p, *stop, *pend;
9094   bool ascii_compatible;
9095
9096   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9097   attrs = CODING_ID_ATTRS (coding.id);
9098   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9099     return Qnil;
9100   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9101   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9102   translation_table = get_translation_table (attrs, 1, NULL);
9103
9104   if (NILP (string))
9105     {
9106       validate_region (&start, &end);
9107       from = XINT (start);
9108       to = XINT (end);
9109       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9110           || (ascii_compatible
9111               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9112         return Qnil;
9113       p = CHAR_POS_ADDR (from);
9114       pend = CHAR_POS_ADDR (to);
9115       if (from < GPT && to >= GPT)
9116         stop = GPT_ADDR;
9117       else
9118         stop = pend;
9119     }
9120   else
9121     {
9122       CHECK_STRING (string);
9123       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9124       if (! STRING_MULTIBYTE (string))
9125         return Qnil;
9126       p = SDATA (string) + string_char_to_byte (string, from);
9127       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9128       if (ascii_compatible && (to - from) == (pend - p))
9129         return Qnil;
9130     }
9131
9132   if (NILP (count))
9133     n = 1;
9134   else
9135     {
9136       CHECK_NATNUM (count);
9137       n = XINT (count);
9138     }
9139
9140   positions = Qnil;
9141   charset_map_loaded = 0;
9142   while (1)
9143     {
9144       int c;
9145
9146       if (ascii_compatible)
9147         while (p < stop && ASCII_CHAR_P (*p))
9148           p++, from++;
9149       if (p >= stop)
9150         {
9151           if (p >= pend)
9152             break;
9153           stop = pend;
9154           p = GAP_END_ADDR;
9155         }
9156
9157       c = STRING_CHAR_ADVANCE (p);
9158       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9159           && ! char_charset (translate_char (translation_table, c),
9160                              charset_list, NULL))
9161         {
9162           positions = Fcons (make_number (from), positions);
9163           n--;
9164           if (n == 0)
9165             break;
9166         }
9167
9168       from++;
9169       if (charset_map_loaded && NILP (string))
9170         {
9171           p = CHAR_POS_ADDR (from);
9172           pend = CHAR_POS_ADDR (to);
9173           if (from < GPT && to >= GPT)
9174             stop = GPT_ADDR;
9175           else
9176             stop = pend;
9177           charset_map_loaded = 0;
9178         }
9179     }
9180
9181   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9182 }
9183
9184
9185 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9186        Scheck_coding_systems_region, 3, 3, 0,
9187        doc: /* Check if the region is encodable by coding systems.
9188
9189 START and END are buffer positions specifying the region.
9190 CODING-SYSTEM-LIST is a list of coding systems to check.
9191
9192 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9193 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9194 whole region, POS0, POS1, ... are buffer positions where non-encodable
9195 characters are found.
9196
9197 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9198 value is nil.
9199
9200 START may be a string.  In that case, check if the string is
9201 encodable, and the value contains indices to the string instead of
9202 buffer positions.  END is ignored.
9203
9204 If the current buffer (or START if it is a string) is unibyte, the value
9205 is nil.  */)
9206   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9207 {
9208   Lisp_Object list;
9209   ptrdiff_t start_byte, end_byte;
9210   ptrdiff_t pos;
9211   const unsigned char *p, *pbeg, *pend;
9212   int c;
9213   Lisp_Object tail, elt, attrs;
9214
9215   if (STRINGP (start))
9216     {
9217       if (!STRING_MULTIBYTE (start)
9218           || SCHARS (start) == SBYTES (start))
9219         return Qnil;
9220       start_byte = 0;
9221       end_byte = SBYTES (start);
9222       pos = 0;
9223     }
9224   else
9225     {
9226       CHECK_NUMBER_COERCE_MARKER (start);
9227       CHECK_NUMBER_COERCE_MARKER (end);
9228       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9229         args_out_of_range (start, end);
9230       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9231         return Qnil;
9232       start_byte = CHAR_TO_BYTE (XINT (start));
9233       end_byte = CHAR_TO_BYTE (XINT (end));
9234       if (XINT (end) - XINT (start) == end_byte - start_byte)
9235         return Qnil;
9236
9237       if (XINT (start) < GPT && XINT (end) > GPT)
9238         {
9239           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9240             move_gap_both (XINT (start), start_byte);
9241           else
9242             move_gap_both (XINT (end), end_byte);
9243         }
9244       pos = XINT (start);
9245     }
9246
9247   list = Qnil;
9248   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9249     {
9250       elt = XCAR (tail);
9251       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9252       ASET (attrs, coding_attr_trans_tbl,
9253             get_translation_table (attrs, 1, NULL));
9254       list = Fcons (list2 (elt, attrs), list);
9255     }
9256
9257   if (STRINGP (start))
9258     p = pbeg = SDATA (start);
9259   else
9260     p = pbeg = BYTE_POS_ADDR (start_byte);
9261   pend = p + (end_byte - start_byte);
9262
9263   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9264   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9265
9266   while (p < pend)
9267     {
9268       if (ASCII_CHAR_P (*p))
9269         p++;
9270       else
9271         {
9272           c = STRING_CHAR_ADVANCE (p);
9273
9274           charset_map_loaded = 0;
9275           for (tail = list; CONSP (tail); tail = XCDR (tail))
9276             {
9277               elt = XCDR (XCAR (tail));
9278               if (! char_encodable_p (c, XCAR (elt)))
9279                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9280             }
9281           if (charset_map_loaded)
9282             {
9283               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9284
9285               if (STRINGP (start))
9286                 pbeg = SDATA (start);
9287               else
9288                 pbeg = BYTE_POS_ADDR (start_byte);
9289               p = pbeg + p_offset;
9290               pend = pbeg + pend_offset;
9291             }
9292         }
9293       pos++;
9294     }
9295
9296   tail = list;
9297   list = Qnil;
9298   for (; CONSP (tail); tail = XCDR (tail))
9299     {
9300       elt = XCAR (tail);
9301       if (CONSP (XCDR (XCDR (elt))))
9302         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9303                       list);
9304     }
9305
9306   return list;
9307 }
9308
9309
9310 static Lisp_Object
9311 code_convert_region (Lisp_Object start, Lisp_Object end,
9312                      Lisp_Object coding_system, Lisp_Object dst_object,
9313                      bool encodep, bool norecord)
9314 {
9315   struct coding_system coding;
9316   ptrdiff_t from, from_byte, to, to_byte;
9317   Lisp_Object src_object;
9318
9319   if (NILP (coding_system))
9320     coding_system = Qno_conversion;
9321   else
9322     CHECK_CODING_SYSTEM (coding_system);
9323   src_object = Fcurrent_buffer ();
9324   if (NILP (dst_object))
9325     dst_object = src_object;
9326   else if (! EQ (dst_object, Qt))
9327     CHECK_BUFFER (dst_object);
9328
9329   validate_region (&start, &end);
9330   from = XFASTINT (start);
9331   from_byte = CHAR_TO_BYTE (from);
9332   to = XFASTINT (end);
9333   to_byte = CHAR_TO_BYTE (to);
9334
9335   setup_coding_system (coding_system, &coding);
9336   coding.mode |= CODING_MODE_LAST_BLOCK;
9337
9338   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9339     {
9340       struct buffer *buf = XBUFFER (dst_object);
9341       ptrdiff_t buf_pt = BUF_PT (buf);
9342
9343       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9344     }
9345
9346   if (encodep)
9347     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9348                           dst_object);
9349   else
9350     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9351                           dst_object);
9352   if (! norecord)
9353     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9354
9355   return (BUFFERP (dst_object)
9356           ? make_number (coding.produced_char)
9357           : coding.dst_object);
9358 }
9359
9360
9361 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9362        3, 4, "r\nzCoding system: ",
9363        doc: /* Decode the current region from the specified coding system.
9364 When called from a program, takes four arguments:
9365         START, END, CODING-SYSTEM, and DESTINATION.
9366 START and END are buffer positions.
9367
9368 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9369 If nil, the region between START and END is replaced by the decoded text.
9370 If buffer, the decoded text is inserted in that buffer after point (point
9371 does not move).
9372 In those cases, the length of the decoded text is returned.
9373 If DESTINATION is t, the decoded text is returned.
9374
9375 This function sets `last-coding-system-used' to the precise coding system
9376 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9377 not fully specified.)  */)
9378   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9379 {
9380   return code_convert_region (start, end, coding_system, destination, 0, 0);
9381 }
9382
9383 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9384        3, 4, "r\nzCoding system: ",
9385        doc: /* Encode the current region by specified coding system.
9386 When called from a program, takes four arguments:
9387         START, END, CODING-SYSTEM and DESTINATION.
9388 START and END are buffer positions.
9389
9390 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9391 If nil, the region between START and END is replace by the encoded text.
9392 If buffer, the encoded text is inserted in that buffer after point (point
9393 does not move).
9394 In those cases, the length of the encoded text is returned.
9395 If DESTINATION is t, the encoded text is returned.
9396
9397 This function sets `last-coding-system-used' to the precise coding system
9398 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9399 not fully specified.)  */)
9400   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9401 {
9402   return code_convert_region (start, end, coding_system, destination, 1, 0);
9403 }
9404
9405 Lisp_Object
9406 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9407                      Lisp_Object dst_object, bool encodep, bool nocopy,
9408                      bool norecord)
9409 {
9410   struct coding_system coding;
9411   ptrdiff_t chars, bytes;
9412
9413   CHECK_STRING (string);
9414   if (NILP (coding_system))
9415     {
9416       if (! norecord)
9417         Vlast_coding_system_used = Qno_conversion;
9418       if (NILP (dst_object))
9419         return (nocopy ? Fcopy_sequence (string) : string);
9420     }
9421
9422   if (NILP (coding_system))
9423     coding_system = Qno_conversion;
9424   else
9425     CHECK_CODING_SYSTEM (coding_system);
9426   if (NILP (dst_object))
9427     dst_object = Qt;
9428   else if (! EQ (dst_object, Qt))
9429     CHECK_BUFFER (dst_object);
9430
9431   setup_coding_system (coding_system, &coding);
9432   coding.mode |= CODING_MODE_LAST_BLOCK;
9433   chars = SCHARS (string);
9434   bytes = SBYTES (string);
9435
9436   if (BUFFERP (dst_object))
9437     {
9438       struct buffer *buf = XBUFFER (dst_object);
9439       ptrdiff_t buf_pt = BUF_PT (buf);
9440
9441       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9442     }
9443
9444   if (encodep)
9445     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9446   else
9447     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9448   if (! norecord)
9449     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9450
9451   return (BUFFERP (dst_object)
9452           ? make_number (coding.produced_char)
9453           : coding.dst_object);
9454 }
9455
9456
9457 /* Encode or decode STRING according to CODING_SYSTEM.
9458    Do not set Vlast_coding_system_used.
9459
9460    This function is called only from macros DECODE_FILE and
9461    ENCODE_FILE, thus we ignore character composition.  */
9462
9463 Lisp_Object
9464 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9465                               bool encodep)
9466 {
9467   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9468 }
9469
9470 /* Encode or decode a file name, to or from a unibyte string suitable
9471    for passing to C library functions.  */
9472 Lisp_Object
9473 decode_file_name (Lisp_Object fname)
9474 {
9475 #ifdef WINDOWSNT
9476   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9477      converts the file names either to UTF-16LE or to the system ANSI
9478      codepage internally, depending on the underlying OS; see w32.c.  */
9479   if (! NILP (Fcoding_system_p (Qutf_8)))
9480     return code_convert_string_norecord (fname, Qutf_8, 0);
9481   return fname;
9482 #else  /* !WINDOWSNT */
9483   if (! NILP (Vfile_name_coding_system))
9484     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9485   else if (! NILP (Vdefault_file_name_coding_system))
9486     return code_convert_string_norecord (fname,
9487                                          Vdefault_file_name_coding_system, 0);
9488   else
9489     return fname;
9490 #endif
9491 }
9492
9493 Lisp_Object
9494 encode_file_name (Lisp_Object fname)
9495 {
9496   /* This is especially important during bootstrap and dumping, when
9497      file-name encoding is not yet known, and therefore any non-ASCII
9498      file names are unibyte strings, and could only be thrashed if we
9499      try to encode them.  */
9500   if (!STRING_MULTIBYTE (fname))
9501     return fname;
9502 #ifdef WINDOWSNT
9503   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9504      converts the file names either to UTF-16LE or to the system ANSI
9505      codepage internally, depending on the underlying OS; see w32.c.  */
9506   if (! NILP (Fcoding_system_p (Qutf_8)))
9507     return code_convert_string_norecord (fname, Qutf_8, 1);
9508   return fname;
9509 #else  /* !WINDOWSNT */
9510   if (! NILP (Vfile_name_coding_system))
9511     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9512   else if (! NILP (Vdefault_file_name_coding_system))
9513     return code_convert_string_norecord (fname,
9514                                          Vdefault_file_name_coding_system, 1);
9515   else
9516     return fname;
9517 #endif
9518 }
9519
9520 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9521        2, 4, 0,
9522        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9523
9524 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9525 if the decoding operation is trivial.
9526
9527 Optional fourth arg BUFFER non-nil means that the decoded text is
9528 inserted in that buffer after point (point does not move).  In this
9529 case, the return value is the length of the decoded text.
9530
9531 This function sets `last-coding-system-used' to the precise coding system
9532 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9533 not fully specified.)  */)
9534   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9535 {
9536   return code_convert_string (string, coding_system, buffer,
9537                               0, ! NILP (nocopy), 0);
9538 }
9539
9540 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9541        2, 4, 0,
9542        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9543
9544 Optional third arg NOCOPY non-nil means it is OK to return STRING
9545 itself if the encoding operation is trivial.
9546
9547 Optional fourth arg BUFFER non-nil means that the encoded text is
9548 inserted in that buffer after point (point does not move).  In this
9549 case, the return value is the length of the encoded text.
9550
9551 This function sets `last-coding-system-used' to the precise coding system
9552 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9553 not fully specified.)  */)
9554   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9555 {
9556   return code_convert_string (string, coding_system, buffer,
9557                               1, ! NILP (nocopy), 0);
9558 }
9559
9560 \f
9561 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9562        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9563 Return the corresponding character.  */)
9564   (Lisp_Object code)
9565 {
9566   Lisp_Object spec, attrs, val;
9567   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9568   EMACS_INT ch;
9569   int c;
9570
9571   CHECK_NATNUM (code);
9572   ch = XFASTINT (code);
9573   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9574   attrs = AREF (spec, 0);
9575
9576   if (ASCII_CHAR_P (ch)
9577       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9578     return code;
9579
9580   val = CODING_ATTR_CHARSET_LIST (attrs);
9581   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9582   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9583   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9584
9585   if (ch <= 0x7F)
9586     {
9587       c = ch;
9588       charset = charset_roman;
9589     }
9590   else if (ch >= 0xA0 && ch < 0xDF)
9591     {
9592       c = ch - 0x80;
9593       charset = charset_kana;
9594     }
9595   else
9596     {
9597       EMACS_INT c1 = ch >> 8;
9598       int c2 = ch & 0xFF;
9599
9600       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9601           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9602         error ("Invalid code: %"pI"d", ch);
9603       c = ch;
9604       SJIS_TO_JIS (c);
9605       charset = charset_kanji;
9606     }
9607   c = DECODE_CHAR (charset, c);
9608   if (c < 0)
9609     error ("Invalid code: %"pI"d", ch);
9610   return make_number (c);
9611 }
9612
9613
9614 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9615        doc: /* Encode a Japanese character CH to shift_jis encoding.
9616 Return the corresponding code in SJIS.  */)
9617   (Lisp_Object ch)
9618 {
9619   Lisp_Object spec, attrs, charset_list;
9620   int c;
9621   struct charset *charset;
9622   unsigned code;
9623
9624   CHECK_CHARACTER (ch);
9625   c = XFASTINT (ch);
9626   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9627   attrs = AREF (spec, 0);
9628
9629   if (ASCII_CHAR_P (c)
9630       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9631     return ch;
9632
9633   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9634   charset = char_charset (c, charset_list, &code);
9635   if (code == CHARSET_INVALID_CODE (charset))
9636     error ("Can't encode by shift_jis encoding: %c", c);
9637   JIS_TO_SJIS (code);
9638
9639   return make_number (code);
9640 }
9641
9642 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9643        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9644 Return the corresponding character.  */)
9645   (Lisp_Object code)
9646 {
9647   Lisp_Object spec, attrs, val;
9648   struct charset *charset_roman, *charset_big5, *charset;
9649   EMACS_INT ch;
9650   int c;
9651
9652   CHECK_NATNUM (code);
9653   ch = XFASTINT (code);
9654   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9655   attrs = AREF (spec, 0);
9656
9657   if (ASCII_CHAR_P (ch)
9658       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9659     return code;
9660
9661   val = CODING_ATTR_CHARSET_LIST (attrs);
9662   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9663   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9664
9665   if (ch <= 0x7F)
9666     {
9667       c = ch;
9668       charset = charset_roman;
9669     }
9670   else
9671     {
9672       EMACS_INT b1 = ch >> 8;
9673       int b2 = ch & 0x7F;
9674       if (b1 < 0xA1 || b1 > 0xFE
9675           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9676         error ("Invalid code: %"pI"d", ch);
9677       c = ch;
9678       charset = charset_big5;
9679     }
9680   c = DECODE_CHAR (charset, c);
9681   if (c < 0)
9682     error ("Invalid code: %"pI"d", ch);
9683   return make_number (c);
9684 }
9685
9686 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9687        doc: /* Encode the Big5 character CH to BIG5 coding system.
9688 Return the corresponding character code in Big5.  */)
9689   (Lisp_Object ch)
9690 {
9691   Lisp_Object spec, attrs, charset_list;
9692   struct charset *charset;
9693   int c;
9694   unsigned code;
9695
9696   CHECK_CHARACTER (ch);
9697   c = XFASTINT (ch);
9698   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9699   attrs = AREF (spec, 0);
9700   if (ASCII_CHAR_P (c)
9701       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9702     return ch;
9703
9704   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9705   charset = char_charset (c, charset_list, &code);
9706   if (code == CHARSET_INVALID_CODE (charset))
9707     error ("Can't encode by Big5 encoding: %c", c);
9708
9709   return make_number (code);
9710 }
9711
9712 \f
9713 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9714        Sset_terminal_coding_system_internal, 1, 2, 0,
9715        doc: /* Internal use only.  */)
9716   (Lisp_Object coding_system, Lisp_Object terminal)
9717 {
9718   struct terminal *term = decode_live_terminal (terminal);
9719   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9720   CHECK_SYMBOL (coding_system);
9721   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9722   /* We had better not send unsafe characters to terminal.  */
9723   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9724   /* Character composition should be disabled.  */
9725   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9726   terminal_coding->src_multibyte = 1;
9727   terminal_coding->dst_multibyte = 0;
9728   tset_charset_list
9729     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9730             ? coding_charset_list (terminal_coding)
9731             : list1 (make_number (charset_ascii))));
9732   return Qnil;
9733 }
9734
9735 DEFUN ("set-safe-terminal-coding-system-internal",
9736        Fset_safe_terminal_coding_system_internal,
9737        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9738        doc: /* Internal use only.  */)
9739   (Lisp_Object coding_system)
9740 {
9741   CHECK_SYMBOL (coding_system);
9742   setup_coding_system (Fcheck_coding_system (coding_system),
9743                        &safe_terminal_coding);
9744   /* Character composition should be disabled.  */
9745   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9746   safe_terminal_coding.src_multibyte = 1;
9747   safe_terminal_coding.dst_multibyte = 0;
9748   return Qnil;
9749 }
9750
9751 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9752        Sterminal_coding_system, 0, 1, 0,
9753        doc: /* Return coding system specified for terminal output on the given terminal.
9754 TERMINAL may be a terminal object, a frame, or nil for the selected
9755 frame's terminal device.  */)
9756   (Lisp_Object terminal)
9757 {
9758   struct coding_system *terminal_coding
9759     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9760   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9761
9762   /* For backward compatibility, return nil if it is `undecided'.  */
9763   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9764 }
9765
9766 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9767        Sset_keyboard_coding_system_internal, 1, 2, 0,
9768        doc: /* Internal use only.  */)
9769   (Lisp_Object coding_system, Lisp_Object terminal)
9770 {
9771   struct terminal *t = decode_live_terminal (terminal);
9772   CHECK_SYMBOL (coding_system);
9773   if (NILP (coding_system))
9774     coding_system = Qno_conversion;
9775   else
9776     Fcheck_coding_system (coding_system);
9777   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9778   /* Character composition should be disabled.  */
9779   TERMINAL_KEYBOARD_CODING (t)->common_flags
9780     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9781   return Qnil;
9782 }
9783
9784 DEFUN ("keyboard-coding-system",
9785        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9786        doc: /* Return coding system specified for decoding keyboard input.  */)
9787   (Lisp_Object terminal)
9788 {
9789   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9790                          (decode_live_terminal (terminal))->id);
9791 }
9792
9793 \f
9794 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9795        Sfind_operation_coding_system,  1, MANY, 0,
9796        doc: /* Choose a coding system for an operation based on the target name.
9797 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9798 DECODING-SYSTEM is the coding system to use for decoding
9799 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9800 for encoding (in case OPERATION does encoding).
9801
9802 The first argument OPERATION specifies an I/O primitive:
9803   For file I/O, `insert-file-contents' or `write-region'.
9804   For process I/O, `call-process', `call-process-region', or `start-process'.
9805   For network I/O, `open-network-stream'.
9806
9807 The remaining arguments should be the same arguments that were passed
9808 to the primitive.  Depending on which primitive, one of those arguments
9809 is selected as the TARGET.  For example, if OPERATION does file I/O,
9810 whichever argument specifies the file name is TARGET.
9811
9812 TARGET has a meaning which depends on OPERATION:
9813   For file I/O, TARGET is a file name (except for the special case below).
9814   For process I/O, TARGET is a process name.
9815   For network I/O, TARGET is a service name or a port number.
9816
9817 This function looks up what is specified for TARGET in
9818 `file-coding-system-alist', `process-coding-system-alist',
9819 or `network-coding-system-alist' depending on OPERATION.
9820 They may specify a coding system, a cons of coding systems,
9821 or a function symbol to call.
9822 In the last case, we call the function with one argument,
9823 which is a list of all the arguments given to this function.
9824 If the function can't decide a coding system, it can return
9825 `undecided' so that the normal code-detection is performed.
9826
9827 If OPERATION is `insert-file-contents', the argument corresponding to
9828 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9829 file name to look up, and BUFFER is a buffer that contains the file's
9830 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9831 function to call for FILENAME, that function should examine the
9832 contents of BUFFER instead of reading the file.
9833
9834 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9835   (ptrdiff_t nargs, Lisp_Object *args)
9836 {
9837   Lisp_Object operation, target_idx, target, val;
9838   register Lisp_Object chain;
9839
9840   if (nargs < 2)
9841     error ("Too few arguments");
9842   operation = args[0];
9843   if (!SYMBOLP (operation)
9844       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9845     error ("Invalid first argument");
9846   if (nargs <= 1 + XFASTINT (target_idx))
9847     error ("Too few arguments for operation `%s'",
9848            SDATA (SYMBOL_NAME (operation)));
9849   target = args[XFASTINT (target_idx) + 1];
9850   if (!(STRINGP (target)
9851         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9852             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9853         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9854     error ("Invalid argument %"pI"d of operation `%s'",
9855            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9856   if (CONSP (target))
9857     target = XCAR (target);
9858
9859   chain = ((EQ (operation, Qinsert_file_contents)
9860             || EQ (operation, Qwrite_region))
9861            ? Vfile_coding_system_alist
9862            : (EQ (operation, Qopen_network_stream)
9863               ? Vnetwork_coding_system_alist
9864               : Vprocess_coding_system_alist));
9865   if (NILP (chain))
9866     return Qnil;
9867
9868   for (; CONSP (chain); chain = XCDR (chain))
9869     {
9870       Lisp_Object elt;
9871
9872       elt = XCAR (chain);
9873       if (CONSP (elt)
9874           && ((STRINGP (target)
9875                && STRINGP (XCAR (elt))
9876                && fast_string_match (XCAR (elt), target) >= 0)
9877               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9878         {
9879           val = XCDR (elt);
9880           /* Here, if VAL is both a valid coding system and a valid
9881              function symbol, we return VAL as a coding system.  */
9882           if (CONSP (val))
9883             return val;
9884           if (! SYMBOLP (val))
9885             return Qnil;
9886           if (! NILP (Fcoding_system_p (val)))
9887             return Fcons (val, val);
9888           if (! NILP (Ffboundp (val)))
9889             {
9890               /* We use call1 rather than safe_call1
9891                  so as to get bug reports about functions called here
9892                  which don't handle the current interface.  */
9893               val = call1 (val, Flist (nargs, args));
9894               if (CONSP (val))
9895                 return val;
9896               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9897                 return Fcons (val, val);
9898             }
9899           return Qnil;
9900         }
9901     }
9902   return Qnil;
9903 }
9904
9905 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9906        Sset_coding_system_priority, 0, MANY, 0,
9907        doc: /* Assign higher priority to the coding systems given as arguments.
9908 If multiple coding systems belong to the same category,
9909 all but the first one are ignored.
9910
9911 usage: (set-coding-system-priority &rest coding-systems)  */)
9912   (ptrdiff_t nargs, Lisp_Object *args)
9913 {
9914   ptrdiff_t i, j;
9915   bool changed[coding_category_max];
9916   enum coding_category priorities[coding_category_max];
9917
9918   memset (changed, 0, sizeof changed);
9919
9920   for (i = j = 0; i < nargs; i++)
9921     {
9922       enum coding_category category;
9923       Lisp_Object spec, attrs;
9924
9925       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9926       attrs = AREF (spec, 0);
9927       category = XINT (CODING_ATTR_CATEGORY (attrs));
9928       if (changed[category])
9929         /* Ignore this coding system because a coding system of the
9930            same category already had a higher priority.  */
9931         continue;
9932       changed[category] = 1;
9933       priorities[j++] = category;
9934       if (coding_categories[category].id >= 0
9935           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9936         setup_coding_system (args[i], &coding_categories[category]);
9937       Fset (AREF (Vcoding_category_table, category), args[i]);
9938     }
9939
9940   /* Now we have decided top J priorities.  Reflect the order of the
9941      original priorities to the remaining priorities.  */
9942
9943   for (i = j, j = 0; i < coding_category_max; i++, j++)
9944     {
9945       while (j < coding_category_max
9946              && changed[coding_priorities[j]])
9947         j++;
9948       if (j == coding_category_max)
9949         emacs_abort ();
9950       priorities[i] = coding_priorities[j];
9951     }
9952
9953   memcpy (coding_priorities, priorities, sizeof priorities);
9954
9955   /* Update `coding-category-list'.  */
9956   Vcoding_category_list = Qnil;
9957   for (i = coding_category_max; i-- > 0; )
9958     Vcoding_category_list
9959       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9960                Vcoding_category_list);
9961
9962   return Qnil;
9963 }
9964
9965 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9966        Scoding_system_priority_list, 0, 1, 0,
9967        doc: /* Return a list of coding systems ordered by their priorities.
9968 The list contains a subset of coding systems; i.e. coding systems
9969 assigned to each coding category (see `coding-category-list').
9970
9971 HIGHESTP non-nil means just return the highest priority one.  */)
9972   (Lisp_Object highestp)
9973 {
9974   int i;
9975   Lisp_Object val;
9976
9977   for (i = 0, val = Qnil; i < coding_category_max; i++)
9978     {
9979       enum coding_category category = coding_priorities[i];
9980       int id = coding_categories[category].id;
9981       Lisp_Object attrs;
9982
9983       if (id < 0)
9984         continue;
9985       attrs = CODING_ID_ATTRS (id);
9986       if (! NILP (highestp))
9987         return CODING_ATTR_BASE_NAME (attrs);
9988       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9989     }
9990   return Fnreverse (val);
9991 }
9992
9993 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9994
9995 static Lisp_Object
9996 make_subsidiaries (Lisp_Object base)
9997 {
9998   Lisp_Object subsidiaries;
9999   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10000   USE_SAFE_ALLOCA;
10001   char *buf = SAFE_ALLOCA (base_name_len + 6);
10002   int i;
10003
10004   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10005   subsidiaries = make_uninit_vector (3);
10006   for (i = 0; i < 3; i++)
10007     {
10008       strcpy (buf + base_name_len, suffixes[i]);
10009       ASET (subsidiaries, i, intern (buf));
10010     }
10011   SAFE_FREE ();
10012   return subsidiaries;
10013 }
10014
10015
10016 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10017        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10018        doc: /* For internal use only.
10019 usage: (define-coding-system-internal ...)  */)
10020   (ptrdiff_t nargs, Lisp_Object *args)
10021 {
10022   Lisp_Object name;
10023   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10024   Lisp_Object attrs;            /* Vector of attributes.  */
10025   Lisp_Object eol_type;
10026   Lisp_Object aliases;
10027   Lisp_Object coding_type, charset_list, safe_charsets;
10028   enum coding_category category;
10029   Lisp_Object tail, val;
10030   int max_charset_id = 0;
10031   int i;
10032
10033   if (nargs < coding_arg_max)
10034     goto short_args;
10035
10036   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10037
10038   name = args[coding_arg_name];
10039   CHECK_SYMBOL (name);
10040   ASET (attrs, coding_attr_base_name, name);
10041
10042   val = args[coding_arg_mnemonic];
10043   if (! STRINGP (val))
10044     CHECK_CHARACTER (val);
10045   ASET (attrs, coding_attr_mnemonic, val);
10046
10047   coding_type = args[coding_arg_coding_type];
10048   CHECK_SYMBOL (coding_type);
10049   ASET (attrs, coding_attr_type, coding_type);
10050
10051   charset_list = args[coding_arg_charset_list];
10052   if (SYMBOLP (charset_list))
10053     {
10054       if (EQ (charset_list, Qiso_2022))
10055         {
10056           if (! EQ (coding_type, Qiso_2022))
10057             error ("Invalid charset-list");
10058           charset_list = Viso_2022_charset_list;
10059         }
10060       else if (EQ (charset_list, Qemacs_mule))
10061         {
10062           if (! EQ (coding_type, Qemacs_mule))
10063             error ("Invalid charset-list");
10064           charset_list = Vemacs_mule_charset_list;
10065         }
10066       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10067         {
10068           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10069             error ("Invalid charset-list");
10070           if (max_charset_id < XFASTINT (XCAR (tail)))
10071             max_charset_id = XFASTINT (XCAR (tail));
10072         }
10073     }
10074   else
10075     {
10076       charset_list = Fcopy_sequence (charset_list);
10077       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10078         {
10079           struct charset *charset;
10080
10081           val = XCAR (tail);
10082           CHECK_CHARSET_GET_CHARSET (val, charset);
10083           if (EQ (coding_type, Qiso_2022)
10084               ? CHARSET_ISO_FINAL (charset) < 0
10085               : EQ (coding_type, Qemacs_mule)
10086               ? CHARSET_EMACS_MULE_ID (charset) < 0
10087               : 0)
10088             error ("Can't handle charset `%s'",
10089                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10090
10091           XSETCAR (tail, make_number (charset->id));
10092           if (max_charset_id < charset->id)
10093             max_charset_id = charset->id;
10094         }
10095     }
10096   ASET (attrs, coding_attr_charset_list, charset_list);
10097
10098   safe_charsets = make_uninit_string (max_charset_id + 1);
10099   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10100   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10101     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10102   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10103
10104   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10105
10106   val = args[coding_arg_decode_translation_table];
10107   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10108     CHECK_SYMBOL (val);
10109   ASET (attrs, coding_attr_decode_tbl, val);
10110
10111   val = args[coding_arg_encode_translation_table];
10112   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10113     CHECK_SYMBOL (val);
10114   ASET (attrs, coding_attr_encode_tbl, val);
10115
10116   val = args[coding_arg_post_read_conversion];
10117   CHECK_SYMBOL (val);
10118   ASET (attrs, coding_attr_post_read, val);
10119
10120   val = args[coding_arg_pre_write_conversion];
10121   CHECK_SYMBOL (val);
10122   ASET (attrs, coding_attr_pre_write, val);
10123
10124   val = args[coding_arg_default_char];
10125   if (NILP (val))
10126     ASET (attrs, coding_attr_default_char, make_number (' '));
10127   else
10128     {
10129       CHECK_CHARACTER (val);
10130       ASET (attrs, coding_attr_default_char, val);
10131     }
10132
10133   val = args[coding_arg_for_unibyte];
10134   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10135
10136   val = args[coding_arg_plist];
10137   CHECK_LIST (val);
10138   ASET (attrs, coding_attr_plist, val);
10139
10140   if (EQ (coding_type, Qcharset))
10141     {
10142       /* Generate a lisp vector of 256 elements.  Each element is nil,
10143          integer, or a list of charset IDs.
10144
10145          If Nth element is nil, the byte code N is invalid in this
10146          coding system.
10147
10148          If Nth element is a number NUM, N is the first byte of a
10149          charset whose ID is NUM.
10150
10151          If Nth element is a list of charset IDs, N is the first byte
10152          of one of them.  The list is sorted by dimensions of the
10153          charsets.  A charset of smaller dimension comes first. */
10154       val = Fmake_vector (make_number (256), Qnil);
10155
10156       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10157         {
10158           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10159           int dim = CHARSET_DIMENSION (charset);
10160           int idx = (dim - 1) * 4;
10161
10162           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10163             ASET (attrs, coding_attr_ascii_compat, Qt);
10164
10165           for (i = charset->code_space[idx];
10166                i <= charset->code_space[idx + 1]; i++)
10167             {
10168               Lisp_Object tmp, tmp2;
10169               int dim2;
10170
10171               tmp = AREF (val, i);
10172               if (NILP (tmp))
10173                 tmp = XCAR (tail);
10174               else if (NUMBERP (tmp))
10175                 {
10176                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10177                   if (dim < dim2)
10178                     tmp = list2 (XCAR (tail), tmp);
10179                   else
10180                     tmp = list2 (tmp, XCAR (tail));
10181                 }
10182               else
10183                 {
10184                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10185                     {
10186                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10187                       if (dim < dim2)
10188                         break;
10189                     }
10190                   if (NILP (tmp2))
10191                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10192                   else
10193                     {
10194                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10195                       XSETCAR (tmp2, XCAR (tail));
10196                     }
10197                 }
10198               ASET (val, i, tmp);
10199             }
10200         }
10201       ASET (attrs, coding_attr_charset_valids, val);
10202       category = coding_category_charset;
10203     }
10204   else if (EQ (coding_type, Qccl))
10205     {
10206       Lisp_Object valids;
10207
10208       if (nargs < coding_arg_ccl_max)
10209         goto short_args;
10210
10211       val = args[coding_arg_ccl_decoder];
10212       CHECK_CCL_PROGRAM (val);
10213       if (VECTORP (val))
10214         val = Fcopy_sequence (val);
10215       ASET (attrs, coding_attr_ccl_decoder, val);
10216
10217       val = args[coding_arg_ccl_encoder];
10218       CHECK_CCL_PROGRAM (val);
10219       if (VECTORP (val))
10220         val = Fcopy_sequence (val);
10221       ASET (attrs, coding_attr_ccl_encoder, val);
10222
10223       val = args[coding_arg_ccl_valids];
10224       valids = Fmake_string (make_number (256), make_number (0));
10225       for (tail = val; CONSP (tail); tail = XCDR (tail))
10226         {
10227           int from, to;
10228
10229           val = XCAR (tail);
10230           if (INTEGERP (val))
10231             {
10232               if (! (0 <= XINT (val) && XINT (val) <= 255))
10233                 args_out_of_range_3 (val, make_number (0), make_number (255));
10234               from = to = XINT (val);
10235             }
10236           else
10237             {
10238               CHECK_CONS (val);
10239               CHECK_NATNUM_CAR (val);
10240               CHECK_NUMBER_CDR (val);
10241               if (XINT (XCAR (val)) > 255)
10242                 args_out_of_range_3 (XCAR (val),
10243                                      make_number (0), make_number (255));
10244               from = XINT (XCAR (val));
10245               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10246                 args_out_of_range_3 (XCDR (val),
10247                                      XCAR (val), make_number (255));
10248               to = XINT (XCDR (val));
10249             }
10250           for (i = from; i <= to; i++)
10251             SSET (valids, i, 1);
10252         }
10253       ASET (attrs, coding_attr_ccl_valids, valids);
10254
10255       category = coding_category_ccl;
10256     }
10257   else if (EQ (coding_type, Qutf_16))
10258     {
10259       Lisp_Object bom, endian;
10260
10261       ASET (attrs, coding_attr_ascii_compat, Qnil);
10262
10263       if (nargs < coding_arg_utf16_max)
10264         goto short_args;
10265
10266       bom = args[coding_arg_utf16_bom];
10267       if (! NILP (bom) && ! EQ (bom, Qt))
10268         {
10269           CHECK_CONS (bom);
10270           val = XCAR (bom);
10271           CHECK_CODING_SYSTEM (val);
10272           val = XCDR (bom);
10273           CHECK_CODING_SYSTEM (val);
10274         }
10275       ASET (attrs, coding_attr_utf_bom, bom);
10276
10277       endian = args[coding_arg_utf16_endian];
10278       CHECK_SYMBOL (endian);
10279       if (NILP (endian))
10280         endian = Qbig;
10281       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10282         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10283       ASET (attrs, coding_attr_utf_16_endian, endian);
10284
10285       category = (CONSP (bom)
10286                   ? coding_category_utf_16_auto
10287                   : NILP (bom)
10288                   ? (EQ (endian, Qbig)
10289                      ? coding_category_utf_16_be_nosig
10290                      : coding_category_utf_16_le_nosig)
10291                   : (EQ (endian, Qbig)
10292                      ? coding_category_utf_16_be
10293                      : coding_category_utf_16_le));
10294     }
10295   else if (EQ (coding_type, Qiso_2022))
10296     {
10297       Lisp_Object initial, reg_usage, request, flags;
10298
10299       if (nargs < coding_arg_iso2022_max)
10300         goto short_args;
10301
10302       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10303       CHECK_VECTOR (initial);
10304       for (i = 0; i < 4; i++)
10305         {
10306           val = AREF (initial, i);
10307           if (! NILP (val))
10308             {
10309               struct charset *charset;
10310
10311               CHECK_CHARSET_GET_CHARSET (val, charset);
10312               ASET (initial, i, make_number (CHARSET_ID (charset)));
10313               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10314                 ASET (attrs, coding_attr_ascii_compat, Qt);
10315             }
10316           else
10317             ASET (initial, i, make_number (-1));
10318         }
10319
10320       reg_usage = args[coding_arg_iso2022_reg_usage];
10321       CHECK_CONS (reg_usage);
10322       CHECK_NUMBER_CAR (reg_usage);
10323       CHECK_NUMBER_CDR (reg_usage);
10324
10325       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10326       for (tail = request; CONSP (tail); tail = XCDR (tail))
10327         {
10328           int id;
10329           Lisp_Object tmp1;
10330
10331           val = XCAR (tail);
10332           CHECK_CONS (val);
10333           tmp1 = XCAR (val);
10334           CHECK_CHARSET_GET_ID (tmp1, id);
10335           CHECK_NATNUM_CDR (val);
10336           if (XINT (XCDR (val)) >= 4)
10337             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10338           XSETCAR (val, make_number (id));
10339         }
10340
10341       flags = args[coding_arg_iso2022_flags];
10342       CHECK_NATNUM (flags);
10343       i = XINT (flags) & INT_MAX;
10344       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10345         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10346       flags = make_number (i);
10347
10348       ASET (attrs, coding_attr_iso_initial, initial);
10349       ASET (attrs, coding_attr_iso_usage, reg_usage);
10350       ASET (attrs, coding_attr_iso_request, request);
10351       ASET (attrs, coding_attr_iso_flags, flags);
10352       setup_iso_safe_charsets (attrs);
10353
10354       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10355         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10356                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10357                     ? coding_category_iso_7_else
10358                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10359                     ? coding_category_iso_7
10360                     : coding_category_iso_7_tight);
10361       else
10362         {
10363           int id = XINT (AREF (initial, 1));
10364
10365           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10366                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10367                        || id < 0)
10368                       ? coding_category_iso_8_else
10369                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10370                       ? coding_category_iso_8_1
10371                       : coding_category_iso_8_2);
10372         }
10373       if (category != coding_category_iso_8_1
10374           && category != coding_category_iso_8_2)
10375         ASET (attrs, coding_attr_ascii_compat, Qnil);
10376     }
10377   else if (EQ (coding_type, Qemacs_mule))
10378     {
10379       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10380         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10381       ASET (attrs, coding_attr_ascii_compat, Qt);
10382       category = coding_category_emacs_mule;
10383     }
10384   else if (EQ (coding_type, Qshift_jis))
10385     {
10386
10387       struct charset *charset;
10388
10389       if (XINT (Flength (charset_list)) != 3
10390           && XINT (Flength (charset_list)) != 4)
10391         error ("There should be three or four charsets");
10392
10393       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10394       if (CHARSET_DIMENSION (charset) != 1)
10395         error ("Dimension of charset %s is not one",
10396                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10397       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10398         ASET (attrs, coding_attr_ascii_compat, Qt);
10399
10400       charset_list = XCDR (charset_list);
10401       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10402       if (CHARSET_DIMENSION (charset) != 1)
10403         error ("Dimension of charset %s is not one",
10404                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10405
10406       charset_list = XCDR (charset_list);
10407       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10408       if (CHARSET_DIMENSION (charset) != 2)
10409         error ("Dimension of charset %s is not two",
10410                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10411
10412       charset_list = XCDR (charset_list);
10413       if (! NILP (charset_list))
10414         {
10415           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10416           if (CHARSET_DIMENSION (charset) != 2)
10417             error ("Dimension of charset %s is not two",
10418                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10419         }
10420
10421       category = coding_category_sjis;
10422       Vsjis_coding_system = name;
10423     }
10424   else if (EQ (coding_type, Qbig5))
10425     {
10426       struct charset *charset;
10427
10428       if (XINT (Flength (charset_list)) != 2)
10429         error ("There should be just two charsets");
10430
10431       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10432       if (CHARSET_DIMENSION (charset) != 1)
10433         error ("Dimension of charset %s is not one",
10434                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10435       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10436         ASET (attrs, coding_attr_ascii_compat, Qt);
10437
10438       charset_list = XCDR (charset_list);
10439       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10440       if (CHARSET_DIMENSION (charset) != 2)
10441         error ("Dimension of charset %s is not two",
10442                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10443
10444       category = coding_category_big5;
10445       Vbig5_coding_system = name;
10446     }
10447   else if (EQ (coding_type, Qraw_text))
10448     {
10449       category = coding_category_raw_text;
10450       ASET (attrs, coding_attr_ascii_compat, Qt);
10451     }
10452   else if (EQ (coding_type, Qutf_8))
10453     {
10454       Lisp_Object bom;
10455
10456       if (nargs < coding_arg_utf8_max)
10457         goto short_args;
10458
10459       bom = args[coding_arg_utf8_bom];
10460       if (! NILP (bom) && ! EQ (bom, Qt))
10461         {
10462           CHECK_CONS (bom);
10463           val = XCAR (bom);
10464           CHECK_CODING_SYSTEM (val);
10465           val = XCDR (bom);
10466           CHECK_CODING_SYSTEM (val);
10467         }
10468       ASET (attrs, coding_attr_utf_bom, bom);
10469       if (NILP (bom))
10470         ASET (attrs, coding_attr_ascii_compat, Qt);
10471
10472       category = (CONSP (bom) ? coding_category_utf_8_auto
10473                   : NILP (bom) ? coding_category_utf_8_nosig
10474                   : coding_category_utf_8_sig);
10475     }
10476   else if (EQ (coding_type, Qundecided))
10477     {
10478       if (nargs < coding_arg_undecided_max)
10479         goto short_args;
10480       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10481             args[coding_arg_undecided_inhibit_null_byte_detection]);
10482       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10483             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10484       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10485             args[coding_arg_undecided_prefer_utf_8]);
10486       category = coding_category_undecided;
10487     }
10488   else
10489     error ("Invalid coding system type: %s",
10490            SDATA (SYMBOL_NAME (coding_type)));
10491
10492   ASET (attrs, coding_attr_category, make_number (category));
10493   ASET (attrs, coding_attr_plist,
10494         Fcons (QCcategory,
10495                Fcons (AREF (Vcoding_category_table, category),
10496                       CODING_ATTR_PLIST (attrs))));
10497   ASET (attrs, coding_attr_plist,
10498         Fcons (QCascii_compatible_p,
10499                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10500                       CODING_ATTR_PLIST (attrs))));
10501
10502   eol_type = args[coding_arg_eol_type];
10503   if (! NILP (eol_type)
10504       && ! EQ (eol_type, Qunix)
10505       && ! EQ (eol_type, Qdos)
10506       && ! EQ (eol_type, Qmac))
10507     error ("Invalid eol-type");
10508
10509   aliases = list1 (name);
10510
10511   if (NILP (eol_type))
10512     {
10513       eol_type = make_subsidiaries (name);
10514       for (i = 0; i < 3; i++)
10515         {
10516           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10517
10518           this_name = AREF (eol_type, i);
10519           this_aliases = list1 (this_name);
10520           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10521           this_spec = make_uninit_vector (3);
10522           ASET (this_spec, 0, attrs);
10523           ASET (this_spec, 1, this_aliases);
10524           ASET (this_spec, 2, this_eol_type);
10525           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10526           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10527           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10528           if (NILP (val))
10529             Vcoding_system_alist
10530               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10531                        Vcoding_system_alist);
10532         }
10533     }
10534
10535   spec_vec = make_uninit_vector (3);
10536   ASET (spec_vec, 0, attrs);
10537   ASET (spec_vec, 1, aliases);
10538   ASET (spec_vec, 2, eol_type);
10539
10540   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10541   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10542   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10543   if (NILP (val))
10544     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10545                                   Vcoding_system_alist);
10546
10547   {
10548     int id = coding_categories[category].id;
10549
10550     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10551       setup_coding_system (name, &coding_categories[category]);
10552   }
10553
10554   return Qnil;
10555
10556  short_args:
10557   return Fsignal (Qwrong_number_of_arguments,
10558                   Fcons (intern ("define-coding-system-internal"),
10559                          make_number (nargs)));
10560 }
10561
10562
10563 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10564        3, 3, 0,
10565        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10566   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10567 {
10568   Lisp_Object spec, attrs;
10569
10570   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10571   attrs = AREF (spec, 0);
10572   if (EQ (prop, QCmnemonic))
10573     {
10574       if (! STRINGP (val))
10575         CHECK_CHARACTER (val);
10576       ASET (attrs, coding_attr_mnemonic, val);
10577     }
10578   else if (EQ (prop, QCdefault_char))
10579     {
10580       if (NILP (val))
10581         val = make_number (' ');
10582       else
10583         CHECK_CHARACTER (val);
10584       ASET (attrs, coding_attr_default_char, val);
10585     }
10586   else if (EQ (prop, QCdecode_translation_table))
10587     {
10588       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10589         CHECK_SYMBOL (val);
10590       ASET (attrs, coding_attr_decode_tbl, val);
10591     }
10592   else if (EQ (prop, QCencode_translation_table))
10593     {
10594       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10595         CHECK_SYMBOL (val);
10596       ASET (attrs, coding_attr_encode_tbl, val);
10597     }
10598   else if (EQ (prop, QCpost_read_conversion))
10599     {
10600       CHECK_SYMBOL (val);
10601       ASET (attrs, coding_attr_post_read, val);
10602     }
10603   else if (EQ (prop, QCpre_write_conversion))
10604     {
10605       CHECK_SYMBOL (val);
10606       ASET (attrs, coding_attr_pre_write, val);
10607     }
10608   else if (EQ (prop, QCascii_compatible_p))
10609     {
10610       ASET (attrs, coding_attr_ascii_compat, val);
10611     }
10612
10613   ASET (attrs, coding_attr_plist,
10614         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10615   return val;
10616 }
10617
10618
10619 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10620        Sdefine_coding_system_alias, 2, 2, 0,
10621        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10622   (Lisp_Object alias, Lisp_Object coding_system)
10623 {
10624   Lisp_Object spec, aliases, eol_type, val;
10625
10626   CHECK_SYMBOL (alias);
10627   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10628   aliases = AREF (spec, 1);
10629   /* ALIASES should be a list of length more than zero, and the first
10630      element is a base coding system.  Append ALIAS at the tail of the
10631      list.  */
10632   while (!NILP (XCDR (aliases)))
10633     aliases = XCDR (aliases);
10634   XSETCDR (aliases, list1 (alias));
10635
10636   eol_type = AREF (spec, 2);
10637   if (VECTORP (eol_type))
10638     {
10639       Lisp_Object subsidiaries;
10640       int i;
10641
10642       subsidiaries = make_subsidiaries (alias);
10643       for (i = 0; i < 3; i++)
10644         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10645                                      AREF (eol_type, i));
10646     }
10647
10648   Fputhash (alias, spec, Vcoding_system_hash_table);
10649   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10650   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10651   if (NILP (val))
10652     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10653                                   Vcoding_system_alist);
10654
10655   return Qnil;
10656 }
10657
10658 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10659        1, 1, 0,
10660        doc: /* Return the base of CODING-SYSTEM.
10661 Any alias or subsidiary coding system is not a base coding system.  */)
10662   (Lisp_Object coding_system)
10663 {
10664   Lisp_Object spec, attrs;
10665
10666   if (NILP (coding_system))
10667     return (Qno_conversion);
10668   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10669   attrs = AREF (spec, 0);
10670   return CODING_ATTR_BASE_NAME (attrs);
10671 }
10672
10673 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10674        1, 1, 0,
10675        doc: /* Return the property list of CODING-SYSTEM.  */)
10676   (Lisp_Object coding_system)
10677 {
10678   Lisp_Object spec, attrs;
10679
10680   if (NILP (coding_system))
10681     coding_system = Qno_conversion;
10682   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10683   attrs = AREF (spec, 0);
10684   return CODING_ATTR_PLIST (attrs);
10685 }
10686
10687
10688 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10689        1, 1, 0,
10690        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10691   (Lisp_Object coding_system)
10692 {
10693   Lisp_Object spec;
10694
10695   if (NILP (coding_system))
10696     coding_system = Qno_conversion;
10697   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10698   return AREF (spec, 1);
10699 }
10700
10701 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10702        Scoding_system_eol_type, 1, 1, 0,
10703        doc: /* Return eol-type of CODING-SYSTEM.
10704 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10705
10706 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10707 and CR respectively.
10708
10709 A vector value indicates that a format of end-of-line should be
10710 detected automatically.  Nth element of the vector is the subsidiary
10711 coding system whose eol-type is N.  */)
10712   (Lisp_Object coding_system)
10713 {
10714   Lisp_Object spec, eol_type;
10715   int n;
10716
10717   if (NILP (coding_system))
10718     coding_system = Qno_conversion;
10719   if (! CODING_SYSTEM_P (coding_system))
10720     return Qnil;
10721   spec = CODING_SYSTEM_SPEC (coding_system);
10722   eol_type = AREF (spec, 2);
10723   if (VECTORP (eol_type))
10724     return Fcopy_sequence (eol_type);
10725   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10726   return make_number (n);
10727 }
10728
10729 #endif /* emacs */
10730
10731 \f
10732 /*** 9. Post-amble ***/
10733
10734 void
10735 init_coding_once (void)
10736 {
10737   int i;
10738
10739   for (i = 0; i < coding_category_max; i++)
10740     {
10741       coding_categories[i].id = -1;
10742       coding_priorities[i] = i;
10743     }
10744
10745   /* ISO2022 specific initialize routine.  */
10746   for (i = 0; i < 0x20; i++)
10747     iso_code_class[i] = ISO_control_0;
10748   for (i = 0x21; i < 0x7F; i++)
10749     iso_code_class[i] = ISO_graphic_plane_0;
10750   for (i = 0x80; i < 0xA0; i++)
10751     iso_code_class[i] = ISO_control_1;
10752   for (i = 0xA1; i < 0xFF; i++)
10753     iso_code_class[i] = ISO_graphic_plane_1;
10754   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10755   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10756   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10757   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10758   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10759   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10760   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10761   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10762   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10763
10764   for (i = 0; i < 256; i++)
10765     {
10766       emacs_mule_bytes[i] = 1;
10767     }
10768   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10769   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10770   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10771   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10772 }
10773
10774 #ifdef emacs
10775
10776 void
10777 syms_of_coding (void)
10778 {
10779   staticpro (&Vcoding_system_hash_table);
10780   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10781
10782   staticpro (&Vsjis_coding_system);
10783   Vsjis_coding_system = Qnil;
10784
10785   staticpro (&Vbig5_coding_system);
10786   Vbig5_coding_system = Qnil;
10787
10788   staticpro (&Vcode_conversion_reused_workbuf);
10789   Vcode_conversion_reused_workbuf = Qnil;
10790
10791   staticpro (&Vcode_conversion_workbuf_name);
10792   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10793
10794   reused_workbuf_in_use = 0;
10795
10796   DEFSYM (Qcharset, "charset");
10797   DEFSYM (Qtarget_idx, "target-idx");
10798   DEFSYM (Qcoding_system_history, "coding-system-history");
10799   Fset (Qcoding_system_history, Qnil);
10800
10801   /* Target FILENAME is the first argument.  */
10802   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10803   /* Target FILENAME is the third argument.  */
10804   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10805
10806   DEFSYM (Qcall_process, "call-process");
10807   /* Target PROGRAM is the first argument.  */
10808   Fput (Qcall_process, Qtarget_idx, make_number (0));
10809
10810   DEFSYM (Qcall_process_region, "call-process-region");
10811   /* Target PROGRAM is the third argument.  */
10812   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10813
10814   DEFSYM (Qstart_process, "start-process");
10815   /* Target PROGRAM is the third argument.  */
10816   Fput (Qstart_process, Qtarget_idx, make_number (2));
10817
10818   DEFSYM (Qopen_network_stream, "open-network-stream");
10819   /* Target SERVICE is the fourth argument.  */
10820   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10821
10822   DEFSYM (Qcoding_system, "coding-system");
10823   DEFSYM (Qcoding_aliases, "coding-aliases");
10824
10825   DEFSYM (Qeol_type, "eol-type");
10826   DEFSYM (Qunix, "unix");
10827   DEFSYM (Qdos, "dos");
10828   DEFSYM (Qmac, "mac");
10829
10830   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10831   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10832   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10833   DEFSYM (Qdefault_char, "default-char");
10834   DEFSYM (Qundecided, "undecided");
10835   DEFSYM (Qno_conversion, "no-conversion");
10836   DEFSYM (Qraw_text, "raw-text");
10837
10838   DEFSYM (Qiso_2022, "iso-2022");
10839
10840   DEFSYM (Qutf_8, "utf-8");
10841   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10842
10843 #if defined (WINDOWSNT) || defined (CYGWIN)
10844   /* No, not utf-16-le: that one has a BOM.  */
10845   DEFSYM (Qutf_16le, "utf-16le");
10846 #endif
10847
10848   DEFSYM (Qutf_16, "utf-16");
10849   DEFSYM (Qbig, "big");
10850   DEFSYM (Qlittle, "little");
10851
10852   DEFSYM (Qshift_jis, "shift-jis");
10853   DEFSYM (Qbig5, "big5");
10854
10855   DEFSYM (Qcoding_system_p, "coding-system-p");
10856
10857   /* Error signaled when there's a problem with detecting a coding system.  */
10858   DEFSYM (Qcoding_system_error, "coding-system-error");
10859   Fput (Qcoding_system_error, Qerror_conditions,
10860         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10861   Fput (Qcoding_system_error, Qerror_message,
10862         build_pure_c_string ("Invalid coding system"));
10863
10864   DEFSYM (Qtranslation_table, "translation-table");
10865   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10866   DEFSYM (Qtranslation_table_id, "translation-table-id");
10867   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10868   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10869
10870   DEFSYM (Qvalid_codes, "valid-codes");
10871
10872   /* Coding system emacs-mule and raw-text are for converting only
10873      end-of-line format.  */
10874   DEFSYM (Qemacs_mule, "emacs-mule");
10875
10876   DEFSYM (QCcategory, ":category");
10877   DEFSYM (QCmnemonic, ":mnemonic");
10878   DEFSYM (QCdefault_char, ":default-char");
10879   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10880   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10881   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10882   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10883   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10884
10885   Vcoding_category_table
10886     = Fmake_vector (make_number (coding_category_max), Qnil);
10887   staticpro (&Vcoding_category_table);
10888   /* Followings are target of code detection.  */
10889   ASET (Vcoding_category_table, coding_category_iso_7,
10890         intern_c_string ("coding-category-iso-7"));
10891   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10892         intern_c_string ("coding-category-iso-7-tight"));
10893   ASET (Vcoding_category_table, coding_category_iso_8_1,
10894         intern_c_string ("coding-category-iso-8-1"));
10895   ASET (Vcoding_category_table, coding_category_iso_8_2,
10896         intern_c_string ("coding-category-iso-8-2"));
10897   ASET (Vcoding_category_table, coding_category_iso_7_else,
10898         intern_c_string ("coding-category-iso-7-else"));
10899   ASET (Vcoding_category_table, coding_category_iso_8_else,
10900         intern_c_string ("coding-category-iso-8-else"));
10901   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10902         intern_c_string ("coding-category-utf-8-auto"));
10903   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10904         intern_c_string ("coding-category-utf-8"));
10905   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10906         intern_c_string ("coding-category-utf-8-sig"));
10907   ASET (Vcoding_category_table, coding_category_utf_16_be,
10908         intern_c_string ("coding-category-utf-16-be"));
10909   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10910         intern_c_string ("coding-category-utf-16-auto"));
10911   ASET (Vcoding_category_table, coding_category_utf_16_le,
10912         intern_c_string ("coding-category-utf-16-le"));
10913   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10914         intern_c_string ("coding-category-utf-16-be-nosig"));
10915   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10916         intern_c_string ("coding-category-utf-16-le-nosig"));
10917   ASET (Vcoding_category_table, coding_category_charset,
10918         intern_c_string ("coding-category-charset"));
10919   ASET (Vcoding_category_table, coding_category_sjis,
10920         intern_c_string ("coding-category-sjis"));
10921   ASET (Vcoding_category_table, coding_category_big5,
10922         intern_c_string ("coding-category-big5"));
10923   ASET (Vcoding_category_table, coding_category_ccl,
10924         intern_c_string ("coding-category-ccl"));
10925   ASET (Vcoding_category_table, coding_category_emacs_mule,
10926         intern_c_string ("coding-category-emacs-mule"));
10927   /* Followings are NOT target of code detection.  */
10928   ASET (Vcoding_category_table, coding_category_raw_text,
10929         intern_c_string ("coding-category-raw-text"));
10930   ASET (Vcoding_category_table, coding_category_undecided,
10931         intern_c_string ("coding-category-undecided"));
10932
10933   DEFSYM (Qinsufficient_source, "insufficient-source");
10934   DEFSYM (Qinvalid_source, "invalid-source");
10935   DEFSYM (Qinterrupted, "interrupted");
10936
10937   /* If a symbol has this property, evaluate the value to define the
10938      symbol as a coding system.  */
10939   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10940
10941   defsubr (&Scoding_system_p);
10942   defsubr (&Sread_coding_system);
10943   defsubr (&Sread_non_nil_coding_system);
10944   defsubr (&Scheck_coding_system);
10945   defsubr (&Sdetect_coding_region);
10946   defsubr (&Sdetect_coding_string);
10947   defsubr (&Sfind_coding_systems_region_internal);
10948   defsubr (&Sunencodable_char_position);
10949   defsubr (&Scheck_coding_systems_region);
10950   defsubr (&Sdecode_coding_region);
10951   defsubr (&Sencode_coding_region);
10952   defsubr (&Sdecode_coding_string);
10953   defsubr (&Sencode_coding_string);
10954   defsubr (&Sdecode_sjis_char);
10955   defsubr (&Sencode_sjis_char);
10956   defsubr (&Sdecode_big5_char);
10957   defsubr (&Sencode_big5_char);
10958   defsubr (&Sset_terminal_coding_system_internal);
10959   defsubr (&Sset_safe_terminal_coding_system_internal);
10960   defsubr (&Sterminal_coding_system);
10961   defsubr (&Sset_keyboard_coding_system_internal);
10962   defsubr (&Skeyboard_coding_system);
10963   defsubr (&Sfind_operation_coding_system);
10964   defsubr (&Sset_coding_system_priority);
10965   defsubr (&Sdefine_coding_system_internal);
10966   defsubr (&Sdefine_coding_system_alias);
10967   defsubr (&Scoding_system_put);
10968   defsubr (&Scoding_system_base);
10969   defsubr (&Scoding_system_plist);
10970   defsubr (&Scoding_system_aliases);
10971   defsubr (&Scoding_system_eol_type);
10972   defsubr (&Scoding_system_priority_list);
10973
10974   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10975                doc: /* List of coding systems.
10976
10977 Do not alter the value of this variable manually.  This variable should be
10978 updated by the functions `define-coding-system' and
10979 `define-coding-system-alias'.  */);
10980   Vcoding_system_list = Qnil;
10981
10982   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10983                doc: /* Alist of coding system names.
10984 Each element is one element list of coding system name.
10985 This variable is given to `completing-read' as COLLECTION argument.
10986
10987 Do not alter the value of this variable manually.  This variable should be
10988 updated by the functions `make-coding-system' and
10989 `define-coding-system-alias'.  */);
10990   Vcoding_system_alist = Qnil;
10991
10992   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10993                doc: /* List of coding-categories (symbols) ordered by priority.
10994
10995 On detecting a coding system, Emacs tries code detection algorithms
10996 associated with each coding-category one by one in this order.  When
10997 one algorithm agrees with a byte sequence of source text, the coding
10998 system bound to the corresponding coding-category is selected.
10999
11000 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11001   {
11002     int i;
11003
11004     Vcoding_category_list = Qnil;
11005     for (i = coding_category_max - 1; i >= 0; i--)
11006       Vcoding_category_list
11007         = Fcons (AREF (Vcoding_category_table, i),
11008                  Vcoding_category_list);
11009   }
11010
11011   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11012                doc: /* Specify the coding system for read operations.
11013 It is useful to bind this variable with `let', but do not set it globally.
11014 If the value is a coding system, it is used for decoding on read operation.
11015 If not, an appropriate element is used from one of the coding system alists.
11016 There are three such tables: `file-coding-system-alist',
11017 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11018   Vcoding_system_for_read = Qnil;
11019
11020   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11021                doc: /* Specify the coding system for write operations.
11022 Programs bind this variable with `let', but you should not set it globally.
11023 If the value is a coding system, it is used for encoding of output,
11024 when writing it to a file and when sending it to a file or subprocess.
11025
11026 If this does not specify a coding system, an appropriate element
11027 is used from one of the coding system alists.
11028 There are three such tables: `file-coding-system-alist',
11029 `process-coding-system-alist', and `network-coding-system-alist'.
11030 For output to files, if the above procedure does not specify a coding system,
11031 the value of `buffer-file-coding-system' is used.  */);
11032   Vcoding_system_for_write = Qnil;
11033
11034   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11035                doc: /*
11036 Coding system used in the latest file or process I/O.  */);
11037   Vlast_coding_system_used = Qnil;
11038
11039   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11040                doc: /*
11041 Error status of the last code conversion.
11042
11043 When an error was detected in the last code conversion, this variable
11044 is set to one of the following symbols.
11045   `insufficient-source'
11046   `inconsistent-eol'
11047   `invalid-source'
11048   `interrupted'
11049   `insufficient-memory'
11050 When no error was detected, the value doesn't change.  So, to check
11051 the error status of a code conversion by this variable, you must
11052 explicitly set this variable to nil before performing code
11053 conversion.  */);
11054   Vlast_code_conversion_error = Qnil;
11055
11056   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11057                doc: /*
11058 *Non-nil means always inhibit code conversion of end-of-line format.
11059 See info node `Coding Systems' and info node `Text and Binary' concerning
11060 such conversion.  */);
11061   inhibit_eol_conversion = 0;
11062
11063   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11064                doc: /*
11065 Non-nil means process buffer inherits coding system of process output.
11066 Bind it to t if the process output is to be treated as if it were a file
11067 read from some filesystem.  */);
11068   inherit_process_coding_system = 0;
11069
11070   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11071                doc: /*
11072 Alist to decide a coding system to use for a file I/O operation.
11073 The format is ((PATTERN . VAL) ...),
11074 where PATTERN is a regular expression matching a file name,
11075 VAL is a coding system, a cons of coding systems, or a function symbol.
11076 If VAL is a coding system, it is used for both decoding and encoding
11077 the file contents.
11078 If VAL is a cons of coding systems, the car part is used for decoding,
11079 and the cdr part is used for encoding.
11080 If VAL is a function symbol, the function must return a coding system
11081 or a cons of coding systems which are used as above.  The function is
11082 called with an argument that is a list of the arguments with which
11083 `find-operation-coding-system' was called.  If the function can't decide
11084 a coding system, it can return `undecided' so that the normal
11085 code-detection is performed.
11086
11087 See also the function `find-operation-coding-system'
11088 and the variable `auto-coding-alist'.  */);
11089   Vfile_coding_system_alist = Qnil;
11090
11091   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11092                doc: /*
11093 Alist to decide a coding system to use for a process I/O operation.
11094 The format is ((PATTERN . VAL) ...),
11095 where PATTERN is a regular expression matching a program name,
11096 VAL is a coding system, a cons of coding systems, or a function symbol.
11097 If VAL is a coding system, it is used for both decoding what received
11098 from the program and encoding what sent to the program.
11099 If VAL is a cons of coding systems, the car part is used for decoding,
11100 and the cdr part is used for encoding.
11101 If VAL is a function symbol, the function must return a coding system
11102 or a cons of coding systems which are used as above.
11103
11104 See also the function `find-operation-coding-system'.  */);
11105   Vprocess_coding_system_alist = Qnil;
11106
11107   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11108                doc: /*
11109 Alist to decide a coding system to use for a network I/O operation.
11110 The format is ((PATTERN . VAL) ...),
11111 where PATTERN is a regular expression matching a network service name
11112 or is a port number to connect to,
11113 VAL is a coding system, a cons of coding systems, or a function symbol.
11114 If VAL is a coding system, it is used for both decoding what received
11115 from the network stream and encoding what sent to the network stream.
11116 If VAL is a cons of coding systems, the car part is used for decoding,
11117 and the cdr part is used for encoding.
11118 If VAL is a function symbol, the function must return a coding system
11119 or a cons of coding systems which are used as above.
11120
11121 See also the function `find-operation-coding-system'.  */);
11122   Vnetwork_coding_system_alist = Qnil;
11123
11124   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11125                doc: /* Coding system to use with system messages.
11126 Also used for decoding keyboard input on X Window system.  */);
11127   Vlocale_coding_system = Qnil;
11128
11129   /* The eol mnemonics are reset in startup.el system-dependently.  */
11130   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11131                doc: /*
11132 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11133   eol_mnemonic_unix = build_pure_c_string (":");
11134
11135   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11136                doc: /*
11137 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11138   eol_mnemonic_dos = build_pure_c_string ("\\");
11139
11140   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11141                doc: /*
11142 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11143   eol_mnemonic_mac = build_pure_c_string ("/");
11144
11145   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11146                doc: /*
11147 *String displayed in mode line when end-of-line format is not yet determined.  */);
11148   eol_mnemonic_undecided = build_pure_c_string (":");
11149
11150   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11151                doc: /*
11152 *Non-nil enables character translation while encoding and decoding.  */);
11153   Venable_character_translation = Qt;
11154
11155   DEFVAR_LISP ("standard-translation-table-for-decode",
11156                Vstandard_translation_table_for_decode,
11157                doc: /* Table for translating characters while decoding.  */);
11158   Vstandard_translation_table_for_decode = Qnil;
11159
11160   DEFVAR_LISP ("standard-translation-table-for-encode",
11161                Vstandard_translation_table_for_encode,
11162                doc: /* Table for translating characters while encoding.  */);
11163   Vstandard_translation_table_for_encode = Qnil;
11164
11165   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11166                doc: /* Alist of charsets vs revision numbers.
11167 While encoding, if a charset (car part of an element) is found,
11168 designate it with the escape sequence identifying revision (cdr part
11169 of the element).  */);
11170   Vcharset_revision_table = Qnil;
11171
11172   DEFVAR_LISP ("default-process-coding-system",
11173                Vdefault_process_coding_system,
11174                doc: /* Cons of coding systems used for process I/O by default.
11175 The car part is used for decoding a process output,
11176 the cdr part is used for encoding a text to be sent to a process.  */);
11177   Vdefault_process_coding_system = Qnil;
11178
11179   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11180                doc: /*
11181 Table of extra Latin codes in the range 128..159 (inclusive).
11182 This is a vector of length 256.
11183 If Nth element is non-nil, the existence of code N in a file
11184 \(or output of subprocess) doesn't prevent it to be detected as
11185 a coding system of ISO 2022 variant which has a flag
11186 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11187 or reading output of a subprocess.
11188 Only 128th through 159th elements have a meaning.  */);
11189   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11190
11191   DEFVAR_LISP ("select-safe-coding-system-function",
11192                Vselect_safe_coding_system_function,
11193                doc: /*
11194 Function to call to select safe coding system for encoding a text.
11195
11196 If set, this function is called to force a user to select a proper
11197 coding system which can encode the text in the case that a default
11198 coding system used in each operation can't encode the text.  The
11199 function should take care that the buffer is not modified while
11200 the coding system is being selected.
11201
11202 The default value is `select-safe-coding-system' (which see).  */);
11203   Vselect_safe_coding_system_function = Qnil;
11204
11205   DEFVAR_BOOL ("coding-system-require-warning",
11206                coding_system_require_warning,
11207                doc: /* Internal use only.
11208 If non-nil, on writing a file, `select-safe-coding-system-function' is
11209 called even if `coding-system-for-write' is non-nil.  The command
11210 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11211   coding_system_require_warning = 0;
11212
11213
11214   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11215                inhibit_iso_escape_detection,
11216                doc: /*
11217 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11218
11219 When Emacs reads text, it tries to detect how the text is encoded.
11220 This code detection is sensitive to escape sequences.  If Emacs sees
11221 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11222 of the ISO2022 encodings, and decodes text by the corresponding coding
11223 system (e.g. `iso-2022-7bit').
11224
11225 However, there may be a case that you want to read escape sequences in
11226 a file as is.  In such a case, you can set this variable to non-nil.
11227 Then the code detection will ignore any escape sequences, and no text is
11228 detected as encoded in some ISO-2022 encoding.  The result is that all
11229 escape sequences become visible in a buffer.
11230
11231 The default value is nil, and it is strongly recommended not to change
11232 it.  That is because many Emacs Lisp source files that contain
11233 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11234 in Emacs's distribution, and they won't be decoded correctly on
11235 reading if you suppress escape sequence detection.
11236
11237 The other way to read escape sequences in a file without decoding is
11238 to explicitly specify some coding system that doesn't use ISO-2022
11239 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11240   inhibit_iso_escape_detection = 0;
11241
11242   DEFVAR_BOOL ("inhibit-null-byte-detection",
11243                inhibit_null_byte_detection,
11244                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11245 By default, Emacs treats it as binary data, and does not attempt to
11246 decode it.  The effect is as if you specified `no-conversion' for
11247 reading that text.
11248
11249 Set this to non-nil when a regular text happens to include null bytes.
11250 Examples are Index nodes of Info files and null-byte delimited output
11251 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11252 decode text as usual.  */);
11253   inhibit_null_byte_detection = 0;
11254
11255   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11256                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11257 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11258   disable_ascii_optimization = 0;
11259
11260   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11261                doc: /* Char table for translating self-inserting characters.
11262 This is applied to the result of input methods, not their input.
11263 See also `keyboard-translate-table'.
11264
11265 Use of this variable for character code unification was rendered
11266 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11267 internal character representation.  */);
11268   Vtranslation_table_for_input = Qnil;
11269
11270   Lisp_Object args[coding_arg_undecided_max];
11271   memclear (args, sizeof args);
11272
11273   Lisp_Object plist[] =
11274     {
11275       intern_c_string (":name"),
11276       args[coding_arg_name] = Qno_conversion,
11277       intern_c_string (":mnemonic"),
11278       args[coding_arg_mnemonic] = make_number ('='),
11279       intern_c_string (":coding-type"),
11280       args[coding_arg_coding_type] = Qraw_text,
11281       intern_c_string (":ascii-compatible-p"),
11282       args[coding_arg_ascii_compatible_p] = Qt,
11283       intern_c_string (":default-char"),
11284       args[coding_arg_default_char] = make_number (0),
11285       intern_c_string (":for-unibyte"),
11286       args[coding_arg_for_unibyte] = Qt,
11287       intern_c_string (":docstring"),
11288       (build_pure_c_string
11289        ("Do no conversion.\n"
11290         "\n"
11291         "When you visit a file with this coding, the file is read into a\n"
11292         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11293         "character.")),
11294       intern_c_string (":eol-type"),
11295       args[coding_arg_eol_type] = Qunix,
11296     };
11297   args[coding_arg_plist] = CALLMANY (Flist, plist);
11298   Fdefine_coding_system_internal (coding_arg_max, args);
11299
11300   plist[1] = args[coding_arg_name] = Qundecided;
11301   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11302   plist[5] = args[coding_arg_coding_type] = Qundecided;
11303   /* This is already set.
11304      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11305   plist[8] = intern_c_string (":charset-list");
11306   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11307   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11308   plist[13] = build_pure_c_string ("No conversion on encoding, "
11309                                    "automatic conversion on decoding.");
11310   plist[15] = args[coding_arg_eol_type] = Qnil;
11311   args[coding_arg_plist] = CALLMANY (Flist, plist);
11312   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11313   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11314   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11315
11316   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11317
11318   for (int i = 0; i < coding_category_max; i++)
11319     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11320
11321 #if defined (DOS_NT)
11322   system_eol_type = Qdos;
11323 #else
11324   system_eol_type = Qunix;
11325 #endif
11326   staticpro (&system_eol_type);
11327 }
11328
11329 char *
11330 emacs_strerror (int error_number)
11331 {
11332   char *str;
11333
11334   synchronize_system_messages_locale ();
11335   str = strerror (error_number);
11336
11337   if (! NILP (Vlocale_coding_system))
11338     {
11339       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11340                                                       Vlocale_coding_system,
11341                                                       0);
11342       str = SSDATA (dec);
11343     }
11344
11345   return str;
11346 }
11347
11348 #endif /* emacs */