src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2015 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How the end of a line of text is encoded depends on the operating
 123   system.  For instance, Unix's format is just one byte of LF
 124   (line-feed) code, whereas DOS's format is a two-byte sequence of
 125   `carriage-return' and `line-feed' codes.  MacOS's format is usually
 126   one byte of `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;                  /* E.g. C, and SRC_BASE used by ONE_MORE_BYTE.  */
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* Give up on an invalid byte; remember strong evidence of XXX.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX: reject the category.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully: report what was found, if any.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it and store the result at CHARBUF.  */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  NOTE: if the
 235      source is multibyte these differ, so count chars separately then.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, increment DST, and count PRODUCED_CHARS.  */
 275     }
 276  label_no_more_destination:
 277   /* Record how many chars and bytes have been produced so far.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 /* Format of end-of-line decided by the system.  This is Qunix on
 307    Unix and Mac, Qdos on DOS/Windows.
 308    This has an effect only for external encoding (i.e. for output to
 309    file and process), not for in-buffer or Lisp string encoding.  */
 310 static Lisp_Object system_eol_type;
 311
 312 #ifdef emacs
 313
 314 /* Coding-systems are handed between Emacs Lisp programs and C internal
 315    routines by the following variable (only one is declared here).  */
 316 /* Coding system to be used to encode text for terminal display when
 317    terminal coding system is nil.  */
 318 struct coding_system safe_terminal_coding;
 319
 320 #endif /* emacs */
 321
 322 /* Two special coding systems.  */
 323 static Lisp_Object Vsjis_coding_system;
 324 static Lisp_Object Vbig5_coding_system;
 325
 326 /* ISO2022 section */
 327 /* Element REG (as an int) of CODING's coding_attr_iso_initial attribute.  */
 328 #define CODING_ISO_INITIAL(coding, reg)                 \
 329   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 330                      coding_attr_iso_initial),          \
 331                reg)))
 332 /* CODING's safe_charsets entry for CHARSET_ID, or -1 if that entry is
 333    255 or CHARSET_ID is greater than CODING's max_charset_id.  */
 334 #define CODING_ISO_REQUEST(coding, charset_id)          \
 335   (((charset_id) <= (coding)->max_charset_id            \
 336     ? ((coding)->safe_charsets[charset_id] != 255       \
 337        ? (coding)->safe_charsets[charset_id]            \
 338        : -1)                                            \
 339     : -1))
 340
 341 /* Accessors for the members of CODING's spec.iso_2022.  */
 342 #define CODING_ISO_FLAGS(coding)        \
 343   ((coding)->spec.iso_2022.flags)
 344 #define CODING_ISO_DESIGNATION(coding, reg)     \
 345   ((coding)->spec.iso_2022.current_designation[reg])
 346 #define CODING_ISO_INVOCATION(coding, plane)    \
 347   ((coding)->spec.iso_2022.current_invocation[plane])
 348 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 349   ((coding)->spec.iso_2022.single_shifting)
 350 #define CODING_ISO_BOL(coding)  \
 351   ((coding)->spec.iso_2022.bol)
 352 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 353   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 354 #define CODING_ISO_CMP_STATUS(coding)   \
 355   (&(coding)->spec.iso_2022.cmp_status)
 356 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 357   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 358 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 359   ((coding)->spec.iso_2022.embedded_utf_8)
 360
 361 /* Control characters of ISO2022.  */
 362                         /* code */      /* function */
 363 #define ISO_CODE_SO     0x0E            /* shift-out */
 364 #define ISO_CODE_SI     0x0F            /* shift-in */
 365 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 366 #define ISO_CODE_ESC    0x1B            /* escape */
 367 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 368 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 369 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 370
 371 /* Every 1-byte code of ISO2022 is classified into one of the
 372    following classes.  */
 373 enum iso_code_class_type
 374   {
 375     ISO_control_0,              /* Control codes in the range
 376                                    0x00..0x1F and 0x7F, except for the
 377                                    following 4 codes.  */
 378     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 379     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 380     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 381     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 382     ISO_control_1,              /* Control codes in the range
 383                                    0x80..0x9F, except for the
 384                                    following 3 codes.  */
 385     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 386     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 387     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 388     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 389     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 390     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 391     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 392   };
 393
 394 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
 395     `iso-flags' attribute of an iso2022 coding system.  */
 396
 397 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 398    instead of the correct short-form sequence (e.g. ESC $ A).  */
 399 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 400
 401 /* If set, reset graphic planes and registers at end-of-line to the
 402    initial state.  */
 403 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 404
 405 /* If set, reset graphic planes and registers before any control
 406    characters to the initial state.  */
 407 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 408
 409 /* If set, encode by 7-bit environment.  */
 410 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 411
 412 /* If set, use locking-shift function.  */
 413 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 414
 415 /* If set, use single-shift function.  Overwrite
 416    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 417 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 418
 419 /* If set, use designation escape sequence.  */
 420 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 421
 422 /* If set, produce revision number sequence.  */
 423 #define CODING_ISO_FLAG_REVISION        0x0080
 424
 425 /* If set, produce ISO6429's direction specifying sequence.  */
 426 #define CODING_ISO_FLAG_DIRECTION       0x0100
 427
 428 /* If set, assume designation states are reset at beginning of line on
 429    output.  */
 430 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 431
 432 /* If set, designation sequence should be placed at beginning of line
 433    on output.  */
 434 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 435
 436 /* If set, do not encode unsafe characters on output.  */
 437 #define CODING_ISO_FLAG_SAFE            0x0800
 438
 439 /* If set, extra latin codes (128..159) are accepted as a valid code
 440    on input.  */
 441 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 442
 443 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 444
 445 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 446
 447 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 448
 449 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 450
 451 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 452
 453 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 454
 455 /* The character produced on output in place of a character whose
 456    encoding is prohibited by CODING_ISO_FLAG_SAFE.  */
 457 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 458
 459 /* UTF-8 section: accessor for CODING's spec.utf_8_bom.  */
 460 #define CODING_UTF_8_BOM(coding)        \
 461   ((coding)->spec.utf_8_bom)
 462
 463 /* UTF-16 section: accessors for the members of CODING's spec.utf_16.  */
 464 #define CODING_UTF_16_BOM(coding)       \
 465   ((coding)->spec.utf_16.bom)
 466
 467 #define CODING_UTF_16_ENDIAN(coding)    \
 468   ((coding)->spec.utf_16.endian)
 469
 470 #define CODING_UTF_16_SURROGATE(coding) \
 471   ((coding)->spec.utf_16.surrogate)
 472
 473
 474 /* CCL section: accessors for the CCL-related attributes of CODING.  */
 475 #define CODING_CCL_DECODER(coding)      \
 476   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 477 #define CODING_CCL_ENCODER(coding)      \
 478   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 479 #define CODING_CCL_VALIDS(coding)                                          \
 480   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 481
 482 /* Index for each coding category in `coding_categories'.  */
 483
 484 enum coding_category
 485   {
 486     coding_category_iso_7,
 487     coding_category_iso_7_tight,
 488     coding_category_iso_8_1,
 489     coding_category_iso_8_2,
 490     coding_category_iso_7_else,
 491     coding_category_iso_8_else,
 492     coding_category_utf_8_auto,
 493     coding_category_utf_8_nosig,
 494     coding_category_utf_8_sig,
 495     coding_category_utf_16_auto,
 496     coding_category_utf_16_be,
 497     coding_category_utf_16_le,
 498     coding_category_utf_16_be_nosig,
 499     coding_category_utf_16_le_nosig,
 500     coding_category_charset,
 501     coding_category_sjis,
 502     coding_category_big5,
 503     coding_category_ccl,
 504     coding_category_emacs_mule,
 505     /* All above are targets of code detection.  */
 506     coding_category_raw_text,
 507     coding_category_undecided,
 508     coding_category_max         /* Number of categories; not a category.  */
 509   };
 510
 511 /* Definitions of flag bits used in detect_coding_XXXX.  */
 512 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 513 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 514 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 515 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 516 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 517 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 518 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 519 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 520 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 521 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 522 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 523 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 524 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 525 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 526 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 527 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 528 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 529 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 530 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 531 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 532
 533 /* Every mask above except raw-text.  This value is returned if
 534    detect_coding_mask () finds nothing other than ASCII characters.  */
 535 #define CATEGORY_MASK_ANY               \
 536   (CATEGORY_MASK_ISO_7                  \
 537    | CATEGORY_MASK_ISO_7_TIGHT          \
 538    | CATEGORY_MASK_ISO_8_1              \
 539    | CATEGORY_MASK_ISO_8_2              \
 540    | CATEGORY_MASK_ISO_7_ELSE           \
 541    | CATEGORY_MASK_ISO_8_ELSE           \
 542    | CATEGORY_MASK_UTF_8_AUTO           \
 543    | CATEGORY_MASK_UTF_8_NOSIG          \
 544    | CATEGORY_MASK_UTF_8_SIG            \
 545    | CATEGORY_MASK_UTF_16_AUTO          \
 546    | CATEGORY_MASK_UTF_16_BE            \
 547    | CATEGORY_MASK_UTF_16_LE            \
 548    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 549    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 550    | CATEGORY_MASK_CHARSET              \
 551    | CATEGORY_MASK_SJIS                 \
 552    | CATEGORY_MASK_BIG5                 \
 553    | CATEGORY_MASK_CCL                  \
 554    | CATEGORY_MASK_EMACS_MULE)
 555
 556 /* Unions of related category masks.  */
 557 #define CATEGORY_MASK_ISO_7BIT \
 558   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 559
 560 #define CATEGORY_MASK_ISO_8BIT \
 561   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 562
 563 #define CATEGORY_MASK_ISO_ELSE \
 564   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 565
 566 #define CATEGORY_MASK_ISO_ESCAPE        \
 567   (CATEGORY_MASK_ISO_7                  \
 568    | CATEGORY_MASK_ISO_7_TIGHT          \
 569    | CATEGORY_MASK_ISO_7_ELSE           \
 570    | CATEGORY_MASK_ISO_8_ELSE)
 571
 572 #define CATEGORY_MASK_ISO       \
 573   (  CATEGORY_MASK_ISO_7BIT     \
 574      | CATEGORY_MASK_ISO_8BIT   \
 575      | CATEGORY_MASK_ISO_ELSE)
 576
 577 #define CATEGORY_MASK_UTF_16            \
 578   (CATEGORY_MASK_UTF_16_AUTO            \
 579    | CATEGORY_MASK_UTF_16_BE            \
 580    | CATEGORY_MASK_UTF_16_LE            \
 581    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 582    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 583
 584 #define CATEGORY_MASK_UTF_8     \
 585   (CATEGORY_MASK_UTF_8_AUTO     \
 586    | CATEGORY_MASK_UTF_8_NOSIG  \
 587    | CATEGORY_MASK_UTF_8_SIG)
 588
 589 /* Table of coding categories (Lisp symbols).  This variable is for
 590    internal use only.  */
 591 static Lisp_Object Vcoding_category_table;
 592
 593 /* Coding categories ordered by priority (coding_category_max slots).  */
 594 static enum coding_category coding_priorities[coding_category_max];
 595
 596 /* Nth element is a coding context for the coding system bound to the
 597    Nth coding category (indexed by enum coding_category).  */
 598 static struct coding_system coding_categories[coding_category_max];
 599
 600 /* Map FLAG to an int: nil gives -1, t gives 1, anything else gives 0.  */
 601
 602 static int
 603 encode_inhibit_flag (Lisp_Object flag)
 604 {
 605   return ! NILP (flag) ? EQ (flag, Qt) : -1;
 606 }
 607
 608 /* Decide whether a flag is in effect, given its encoding ENCODED_FLAG:
 609    1 means yes, -1 means no, and 0 means defer to the user variable VAR.  */
 610
 611 static bool
 612 inhibit_flag (int encoded_flag, bool var)
 613 {
 614   return encoded_flag > 0 || (encoded_flag == 0 && var);
 615 }
 616 /* Set ATTRS from CODING->id, and CHARSET_LIST from ATTRS.  */
 617 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 618   do {                                                  \
 619     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 620     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 621   } while (0)
 622 /* Run CHECK_NATNUM on the car of X, then store the value back in X.  */
 623 static void
 624 CHECK_NATNUM_CAR (Lisp_Object x)
 625 {
 626   Lisp_Object car_value = XCAR (x);
 627   CHECK_NATNUM (car_value);
 628   XSETCAR (x, car_value);
 629 }
 630 /* Run CHECK_NATNUM on the cdr of X, then store the value back in X.  */
 631 static void
 632 CHECK_NATNUM_CDR (Lisp_Object x)
 633 {
 634   Lisp_Object cdr_value = XCDR (x);
 635   CHECK_NATNUM (cdr_value);
 636   XSETCDR (x, cdr_value);
 637 }
 638
 639 /* True if CODING's destination (a string or a buffer) can be grown.  */
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   Lisp_Object dst = coding->dst_object;
 644   return BUFFERP (dst) || STRINGP (dst);
 645 }
 646
 647
 648 /* Safely get one byte from the source text pointed by SRC which ends
 649    at SRC_END, and set C to that byte.  If the source is exhausted,
 650    jump to 'no_more_source', first recording INSUFFICIENT_SRC if SRC
 651    is past SRC_BASE.  If MULTIBYTEP, a 2-byte form led by 0xC0 or 0xC1
 652    is folded into one byte; for any other multibyte character C is set
 653    to the negated character code and INVALID_SRC is recorded.  Needs:
 654         src, src_end, src_base, multibytep, coding, consumed_chars */
 655
 656 #define ONE_MORE_BYTE(c)                                \
 657   do {                                                  \
 658     if (src == src_end)                                 \
 659       {                                                 \
 660         if (src_base < src)                             \
 661           record_conversion_result                      \
 662             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 663         goto no_more_source;                            \
 664       }                                                 \
 665     c = *src++;                                         \
 666     if (multibytep && (c & 0x80))                       \
 667       {                                                 \
 668         if ((c & 0xFE) == 0xC0)                         \
 669           c = ((c & 1) << 6) | *src++;                  \
 670         else                                            \
 671           {                                             \
 672             src--;                                      \
 673             c = - string_char (src, &src, NULL);        \
 674             record_conversion_result                    \
 675               (coding, CODING_RESULT_INVALID_SRC);      \
 676           }                                             \
 677       }                                                 \
 678     consumed_chars++;                                   \
 679   } while (0)
 680
 681 /* Safely get two bytes from the source text pointed by SRC which ends
 682    at SRC_END, and set C1 and C2 to those bytes while skipping the
 683    heading multibyte characters.  If there are not enough bytes in the
 684    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 685    a multibyte character is found for C2, set C2 to -1 (SRC is not
 686    advanced past that character's trailing bytes).  The caller should
 687    declare and set these variables appropriately in advance:
 688         src, src_end, multibytep
 689    It is intended that this macro is used in detect_coding_utf_16.  */
 690
 691 #define TWO_MORE_BYTES(c1, c2)                          \
 692   do {                                                  \
 693     do {                                                \
 694       if (src == src_end)                               \
 695         goto no_more_source;                            \
 696       c1 = *src++;                                      \
 697       if (multibytep && (c1 & 0x80))                    \
 698         {                                               \
 699           if ((c1 & 0xFE) == 0xC0)                      \
 700             c1 = ((c1 & 1) << 6) | *src++;              \
 701           else                                          \
 702             {                                           \
 703               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 704               c1 = -1;                                  \
 705             }                                           \
 706         }                                               \
 707     } while (c1 < 0);                                   \
 708     if (src == src_end)                                 \
 709       goto no_more_source;                              \
 710     c2 = *src++;                                        \
 711     if (multibytep && (c2 & 0x80))                      \
 712       {                                                 \
 713         if ((c2 & 0xFE) == 0xC0)                        \
 714           c2 = ((c2 & 1) << 6) | *src++;                \
 715         else                                            \
 716           c2 = -1;                                      \
 717       }                                                 \
 718   } while (0)
 719
 720
 721 /* Store a byte C in the place pointed by DST and increment DST to the
 722    next free point, and increment PRODUCED_CHARS.  The caller should
 723    assure that C is 0..127, and declare and set the variables `dst'
 724    and `produced_chars' appropriately in advance.
 725 */
 726
 727
 728 #define EMIT_ONE_ASCII_BYTE(c)  \
 729   do {                          \
 730     produced_chars++;           \
 731     *dst++ = (c);               \
 732   } while (0)
 733
 734
 735 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 then C2.  */
 736
 737 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 738   do {                                  \
 739     produced_chars += 2;                \
 740     *dst++ = (c1), *dst++ = (c2);       \
 741   } while (0)
 742
 743
 744 /* Store a byte C in the place pointed by DST and increment DST to the
 745    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 746    store it in multibyte form (C >= 0x80 goes through BYTE8_TO_CHAR).
 747    The caller should declare and set the variables `dst', `multibytep'
 748    and `produced_chars' appropriately in advance.  */
 749
 750 #define EMIT_ONE_BYTE(c)                \
 751   do {                                  \
 752     produced_chars++;                   \
 753     if (multibytep)                     \
 754       {                                 \
 755         unsigned ch = (c);              \
 756         if (ch >= 0x80)                 \
 757           ch = BYTE8_TO_CHAR (ch);      \
 758         CHAR_STRING_ADVANCE (ch, dst);  \
 759       }                                 \
 760     else                                \
 761       *dst++ = (c);                     \
 762   } while (0)
 763
 764
 765 /* Like EMIT_ONE_BYTE, but emit two bytes, C1 then C2 (counted as two).  */
 766
 767 #define EMIT_TWO_BYTES(c1, c2)          \
 768   do {                                  \
 769     produced_chars += 2;                \
 770     if (multibytep)                     \
 771       {                                 \
 772         unsigned ch;                    \
 773                                         \
 774         ch = (c1);                      \
 775         if (ch >= 0x80)                 \
 776           ch = BYTE8_TO_CHAR (ch);      \
 777         CHAR_STRING_ADVANCE (ch, dst);  \
 778         ch = (c2);                      \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782       }                                 \
 783     else                                \
 784       {                                 \
 785         *dst++ = (c1);                  \
 786         *dst++ = (c2);                  \
 787       }                                 \
 788   } while (0)
 789
 790
 791 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 792   do {                                  \
 793     EMIT_ONE_BYTE (c1);                 \
 794     EMIT_TWO_BYTES (c2, c3);            \
 795   } while (0)
 796
 797
 798 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 799   do {                                          \
 800     EMIT_TWO_BYTES (c1, c2);                    \
 801     EMIT_TWO_BYTES (c3, c4);                    \
 802   } while (0)
 803
 804
 805 static void
 806 record_conversion_result (struct coding_system *coding,
 807                           enum coding_result_code result)
 808 {
 809   coding->result = result;
 810   switch (result)
 811     {
 812     case CODING_RESULT_INSUFFICIENT_SRC:
 813       Vlast_code_conversion_error = Qinsufficient_source;
 814       break;
 815     case CODING_RESULT_INVALID_SRC:
 816       Vlast_code_conversion_error = Qinvalid_source;
 817       break;
 818     case CODING_RESULT_INTERRUPT:
 819       Vlast_code_conversion_error = Qinterrupted;
 820       break;
 821     case CODING_RESULT_INSUFFICIENT_DST:
 822       /* Don't record this error in Vlast_code_conversion_error
 823          because it happens just temporarily and is resolved when the
 824          whole conversion is finished.  */
 825       break;
 826     case CODING_RESULT_SUCCESS:
 827       break;
 828     default:
 829       Vlast_code_conversion_error = intern ("Unknown error");
 830     }
 831 }
 832
 833 /* These wrapper macros are used to preserve validity of pointers into
 834    buffer text across calls to decode_char, encode_char, etc, which
 835    could cause relocation of buffers if it loads a charset map,
 836    because loading a charset map allocates large structures.  */
 837
 838 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 839   do {                                                                       \
 840     ptrdiff_t offset;                                                        \
 841                                                                              \
 842     charset_map_loaded = 0;                                                  \
 843     c = DECODE_CHAR (charset, code);                                         \
 844     if (charset_map_loaded                                                   \
 845         && (offset = coding_change_source (coding)))                         \
 846       {                                                                      \
 847         src += offset;                                                       \
 848         src_base += offset;                                                  \
 849         src_end += offset;                                                   \
 850       }                                                                      \
 851   } while (0)
 852
 853 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 854   do {                                                                  \
 855     ptrdiff_t offset;                                                   \
 856                                                                         \
 857     charset_map_loaded = 0;                                             \
 858     code = ENCODE_CHAR (charset, c);                                    \
 859     if (charset_map_loaded                                              \
 860         && (offset = coding_change_destination (coding)))               \
 861       {                                                                 \
 862         dst += offset;                                                  \
 863         dst_end += offset;                                              \
 864       }                                                                 \
 865   } while (0)
 866
 867 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 868   do {                                                                  \
 869     ptrdiff_t offset;                                                   \
 870                                                                         \
 871     charset_map_loaded = 0;                                             \
 872     charset = char_charset (c, charset_list, code_return);              \
 873     if (charset_map_loaded                                              \
 874         && (offset = coding_change_destination (coding)))               \
 875       {                                                                 \
 876         dst += offset;                                                  \
 877         dst_end += offset;                                              \
 878       }                                                                 \
 879   } while (0)
 880
 881 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 882   do {                                                                  \
 883     ptrdiff_t offset;                                                   \
 884                                                                         \
 885     charset_map_loaded = 0;                                             \
 886     result = CHAR_CHARSET_P (c, charset);                               \
 887     if (charset_map_loaded                                              \
 888         && (offset = coding_change_destination (coding)))               \
 889       {                                                                 \
 890         dst += offset;                                                  \
 891         dst_end += offset;                                              \
 892       }                                                                 \
 893   } while (0)
 894
 895
/* If there is not more than BYTES of room left at dst, allocate more
   memory for coding->destination and update dst and dst_end.  We
   don't have to take care of coding->source which will be relocated.
   It is handled by calling coding_set_source in encode_coding.  */
 900
 901 #define ASSURE_DESTINATION(bytes)                               \
 902   do {                                                          \
 903     if (dst + (bytes) >= dst_end)                               \
 904       {                                                         \
 905         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 906                                                                 \
 907         dst = alloc_destination (coding, more_bytes, dst);      \
 908         dst_end = coding->destination + coding->dst_bytes;      \
 909       }                                                         \
 910   } while (0)
 911
 912
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
   without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
   MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE, so this macro is now just
   another name for CHAR_STRING_ADVANCE.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 919
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This used to be
   like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
   nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR, so this
   macro is now just another name for STRING_CHAR_ADVANCE.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 926
 927 /* Set coding->source from coding->src_object.  */
 928
 929 static void
 930 coding_set_source (struct coding_system *coding)
 931 {
 932   if (BUFFERP (coding->src_object))
 933     {
 934       struct buffer *buf = XBUFFER (coding->src_object);
 935
 936       if (coding->src_pos < 0)
 937         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 938       else
 939         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 940     }
 941   else if (STRINGP (coding->src_object))
 942     {
 943       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 944     }
 945   else
 946     {
 947       /* Otherwise, the source is C string and is never relocated
 948          automatically.  Thus we don't have to update anything.  */
 949     }
 950 }
 951
 952
 953 /* Set coding->source from coding->src_object, and return how many
 954    bytes coding->source was changed.  */
 955
 956 static ptrdiff_t
 957 coding_change_source (struct coding_system *coding)
 958 {
 959   const unsigned char *orig = coding->source;
 960   coding_set_source (coding);
 961   return coding->source - orig;
 962 }
 963
 964
 965 /* Set coding->destination from coding->dst_object.  */
 966
 967 static void
 968 coding_set_destination (struct coding_system *coding)
 969 {
 970   if (BUFFERP (coding->dst_object))
 971     {
 972       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 973         {
 974           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 975           coding->dst_bytes = (GAP_END_ADDR
 976                                - (coding->src_bytes - coding->consumed)
 977                                - coding->destination);
 978         }
 979       else
 980         {
 981           /* We are sure that coding->dst_pos_byte is before the gap
 982              of the buffer. */
 983           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 984                                  + coding->dst_pos_byte - BEG_BYTE);
 985           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 986                                - coding->destination);
 987         }
 988     }
 989   else
 990     {
 991       /* Otherwise, the destination is C string and is never relocated
 992          automatically.  Thus we don't have to update anything.  */
 993     }
 994 }
 995
 996
 997 /* Set coding->destination from coding->dst_object, and return how
 998    many bytes coding->destination was changed.  */
 999
1000 static ptrdiff_t
1001 coding_change_destination (struct coding_system *coding)
1002 {
1003   const unsigned char *orig = coding->destination;
1004   coding_set_destination (coding);
1005   return coding->destination - orig;
1006 }
1007
1008
1009 static void
1010 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1011 {
1012   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1013     string_overflow ();
1014   coding->destination = xrealloc (coding->destination,
1015                                   coding->dst_bytes + bytes);
1016   coding->dst_bytes += bytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* An annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  This equals the
   LEN of 5 that ADD_COMPOSITION_DATA records for a composition
   header; ADD_CHARSET_DATA records 4.  */
#define MAX_ANNOTATION_LENGTH 5
1094
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated.  The caller
   must have a variable `coding' in scope.  BUF is expanded three
   times, so it must be an lvalue without side effects.

   The expansion deliberately does not end with a semicolon; the
   caller supplies it, which lets the macro be used as the body of an
   `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1102
/* Store a five-element composition annotation at BUF and advance BUF
   past it: the common header for NCHARS characters with
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1109
1110
/* Store a four-element charset annotation at BUF and advance BUF past
   it: the common header for NCHARS characters with
   CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Bitmasks for coding->eol_seen.  Each end-of-line convention has its
   own bit so that several of them can be recorded together.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
/* Classify the byte C by its UTF-8 bit pattern.  The masked
   predicates look only at the bits selected by their mask.
   UTF_8_1_OCTET_P is a plain numeric comparison, so it is also true
   for a negative C.  */

/* 0xxxxxxx: C is below 0x80.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1142
/* The three bytes, in order, that the UTF-8 detector, decoder and
   encoder below treat as the byte order mark (signature) at the head
   of a text.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
/* Unlike the other detect_coding_XXX, this function counts the number
   of characters and checks the EOL format.

   Scan the CODING->src_bytes bytes at CODING->source as UTF-8,
   skipping the first CODING->head_ascii bytes, and update the
   CHECKED, FOUND and REJECTED masks of DETECT_INFO for the UTF-8
   categories.  Return false after rejecting CATEGORY_MASK_UTF_8.
   Otherwise record the scanned extent in CODING->detected_utf8_bytes
   and CODING->detected_utf8_chars and return true.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  /* NOTE(review): src, src_end, src_base, multibytep and consumed_chars
     are presumably what ONE_MORE_BYTE (defined earlier in this file)
     operates on -- confirm against the macro.  */
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  ptrdiff_t nchars = coding->head_ascii;
  /* NOTE(review): eol_seen is updated in the loop below, but nothing
     visible in this function stores it back into CODING -- confirm
     whether that is intended.  */
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += nchars;

  /* The strict `<' means a BOM is recognized only when at least one
     more byte follows it.  A recognized BOM counts as one
     character.  */
  if (src == coding->source     /* BOM should be at the head.  */
      && src + 3 < src_end      /* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      nchars++;
    }

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where the current character starts.  The code after
         the loop uses this to tell a clean end of input from an end
         in the middle of a character.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        {
          /* A single-unit character; also track which EOL forms
             occur.  A CR immediately followed by LF is consumed as
             one CRLF and counted as two characters.  */
          nchars++;
          if (c == '\r')
            {
              if (src < src_end && *src == '\n')
                {
                  eol_seen |= EOL_SEEN_CRLF;
                  src++;
                  nchars++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
          else if (c == '\n')
            eol_seen |= EOL_SEEN_LF;
          continue;
        }
      /* Multi-octet sequence: each further byte must be a trailing
         octet.  The sequence is accepted as soon as the number of
         trailing octets read matches what the leading octet C
         announces; any mismatch leaves the loop and rejects UTF-8.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      break;
    }
  /* Reached only by `break': an invalid sequence was seen.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

  /* No `goto' to this label is visible in the function body, so it is
     presumably the target ONE_MORE_BYTE jumps to when the source is
     exhausted -- confirm against the macro.  */
 no_more_source:
  /* Input that ends in the middle of a character is invalid if this
     is the last block.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
        /* The found characters are less than source bytes, which
           means that we found a valid non-ASCII characters.  */
        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  /* Only complete characters are reported: the byte count stops at
     the start of the last character that was begun.  */
  coding->detected_utf8_bytes = src_base - coding->source;
  coding->detected_utf8_chars = nchars;
  return 1;
}
1261
1262
/* Decode UTF-8 bytes of CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf
   at index CODING->charbuf_used.  Stop when the source is exhausted
   or the character buffer is full, then update CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used.  A byte that does
   not start a valid sequence is stored by itself, mapped through
   BYTE8_TO_CHAR unless it is ASCII.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  /* NOTE(review): src, src_end, src_base, multibytep and consumed_chars
     are presumably what ONE_MORE_BYTE (defined earlier in this file)
     operates on -- confirm against the macro.  */
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when eol_dos is set, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  /* Unless the coding is known to have no BOM, look at the first
     three bytes.  If they are exactly the BOM, src stays past them;
     in every other case src is rewound to src_base so that the bytes
     are decoded normally.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, the BOM is looked for only once: later calls
     on this coding see utf_without_bom.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Checkpoint: everything before src_base has been decoded
         completely.  Only the checkpoint is committed on exit.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The byte read ahead after a CR has not been decoded yet,
             so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* Copy bytes below 0x80 straight into charbuf, four per
             iteration, while both buffers keep a margin of more than
             six elements.  The first byte with the high bit set ends
             the fast path.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value from ONE_MORE_BYTE is stored negated;
             see that macro for when it yields one.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS EOL, read the byte after a CR right away and
             keep it pending for the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          /* Multi-octet sequence.  Every further byte must be a
             trailing octet, and the assembled code must not be
             representable by a shorter sequence.  */
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, also reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and consume just its
         first byte, stored as itself if ASCII and through
         BYTE8_TO_CHAR otherwise.  Decoding then resumes at the next
         byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
    }

  /* No `goto' to this label is visible in the function body, so it is
     presumably the target ONE_MORE_BYTE jumps to when the source is
     exhausted -- confirm against the macro.  Bytes of a sequence that
     was begun but not finished lie after src_base and are therefore
     not reported as consumed.  */
 no_more_source:
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1450
1451
1452 static bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
1515 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1516    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1517
/* True if the bits of VAL selected by the mask 0xFC00 equal 0xD800,
   i.e. VAL masked to 16 bits lies in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the bits of VAL selected by the mask 0xFC00 equal 0xDC00,
   i.e. VAL masked to 16 bits lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1523
1524
1525 static bool
1526 detect_coding_utf_16 (struct coding_system *coding,
1527                       struct coding_detection_info *detect_info)
1528 {
1529   const unsigned char *src = coding->source;
1530   const unsigned char *src_end = coding->source + coding->src_bytes;
1531   bool multibytep = coding->src_multibyte;
1532   int c1, c2;
1533
1534   detect_info->checked |= CATEGORY_MASK_UTF_16;
1535   if (coding->mode & CODING_MODE_LAST_BLOCK
1536       && (coding->src_chars & 1))
1537     {
1538       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1539       return 0;
1540     }
1541
1542   TWO_MORE_BYTES (c1, c2);
1543   if ((c1 == 0xFF) && (c2 == 0xFE))
1544     {
1545       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1546                              | CATEGORY_MASK_UTF_16_AUTO);
1547       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1548                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1549                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1550     }
1551   else if ((c1 == 0xFE) && (c2 == 0xFF))
1552     {
1553       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1554                              | CATEGORY_MASK_UTF_16_AUTO);
1555       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1556                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1557                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1558     }
1559   else if (c2 < 0)
1560     {
1561       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1562       return 0;
1563     }
1564   else
1565     {
1566       /* We check the dispersion of Eth and Oth bytes where E is even and
1567          O is odd.  If both are high, we assume binary data.*/
1568       unsigned char e[256], o[256];
1569       unsigned e_num = 1, o_num = 1;
1570
1571       memset (e, 0, 256);
1572       memset (o, 0, 256);
1573       e[c1] = 1;
1574       o[c2] = 1;
1575
1576       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1577                                 |CATEGORY_MASK_UTF_16_BE
1578                                 | CATEGORY_MASK_UTF_16_LE);
1579
1580       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1581              != CATEGORY_MASK_UTF_16)
1582         {
1583           TWO_MORE_BYTES (c1, c2);
1584           if (c2 < 0)
1585             break;
1586           if (! e[c1])
1587             {
1588               e[c1] = 1;
1589               e_num++;
1590               if (e_num >= 128)
1591                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1592             }
1593           if (! o[c2])
1594             {
1595               o[c2] = 1;
1596               o_num++;
1597               if (o_num >= 128)
1598                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1599             }
1600         }
1601       return 0;
1602     }
1603
1604  no_more_source:
1605   return 1;
1606 }
1607
1608 static void
1609 decode_coding_utf_16 (struct coding_system *coding)
1610 {
1611   const unsigned char *src = coding->source + coding->consumed;
1612   const unsigned char *src_end = coding->source + coding->src_bytes;
1613   const unsigned char *src_base;
1614   int *charbuf = coding->charbuf + coding->charbuf_used;
1615   /* We may produces at most 3 chars in one loop.  */
1616   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1617   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1618   bool multibytep = coding->src_multibyte;
1619   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1620   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1621   int surrogate = CODING_UTF_16_SURROGATE (coding);
1622   bool eol_dos
1623     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1624   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1625
1626   if (bom == utf_with_bom)
1627     {
1628       int c, c1, c2;
1629
1630       src_base = src;
1631       ONE_MORE_BYTE (c1);
1632       ONE_MORE_BYTE (c2);
1633       c = (c1 << 8) | c2;
1634
1635       if (endian == utf_16_big_endian
1636           ? c != 0xFEFF : c != 0xFFFE)
1637         {
1638           /* The first two bytes are not BOM.  Treat them as bytes
1639              for a normal character.  */
1640           src = src_base;
1641         }
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644   else if (bom == utf_detect_bom)
1645     {
1646       /* We have already tried to detect BOM and failed in
1647          detect_coding.  */
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650
1651   while (1)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;
1656       consumed_chars_base = consumed_chars;
1657
1658       if (charbuf >= charbuf_end)
1659         {
1660           if (byte_after_cr1 >= 0)
1661             src_base -= 2;
1662           break;
1663         }
1664
1665       if (byte_after_cr1 >= 0)
1666         c1 = byte_after_cr1, byte_after_cr1 = -1;
1667       else
1668         ONE_MORE_BYTE (c1);
1669       if (c1 < 0)
1670         {
1671           *charbuf++ = -c1;
1672           continue;
1673         }
1674       if (byte_after_cr2 >= 0)
1675         c2 = byte_after_cr2, byte_after_cr2 = -1;
1676       else
1677         ONE_MORE_BYTE (c2);
1678       if (c2 < 0)
1679         {
1680           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1681           *charbuf++ = -c2;
1682           continue;
1683         }
1684       c = (endian == utf_16_big_endian
1685            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1686
1687       if (surrogate)
1688         {
1689           if (! UTF_16_LOW_SURROGATE_P (c))
1690             {
1691               if (endian == utf_16_big_endian)
1692                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1693               else
1694                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1695               *charbuf++ = c1;
1696               *charbuf++ = c2;
1697               if (UTF_16_HIGH_SURROGATE_P (c))
1698                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1699               else
1700                 *charbuf++ = c;
1701             }
1702           else
1703             {
1704               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1705               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1706               *charbuf++ = 0x10000 + c;
1707             }
1708         }
1709       else
1710         {
1711           if (UTF_16_HIGH_SURROGATE_P (c))
1712             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1713           else
1714             {
1715               if (eol_dos && c == '\r')
1716                 {
1717                   ONE_MORE_BYTE (byte_after_cr1);
1718                   ONE_MORE_BYTE (byte_after_cr2);
1719                 }
1720               *charbuf++ = c;
1721             }
1722         }
1723     }
1724
1725  no_more_source:
1726   coding->consumed_char += consumed_chars_base;
1727   coding->consumed = src_base - coding->source;
1728   coding->charbuf_used = charbuf - coding->charbuf;
1729 }
1730
1731 static bool
1732 encode_coding_utf_16 (struct coding_system *coding)
1733 {
1734   bool multibytep = coding->dst_multibyte;
1735   int *charbuf = coding->charbuf;
1736   int *charbuf_end = charbuf + coding->charbuf_used;
1737   unsigned char *dst = coding->destination + coding->produced;
1738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1739   int safe_room = 8;
1740   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1741   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1742   ptrdiff_t produced_chars = 0;
1743   int c;
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xFF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in data sent to a
1825    process).  However, it is possible to have text in this format
1826    externally (e.g. by encoding with the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of composition method (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules, each rule encoded as two bytes of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form, but in one of these special formats:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Return true if a text is encoded in 'emacs-mule'.  */
1867
1868 static bool
1869 detect_coding_emacs_mule (struct coding_system *coding,
1870                           struct coding_detection_info *detect_info)
1871 {
1872   const unsigned char *src = coding->source, *src_base;
1873   const unsigned char *src_end = coding->source + coding->src_bytes;
1874   bool multibytep = coding->src_multibyte;
1875   ptrdiff_t consumed_chars = 0;
1876   int c;
1877   int found = 0;
1878
1879   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1880   /* A coding system of this category is always ASCII compatible.  */
1881   src += coding->head_ascii;
1882
1883   while (1)
1884     {
1885       src_base = src;
1886       ONE_MORE_BYTE (c);
1887       if (c < 0)
1888         continue;
1889       if (c == 0x80)
1890         {
1891           /* Perhaps the start of composite character.  We simply skip
1892              it because analyzing it is too heavy for detecting.  But,
1893              at least, we check that the composite character
1894              constitutes of more than 4 bytes.  */
1895           const unsigned char *src_start;
1896
1897         repeat:
1898           src_start = src;
1899           do
1900             {
1901               ONE_MORE_BYTE (c);
1902             }
1903           while (c >= 0xA0);
1904
1905           if (src - src_start <= 4)
1906             break;
1907           found = CATEGORY_MASK_EMACS_MULE;
1908           if (c == 0x80)
1909             goto repeat;
1910         }
1911
1912       if (c < 0x80)
1913         {
1914           if (c < 0x20
1915               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1916             break;
1917         }
1918       else
1919         {
1920           int more_bytes = emacs_mule_bytes[c] - 1;
1921
1922           while (more_bytes > 0)
1923             {
1924               ONE_MORE_BYTE (c);
1925               if (c < 0xA0)
1926                 {
1927                   src--;        /* Unread the last byte.  */
1928                   break;
1929                 }
1930               more_bytes--;
1931             }
1932           if (more_bytes != 0)
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935         }
1936     }
1937   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938   return 0;
1939
1940  no_more_source:
1941   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1942     {
1943       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1944       return 0;
1945     }
1946   detect_info->found |= found;
1947   return 1;
1948 }
1949
1950
1951 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1952    character.  If CMP_STATUS indicates that we must expect MSEQ or
1953    RULE described above, decode it and return the negative value of
1954    the decoded character or rule.  If an invalid byte is found, return
1955    -1.  If SRC is too short, return -2.  */
1956
1957 static int
1958 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1959                  int *nbytes, int *nchars, int *id,
1960                  struct composition_status *cmp_status)
1961 {
1962   const unsigned char *src_end = coding->source + coding->src_bytes;
1963   const unsigned char *src_base = src;
1964   bool multibytep = coding->src_multibyte;
1965   int charset_ID;
1966   unsigned code;
1967   int c;
1968   ptrdiff_t consumed_chars = 0;
1969   bool mseq_found = 0;
1970
1971   ONE_MORE_BYTE (c);
1972   if (c < 0)
1973     {
1974       c = -c;
1975       charset_ID = emacs_mule_charset[0];
1976     }
1977   else
1978     {
1979       if (c >= 0xA0)
1980         {
1981           if (cmp_status->state != COMPOSING_NO
1982               && cmp_status->old_form)
1983             {
1984               if (cmp_status->state == COMPOSING_CHAR)
1985                 {
1986                   if (c == 0xA0)
1987                     {
1988                       ONE_MORE_BYTE (c);
1989                       c -= 0x80;
1990                       if (c < 0)
1991                         goto invalid_code;
1992                     }
1993                   else
1994                     c -= 0x20;
1995                   mseq_found = 1;
1996                 }
1997               else
1998                 {
1999                   *nbytes = src - src_base;
2000                   *nchars = consumed_chars;
2001                   return -c;
2002                 }
2003             }
2004           else
2005             goto invalid_code;
2006         }
2007
2008       switch (emacs_mule_bytes[c])
2009         {
2010         case 2:
2011           if ((charset_ID = emacs_mule_charset[c]) < 0)
2012             goto invalid_code;
2013           ONE_MORE_BYTE (c);
2014           if (c < 0xA0)
2015             goto invalid_code;
2016           code = c & 0x7F;
2017           break;
2018
2019         case 3:
2020           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2021               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2022             {
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2025                 goto invalid_code;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code = c & 0x7F;
2030             }
2031           else
2032             {
2033               if ((charset_ID = emacs_mule_charset[c]) < 0)
2034                 goto invalid_code;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code = (c & 0x7F) << 8;
2039               ONE_MORE_BYTE (c);
2040               if (c < 0xA0)
2041                 goto invalid_code;
2042               code |= c & 0x7F;
2043             }
2044           break;
2045
2046         case 4:
2047           ONE_MORE_BYTE (c);
2048           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2049             goto invalid_code;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code = (c & 0x7F) << 8;
2054           ONE_MORE_BYTE (c);
2055           if (c < 0xA0)
2056             goto invalid_code;
2057           code |= c & 0x7F;
2058           break;
2059
2060         case 1:
2061           code = c;
2062           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2063           break;
2064
2065         default:
2066           emacs_abort ();
2067         }
2068       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2069                           CHARSET_FROM_ID (charset_ID), code, c);
2070       if (c < 0)
2071         goto invalid_code;
2072     }
2073   *nbytes = src - src_base;
2074   *nchars = consumed_chars;
2075   if (id)
2076     *id = charset_ID;
2077   return (mseq_found ? -c : c);
2078
2079  no_more_source:
2080   return -2;
2081
2082  invalid_code:
2083   return -1;
2084 }
2085
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2088
2089 /* Handle these composition sequences ('|': the end of header elements,
2090    BYTES and CHARS >= 0xA0):
2091
2092    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2093    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2094    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2095
2096    and these old forms:
2097
2098    (4) relative composition: 0x80 | MSEQ ... MSEQ
2099    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2100
2101    When the starter 0x80 and the following header elements are found,
2102    this annotation header is produced.
2103
2104         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2105
2106    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108
2109    Then, upon reading the following elements, these codes are produced
2110    until the composition end is found:
2111
2112    (1) CHAR ... CHAR
2113    (2) ALT ... ALT CHAR ... CHAR
2114    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2115    (4) CHAR ... CHAR
2116    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2117
2118    When the composition end is found, LENGTH and NCHARS in the
2119    annotation header are updated as below:
2120
2121    (1) LENGTH: unchanged, NCHARS: unchanged
2122    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2125    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2126
2127    If an error is found while composing, the annotation header is
2128    changed to the original composition header (plus filler -1s) as
2129    below:
2130
2131    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2132    (5)          [ 0x80 0xFF -1 -1 -1 ]
2133
2134    and the sequence [ -2 DECODED-RULE ] is changed to the original
2135    byte sequence as below:
2136         o the original byte sequence is B: [ B -1 ]
2137         o the original byte sequence is B1 B2: [ B1 B2 ]
2138
2139    Most of the routines are implemented by macros because many
2140    variables and labels in the caller decode_coding_emacs_mule must be
2141    accessible, and they are usually used just once (thus they don't
2142    increase the size of the compiled object).  */
2143
/* Decode the Emacs 20 style composition rule byte C and set RULE to
   the result.  C is first reduced by 0xA0 (the change is left in C)
   and must then be in the range 0..80; it packs two references as
   GREF * 9 + NREF.  A reference value of 4 is replaced by 10 before
   the pair is handed to COMPOSITION_ENCODE_RULE.  Jump to
   `invalid_code' if C is out of range.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)                  \
  do {                                                                  \
    int rule20_gref, rule20_nref;                                       \
                                                                        \
    c -= 0xA0;                                                          \
    if (! (0 <= c && c < 81))                                           \
      goto invalid_code;                                                \
    rule20_gref = (c / 9 == 4 ? 10 : c / 9);                            \
    rule20_nref = (c % 9 == 4 ? 10 : c % 9);                            \
    rule = COMPOSITION_ENCODE_RULE (rule20_gref, rule20_nref);          \
  } while (0)
2160
2161
/* Decode an Emacs 21 style composition rule and set RULE to the
   result.  The rule is two bytes long: C holds the first one and the
   second one is read into C with ONE_MORE_BYTE.  Each byte minus 0x20
   gives one reference, which must be in the range 0..80; otherwise
   jump to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)                  \
  do {                                                                  \
    int rule21_gref = c - 0x20;                                         \
    int rule21_nref;                                                    \
                                                                        \
    if (! (0 <= rule21_gref && rule21_gref < 81))                       \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    rule21_nref = c - 0x20;                                             \
    if (! (0 <= rule21_nref && rule21_nref < 81))                       \
      goto invalid_code;                                                \
    rule = COMPOSITION_ENCODE_RULE (rule21_gref, rule21_nref);          \
  } while (0)
2179
2180
/* Start of Emacs 21 style format.  On entry C holds the method byte
   (0xF2 plus the composition method).  The next two source bytes are
   read here: 0xA0 plus the byte length of this composition
   information, then 0xA0 plus the number of characters composed.
   Jump to `invalid_code' if any of them is out of range; otherwise
   set up CMP_STATUS and add the annotation header to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method cmp_method = c - 0xF2;                      \
    int cmp_nbytes, cmp_nchars;                                         \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    cmp_nbytes = c - 0xA0;                                              \
    if (cmp_nbytes < 3)                                                 \
      goto invalid_code;                                                \
    if (cmp_method == COMPOSITION_RELATIVE && cmp_nbytes != 4)          \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    cmp_nchars = c - 0xA0;                                              \
    if (! (0 < cmp_nchars                                               \
           && cmp_nchars < MAX_COMPOSITION_COMPONENTS))                 \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = cmp_method;                                    \
    cmp_status->state = (cmp_method == COMPOSITION_RELATIVE             \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = cmp_nchars;                                    \
    cmp_status->ncomps = cmp_nbytes - 4;                                \
    ADD_COMPOSITION_DATA (charbuf, cmp_nchars, cmp_nbytes, cmp_method); \
  } while (0)
2212
2213
/* Start of an Emacs 20 style (old form) composition whose method is
   CMP_METHOD.  Mark CMP_STATUS as composing characters in the old
   form with no characters or components seen yet, and add an
   annotation header with zero NCHARS and NBYTES to CHARBUF.

   This is the part shared by the two macros below, which used to
   carry two copies of the same body.  */

#define DECODE_EMACS_MULE_20_COMPOSITION(cmp_method)            \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = (cmp_method);                          \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)


/* Start of Emacs 20 style format for relative composition.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  DECODE_EMACS_MULE_20_COMPOSITION (COMPOSITION_RELATIVE)


/* Start of Emacs 20 style format for rule-base composition.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  DECODE_EMACS_MULE_20_COMPOSITION (COMPOSITION_WITH_RULE)
2238
2239
/* Handle the byte that follows the composition starter 0x80.  Read it
   into C and dispatch on it:

     0xF2 + a composition method   Emacs 21 style header follows
     0xFF                          Emacs 20 style rule-base composition
     0xA0..0xBF                    Emacs 20 style relative composition;
                                   the byte is itself the start of the
                                   first component, so SRC is moved
                                   back to read it again

   Any other byte makes this jump to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *cmp_start_src = src;                   \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                        \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)          \
      DECODE_EMACS_MULE_21_COMPOSITION ();                      \
    else if (c == 0xFF)                                         \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();             \
    else if (0xA0 <= c && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* Re-read C as a composition component.  */            \
        src = cmp_start_src;                                    \
      }                                                         \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
2263
/* Finish the composition described by CMP_STATUS successfully.  The
   annotation header starts CMP_STATUS->length elements before
   CHARBUF.  For an old-form composition, store the number of
   characters collected in its NCHARS slot.  Otherwise, for a method
   other than relative composition, set its first slot to NCHARS minus
   the length collected.  Then leave the composing state.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    const int cmp_hdr = - cmp_status->length;                           \
                                                                        \
    if (! cmp_status->old_form)                                         \
      {                                                                 \
        if (cmp_status->method > COMPOSITION_RELATIVE)                  \
          charbuf[cmp_hdr]                                              \
            = charbuf[cmp_hdr + 2] - cmp_status->length;                \
      }                                                                 \
    else                                                                \
      charbuf[cmp_hdr + 2] = cmp_status->nchars;                        \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2274
2275
2276 static int
2277 emacs_mule_finish_composition (int *charbuf,
2278                                struct composition_status *cmp_status)
2279 {
2280   int idx = - cmp_status->length;
2281   int new_chars;
2282
2283   if (cmp_status->old_form && cmp_status->nchars > 0)
2284     {
2285       charbuf[idx + 2] = cmp_status->nchars;
2286       new_chars = 0;
2287       if (cmp_status->method == COMPOSITION_WITH_RULE
2288           && cmp_status->state == COMPOSING_CHAR)
2289         {
2290           /* The last rule was invalid.  */
2291           int rule = charbuf[-1] + 0xA0;
2292
2293           charbuf[-2] = BYTE8_TO_CHAR (rule);
2294           charbuf[-1] = -1;
2295           new_chars = 1;
2296         }
2297     }
2298   else
2299     {
2300       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2301
2302       if (cmp_status->method == COMPOSITION_WITH_RULE)
2303         {
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2305           charbuf[idx++] = -3;
2306           charbuf[idx++] = 0;
2307           new_chars = 1;
2308         }
2309       else
2310         {
2311           int nchars = charbuf[idx + 1] + 0xA0;
2312           int nbytes = charbuf[idx + 2] + 0xA0;
2313
2314           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2317           charbuf[idx++] = -1;
2318           new_chars = 4;
2319         }
2320     }
2321   cmp_status->state = COMPOSING_NO;
2322   return new_chars;
2323 }
2324
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and advance CHAR_OFFSET by the number
   of characters that call reports.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    char_offset                                                           \
      += (cmp_status->state != COMPOSING_NO                               \
          ? emacs_mule_finish_composition (charbuf, cmp_status)           \
          : 0);                                                           \
  } while (0)
2330
2331
2332 static void
2333 decode_coding_emacs_mule (struct coding_system *coding)
2334 {
2335   const unsigned char *src = coding->source + coding->consumed;
2336   const unsigned char *src_end = coding->source + coding->src_bytes;
2337   const unsigned char *src_base;
2338   int *charbuf = coding->charbuf + coding->charbuf_used;
2339   /* We may produce two annotations (charset and composition) in one
2340      loop and one more charset annotation at the end.  */
2341   int *charbuf_end
2342     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2343       /* We can produce up to 2 characters in a loop.  */
2344       - 1;
2345   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2346   bool multibytep = coding->src_multibyte;
2347   ptrdiff_t char_offset = coding->produced_char;
2348   ptrdiff_t last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   bool eol_dos
2351     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   if (cmp_status->state != COMPOSING_NO)
2356     {
2357       int i;
2358
2359       if (charbuf_end - charbuf < cmp_status->length)
2360         emacs_abort ();
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c, id IF_LINT (= 0);
2369
2370       src_base = src;
2371       consumed_chars_base = consumed_chars;
2372
2373       if (charbuf >= charbuf_end)
2374         {
2375           if (byte_after_cr >= 0)
2376             src_base--;
2377           break;
2378         }
2379
2380       if (byte_after_cr >= 0)
2381         c = byte_after_cr, byte_after_cr = -1;
2382       else
2383         ONE_MORE_BYTE (c);
2384
2385       if (c < 0 || c == 0x80)
2386         {
2387           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2388           if (c < 0)
2389             {
2390               *charbuf++ = -c;
2391               char_offset++;
2392             }
2393           else
2394             DECODE_EMACS_MULE_COMPOSITION_START ();
2395           continue;
2396         }
2397
2398       if (c < 0x80)
2399         {
2400           if (eol_dos && c == '\r')
2401             ONE_MORE_BYTE (byte_after_cr);
2402           id = charset_ascii;
2403           if (cmp_status->state != COMPOSING_NO)
2404             {
2405               if (cmp_status->old_form)
2406                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2407               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2408                 cmp_status->ncomps--;
2409             }
2410         }
2411       else
2412         {
2413           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2414           /* emacs_mule_char can load a charset map from a file, which
2415              allocates a large structure and might cause buffer text
2416              to be relocated as result.  Thus, we need to remember the
2417              original pointer to buffer text, and fix up all related
2418              pointers after the call.  */
2419           const unsigned char *orig = coding->source;
2420           ptrdiff_t offset;
2421
2422           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2423                                cmp_status);
2424           offset = coding->source - orig;
2425           if (offset)
2426             {
2427               src += offset;
2428               src_base += offset;
2429               src_end += offset;
2430             }
2431           if (c < 0)
2432             {
2433               if (c == -1)
2434                 goto invalid_code;
2435               if (c == -2)
2436                 break;
2437             }
2438           src = src_base + nbytes;
2439           consumed_chars = consumed_chars_base + nchars;
2440           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2441             cmp_status->ncomps -= nchars;
2442         }
2443
2444       /* Now if C >= 0, we found a normally encoded character, if C <
2445          0, we found an old-style composition component character or
2446          rule.  */
2447
2448       if (cmp_status->state == COMPOSING_NO)
2449         {
2450           if (last_id != id)
2451             {
2452               if (last_id != charset_ascii)
2453                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2454                                   last_id);
2455               last_id = id;
2456               last_offset = char_offset;
2457             }
2458           *charbuf++ = c;
2459           char_offset++;
2460         }
2461       else if (cmp_status->state == COMPOSING_CHAR)
2462         {
2463           if (cmp_status->old_form)
2464             {
2465               if (c >= 0)
2466                 {
2467                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2468                   *charbuf++ = c;
2469                   char_offset++;
2470                 }
2471               else
2472                 {
2473                   *charbuf++ = -c;
2474                   cmp_status->nchars++;
2475                   cmp_status->length++;
2476                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2477                     EMACS_MULE_COMPOSITION_END ();
2478                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2479                     cmp_status->state = COMPOSING_RULE;
2480                 }
2481             }
2482           else
2483             {
2484               *charbuf++ = c;
2485               cmp_status->length++;
2486               cmp_status->nchars--;
2487               if (cmp_status->nchars == 0)
2488                 EMACS_MULE_COMPOSITION_END ();
2489             }
2490         }
2491       else if (cmp_status->state == COMPOSING_RULE)
2492         {
2493           int rule;
2494
2495           if (c >= 0)
2496             {
2497               EMACS_MULE_COMPOSITION_END ();
2498               *charbuf++ = c;
2499               char_offset++;
2500             }
2501           else
2502             {
2503               c = -c;
2504               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2505               if (rule < 0)
2506                 goto invalid_code;
2507               *charbuf++ = -2;
2508               *charbuf++ = rule;
2509               cmp_status->length += 2;
2510               cmp_status->state = COMPOSING_CHAR;
2511             }
2512         }
2513       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2514         {
2515           *charbuf++ = c;
2516           cmp_status->length++;
2517           if (cmp_status->ncomps == 0)
2518             cmp_status->state = COMPOSING_CHAR;
2519           else if (cmp_status->ncomps > 0)
2520             {
2521               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2522                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2523             }
2524           else
2525             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2526         }
2527       else                      /* COMPOSING_COMPONENT_RULE */
2528         {
2529           int rule;
2530
2531           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2532           if (rule < 0)
2533             goto invalid_code;
2534           *charbuf++ = -2;
2535           *charbuf++ = rule;
2536           cmp_status->length += 2;
2537           cmp_status->ncomps--;
2538           if (cmp_status->ncomps > 0)
2539             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2540           else
2541             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2542         }
2543       continue;
2544
2545     invalid_code:
2546       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2547       src = src_base;
2548       consumed_chars = consumed_chars_base;
2549       ONE_MORE_BYTE (c);
2550       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2551       char_offset++;
2552     }
2553
2554  no_more_source:
2555   if (cmp_status->state != COMPOSING_NO)
2556     {
2557       if (coding->mode & CODING_MODE_LAST_BLOCK)
2558         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2559       else
2560         {
2561           int i;
2562
2563           charbuf -= cmp_status->length;
2564           for (i = 0; i < cmp_status->length; i++)
2565             cmp_status->carryover[i] = charbuf[i];
2566         }
2567     }
2568   if (last_id != charset_ascii)
2569     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2570   coding->consumed_char += consumed_chars_base;
2571   coding->consumed = src_base - coding->source;
2572   coding->charbuf_used = charbuf - coding->charbuf;
2573 }
2574
2575
/* Store in CODES[0] and CODES[1] the leading code(s) that introduce a
   character of the charset whose emacs-mule ID is ID.

   An ID below 0xA0 is itself the only leading code; CODES[1] is then
   set to 0 to tell the caller that there is no second one.  Any other
   ID is stored in CODES[1] and is preceded, in CODES[0], by a prefix
   byte selected by the range the ID falls in:
	0xA0..0xDF -> 0x9A
	0xE0..0xEF -> 0x9B
	0xF0..0xF4 -> 0x9C
	0xF5..     -> 0x9D

   The expansion is a single statement without a trailing semicolon
   (the caller supplies it), so the macro can be the unbraced body of
   an `if' that has an `else'.  Both arguments are parenthesized in
   the expansion, and both are evaluated more than once.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2589
2590
     /* Encode the characters and annotations held in CODING->charbuf
        (CODING->charbuf_used elements) in the emacs-mule format, writing
        the bytes to CODING->destination starting at offset
        CODING->produced.

        ASCII characters and raw 8-bit bytes are emitted as single bytes.
        Any other character is emitted as the leading code(s) computed by
        EMACS_MULE_LEADING_CODES from its charset's emacs-mule ID, followed
        by one code byte (charset dimension 1) or two code bytes
        (otherwise) with the high bit of each set.

        Before returning, record CODING_RESULT_SUCCESS, add the number of
        produced characters to CODING->produced_char and set
        CODING->produced to the number of bytes now in the destination.
        The function itself always returns 0 from the code visible here.

        NOTE(review): ASSURE_DESTINATION, EMIT_ONE_BYTE and
        EMIT_ONE_ASCII_BYTE are defined elsewhere; they presumably refer to
        the locals DST, DST_END, MULTIBYTEP and PRODUCED_CHARS by name (and
        may return early) -- confirm before renaming or reordering.  */
2591 static bool
2592 encode_coding_emacs_mule (struct coding_system *coding)
2593 {
2594   bool multibytep = coding->dst_multibyte;
2595   int *charbuf = coding->charbuf;
2596   int *charbuf_end = charbuf + coding->charbuf_used;
2597   unsigned char *dst = coding->destination + coding->produced;
2598   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2599   int safe_room = 8;
2600   ptrdiff_t produced_chars = 0;
2601   Lisp_Object attrs, charset_list;
2602   int c;
       /* Charset ID requested by the most recent charset annotation, or
          -1 when there is none or it is not in CHARSET_LIST.  */
2603   int preferred_charset_id = -1;
2604
2605   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against Vemacs_mule_charset_list; if the coding
          system carried a different list, replace it in ATTRS as well.  */
2606   if (! EQ (charset_list, Vemacs_mule_charset_list))
2607     {
2608       charset_list = Vemacs_mule_charset_list;
2609       ASET (attrs, coding_attr_charset_list, charset_list);
2610     }
2611
2612   while (charbuf < charbuf_end)
2613     {
2614       ASSURE_DESTINATION (safe_room);
2615       c = *charbuf++;
2616
           /* A negative element starts an annotation: -C is the number of
              elements the annotation occupies (including the one just
              consumed), and the next element tells its kind.  */
2617       if (c < 0)
2618         {
2619           /* Handle an annotation.  */
2620           switch (*charbuf)
2621             {
2622             case CODING_ANNOTATE_COMPOSITION_MASK:
2623               /* Not yet implemented.  */
2624               break;
2625             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF now points at the mask element,
                      and only -C - 1 elements are skipped below; confirm
                      against the annotation layout that index 3 (rather
                      than 2) is where the charset ID lives.  */
2626               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that this coding system
                      does not list.  */
2627               if (preferred_charset_id >= 0
2628                   && NILP (Fmemq (make_number (preferred_charset_id),
2629                                   charset_list)))
2630                 preferred_charset_id = -1;
2631               break;
2632             default:
2633               emacs_abort ();
2634             }
               /* Skip the remaining elements of the annotation.  */
2635           charbuf += -c - 1;
2636           continue;
2637         }
2638
2639       if (ASCII_CHAR_P (c))
2640         EMIT_ONE_ASCII_BYTE (c);
2641       else if (CHAR_BYTE8_P (c))
2642         {
               /* A raw 8-bit byte is written back as that byte.  */
2643           c = CHAR_TO_BYTE8 (c);
2644           EMIT_ONE_BYTE (c);
2645         }
2646       else
2647         {
2648           struct charset *charset;
2649           unsigned code;
2650           int dimension;
2651           int emacs_mule_id;
2652           unsigned char leading_codes[2];
2653
               /* Pick the charset and code point for C: try the preferred
                  charset first, then fall back to searching
                  CHARSET_LIST.  */
2654           if (preferred_charset_id >= 0)
2655             {
2656               bool result;
2657
2658               charset = CHARSET_FROM_ID (preferred_charset_id);
2659               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2660               if (result)
2661                 code = ENCODE_CHAR (charset, c);
2662               else
2663                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2664                                      &code, charset);
2665             }
2666           else
2667             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2668                                  &code, charset);
               /* No charset was found for C: substitute the coding
                  system's default character.  */
2669           if (! charset)
2670             {
2671               c = coding->default_char;
2672               if (ASCII_CHAR_P (c))
2673                 {
2674                   EMIT_ONE_ASCII_BYTE (c);
2675                   continue;
2676                 }
                   /* NOTE(review): CHARSET is used below without being
                      tested again; this presumably relies on the default
                      character always being encodable -- confirm.  */
2677               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2678                                    &code, charset);
2679             }
2680           dimension = CHARSET_DIMENSION (charset);
2681           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2682           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2683           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is only one.  */
2684           if (leading_codes[1])
2685             EMIT_ONE_BYTE (leading_codes[1]);
               /* Code bytes are written with their high bit set.  */
2686           if (dimension == 1)
2687             EMIT_ONE_BYTE (code | 0x80);
2688           else
2689             {
2690               code |= 0x8080;
2691               EMIT_ONE_BYTE (code >> 8);
2692               EMIT_ONE_BYTE (code & 0xFF);
2693             }
2694         }
2695     }
2696   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2697   coding->produced_char += produced_chars;
2698   coding->produced = dst - coding->destination;
2699   return 0;
2700 }
2701
2702 \f
2703 /*** 7. ISO2022 handlers ***/
2704
2705 /* The following note describes the coding system ISO2022 briefly.
2706    Since the intention of this note is to help understand the
2707    functions in this file, some parts are NOT ACCURATE or are OVERLY
2708    SIMPLIFIED.  For thorough understanding, please refer to the
2709    original document of ISO2022.  This is equivalent to the standard
2710    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2711
2712    ISO2022 provides many mechanisms to encode several character sets
2713    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2714    is encoded using bytes less than 128.  This may make the encoded
2715    text a little bit longer, but the text passes more easily through
2716    several types of gateway, some of which strip off the MSB (Most
2717    Significant Bit).
2718
2719    There are two kinds of character sets: control character sets and
2720    graphic character sets.  The former contain control characters such
2721    as `newline' and `escape' to provide control functions (control
2722    functions are also provided by escape sequences).  The latter
2723    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2724    two control character sets and many graphic character sets.
2725
2726    Graphic character sets are classified into one of the following
2727    four classes, according to the number of bytes (DIMENSION) and
2728    number of characters in one dimension (CHARS) of the set:
2729    - DIMENSION1_CHARS94
2730    - DIMENSION1_CHARS96
2731    - DIMENSION2_CHARS94
2732    - DIMENSION2_CHARS96
2733
2734    In addition, each character set is assigned an identification tag,
2735    unique for each set, called the "final character" (denoted as <F>
2736    hereafter).  The <F> of each character set is decided by ECMA(*)
2737    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2738    (0x30..0x3F are for private use only).
2739
2740    Note (*): ECMA = European Computer Manufacturers Association
2741
2742    Here are examples of graphic character sets [NAME(<F>)]:
2743         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2744         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2745         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2746         o DIMENSION2_CHARS96 -- none for the moment
2747
2748    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2749         C0 [0x00..0x1F] -- control character plane 0
2750         GL [0x20..0x7F] -- graphic character plane 0
2751         C1 [0x80..0x9F] -- control character plane 1
2752         GR [0xA0..0xFF] -- graphic character plane 1
2753
2754    A control character set is directly designated and invoked to C0 or
2755    C1 by an escape sequence.  The most common case is that:
2756    - ISO646's  control character set is designated/invoked to C0, and
2757    - ISO6429's control character set is designated/invoked to C1,
2758    and usually these designations/invocations are omitted in encoded
2759    text.  In a 7-bit environment, only C0 can be used, and a control
2760    character for C1 is encoded by an appropriate escape sequence to
2761    fit into the environment.  All control characters for C1 are
2762    defined to have corresponding escape sequences.
2763
2764    A graphic character set is at first designated to one of four
2765    graphic registers (G0 through G3), then these graphic registers are
2766    invoked to GL or GR.  These designations and invocations can be
2767    done independently.  The most common case is that G0 is invoked to
2768    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2769    these invocations and designations are omitted in encoded text.
2770    In a 7-bit environment, only GL can be used.
2771
2772    When a graphic character set of CHARS94 is invoked to GL, codes
2773    0x20 and 0x7F of the GL area work as control characters SPACE and
2774    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2775    be used.
2776
2777    There are two ways of invocation: locking-shift and single-shift.
2778    With locking-shift, the invocation lasts until the next different
2779    invocation, whereas with single-shift, the invocation affects the
2780    following character only and doesn't affect the locking-shift
2781    state.  Invocations are done by the following control characters or
2782    escape sequences:
2783
2784    ----------------------------------------------------------------------
2785    abbrev  function                  cntrl escape seq   description
2786    ----------------------------------------------------------------------
2787    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2788    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2789    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2790    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2791    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2792    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2793    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2794    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2795    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2796    ----------------------------------------------------------------------
2797    (*) These are not used by any known coding system.
2798
2799    Control characters for these functions are defined by macros
2800    ISO_CODE_XXX in `coding.h'.
2801
2802    Designations are done by the following escape sequences:
2803    ----------------------------------------------------------------------
2804    escape sequence      description
2805    ----------------------------------------------------------------------
2806    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2807    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2808    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2809    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2810    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2811    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2812    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2813    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2814    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2815    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2816    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2817    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2818    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2819    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2820    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2821    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2822    ----------------------------------------------------------------------
2823
2824    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2825    of dimension 1, chars 94, and final character <F>, etc...
2826
2827    Note (*): Although these designations are not allowed in ISO2022,
2828    Emacs accepts them on decoding, and produces them on encoding
2829    CHARS96 character sets in a coding system which is characterized as
2830    7-bit environment, non-locking-shift, and non-single-shift.
2831
2832    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2833    '(' must be omitted.  We refer to this as "short-form" hereafter.
2834
2835    Now you may notice that there are a lot of ways of encoding the
2836    same multilingual text in ISO2022.  Actually, there exist many
2837    coding systems such as Compound Text (used in X11's inter-client
2838    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2839    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2840    localized platforms), and all of these are variants of ISO2022.
2841
2842    In addition to the above, Emacs handles two more kinds of escape
2843    sequences: ISO6429's direction specification and Emacs' private
2844    sequence for specifying character composition.
2845
2846    ISO6429's direction specification takes the following form:
2847         o CSI ']'      -- end of the current direction
2848         o CSI '0' ']'  -- end of the current direction
2849         o CSI '1' ']'  -- start of left-to-right text
2850         o CSI '2' ']'  -- start of right-to-left text
2851    The control character CSI (0x9B: control sequence introducer) is
2852    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2853
2854    Character composition specification takes the following form:
2855         o ESC '0' -- start relative composition
2856         o ESC '1' -- end composition
2857         o ESC '2' -- start rule-base composition (*)
2858         o ESC '3' -- start relative composition with alternate chars  (**)
2859         o ESC '4' -- start rule-base composition with alternate chars  (**)
2860   Since these are not standard escape sequences of any ISO standard,
2861   the use of them with these meanings is restricted to Emacs only.
2862
2863   (*) This form is used only in Emacs 20.7 and older versions,
2864   but newer versions can safely decode it.
2865   (**) This form is used only in Emacs 21.1 and newer versions,
2866   and older versions can't decode it.
2867
2868   Here's a list of example usages of these composition escape
2869   sequences (categorized by `enum composition_method').
2870
2871   COMPOSITION_RELATIVE:
2872         ESC 0 CHAR [ CHAR ] ESC 1
2873   COMPOSITION_WITH_RULE:
2874         ESC 2 CHAR [ RULE CHAR ] ESC 1
2875   COMPOSITION_WITH_ALTCHARS:
2876         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2877   COMPOSITION_WITH_RULE_ALTCHARS:
2878         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2879
     /* Table of 256 `enum iso_code_class_type' entries.
        NOTE(review): presumably indexed by byte value and filled in
        elsewhere in this file -- confirm.  */
2880 static enum iso_code_class_type iso_code_class[256];
2881
/* Return nonzero if the charset whose ID is ID is marked safe in
   CODING: ID must lie in the range 0..CODING->max_charset_id and its
   byte in CODING->safe_charsets must not be 255 (the filler that
   setup_iso_safe_charsets stores for charsets without a register).

   A negative ID is never safe.  The explicit lower-bound test matters
   because detect_coding_iso_2022 passes the table entry for the short
   form `ESC $ @/A/B' here without testing it for being negative;
   without the test such an ID would index before the start of the
   safe_charsets array.

   ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2885
2886 static void
2887 setup_iso_safe_charsets (Lisp_Object attrs)
2888 {
2889   Lisp_Object charset_list, safe_charsets;
2890   Lisp_Object request;
2891   Lisp_Object reg_usage;
2892   Lisp_Object tail;
2893   EMACS_INT reg94, reg96;
2894   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2895   int max_charset_id;
2896
2897   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2898   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2899       && ! EQ (charset_list, Viso_2022_charset_list))
2900     {
2901       charset_list = Viso_2022_charset_list;
2902       ASET (attrs, coding_attr_charset_list, charset_list);
2903       ASET (attrs, coding_attr_safe_charsets, Qnil);
2904     }
2905
2906   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2907     return;
2908
2909   max_charset_id = 0;
2910   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2911     {
2912       int id = XINT (XCAR (tail));
2913       if (max_charset_id < id)
2914         max_charset_id = id;
2915     }
2916
2917   safe_charsets = make_uninit_string (max_charset_id + 1);
2918   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2919   request = AREF (attrs, coding_attr_iso_request);
2920   reg_usage = AREF (attrs, coding_attr_iso_usage);
2921   reg94 = XINT (XCAR (reg_usage));
2922   reg96 = XINT (XCDR (reg_usage));
2923
2924   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2925     {
2926       Lisp_Object id;
2927       Lisp_Object reg;
2928       struct charset *charset;
2929
2930       id = XCAR (tail);
2931       charset = CHARSET_FROM_ID (XINT (id));
2932       reg = Fcdr (Fassq (id, request));
2933       if (! NILP (reg))
2934         SSET (safe_charsets, XINT (id), XINT (reg));
2935       else if (charset->iso_chars_96)
2936         {
2937           if (reg96 < 4)
2938             SSET (safe_charsets, XINT (id), reg96);
2939         }
2940       else
2941         {
2942           if (reg94 < 4)
2943             SSET (safe_charsets, XINT (id), reg94);
2944         }
2945     }
2946   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2947 }
2948
2949
2950 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2951    Return true if a text is encoded in one of ISO-2022 based coding
2952    systems.
        The bytes of CODING->source are scanned after skipping the first
        CODING->head_ascii of them.  Category masks that the text rules out
        accumulate in REJECTED, those it positively indicates in FOUND.
        If every ISO category ends up rejected, CATEGORY_MASK_ISO is added
        to DETECT_INFO->rejected and 0 is returned.  Otherwise, at the
        `no_more_source' label, REJECTED and the not-rejected part of FOUND
        are merged into DETECT_INFO and 1 is returned.
        NOTE(review): `no_more_source' has no explicit `goto' in this
        function; it is presumably reached from ONE_MORE_BYTE when the
        source is exhausted -- confirm.  */
2953
2954 static bool
2955 detect_coding_iso_2022 (struct coding_system *coding,
2956                         struct coding_detection_info *detect_info)
2957 {
2958   const unsigned char *src = coding->source, *src_base = src;
2959   const unsigned char *src_end = coding->source + coding->src_bytes;
2960   bool multibytep = coding->src_multibyte;
2961   bool single_shifting = 0;
2962   int id;
2963   int c, c1;
2964   ptrdiff_t consumed_chars = 0;
2965   int i;
2966   int rejected = 0;
2967   int found = 0;
       /* -1 while not inside a composition; otherwise the number of
          components counted since the composition-start escape.  */
2968   int composition_count = -1;
2969
2970   detect_info->checked |= CATEGORY_MASK_ISO;
2971
       /* Refresh the cached safe-charset table of every defined ISO
          category from its attribute vector.  */
2972   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2973     {
2974       struct coding_system *this = &(coding_categories[i]);
2975       Lisp_Object attrs, val;
2976
2977       if (this->id < 0)
2978         continue;
2979       attrs = CODING_ID_ATTRS (this->id);
2980       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2981           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2982         setup_iso_safe_charsets (attrs);
2983       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2984       this->max_charset_id = SCHARS (val) - 1;
2985       this->safe_charsets = SDATA (val);
2986     }
2987
2988   /* A coding system of this category is always ASCII compatible.  */
2989   src += coding->head_ascii;
2990
2991   while (rejected != CATEGORY_MASK_ISO)
2992     {
2993       src_base = src;
2994       ONE_MORE_BYTE (c);
2995       switch (c)
2996         {
2997         case ISO_CODE_ESC:
2998           if (inhibit_iso_escape_detection)
2999             break;
3000           single_shifting = 0;
3001           ONE_MORE_BYTE (c);
3002           if (c == 'N' || c == 'O')
3003             {
3004               /* ESC <Fe> for SS2 or SS3.  */
3005               single_shifting = 1;
3006               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3007             }
3008           else if (c == '1')
3009             {
3010               /* End of composition.  */
3011               if (composition_count < 0
3012                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3013                 /* Invalid */
3014                 break;
3015               composition_count = -1;
3016               found |= CATEGORY_MASK_ISO;
3017             }
3018           else if (c >= '0' && c <= '4')
3019             {
3020               /* ESC <Fp> for start of composition (ESC 0, 2, 3 or 4;
                      ESC 1 was handled above).  */
3021               composition_count = 0;
3022             }
3023           else
3024             {
3025               if (c >= '(' && c <= '/')
3026                 {
3027                   /* Designation sequence for a charset of dimension 1.  */
3028                   ONE_MORE_BYTE (c1);
3029                   if (c1 < ' ' || c1 >= 0x80
3030                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3031                     {
3032                       /* Invalid designation sequence.  Just ignore.  */
3033                       if (c1 >= 0x80)
3034                         rejected |= (CATEGORY_MASK_ISO_7BIT
3035                                      | CATEGORY_MASK_ISO_7_ELSE);
3036                       break;
3037                     }
3038                 }
3039               else if (c == '$')
3040                 {
3041                   /* Designation sequence for a charset of dimension 2.  */
3042                   ONE_MORE_BYTE (c);
3043                   if (c >= '@' && c <= 'B')
3044                     /* Designation for JISX0208.1978, GB2312, or JISX0208.
                            NOTE(review): unlike the long-form branches, ID is
                            not tested for being negative here before it is
                            passed to SAFE_CHARSET_P below -- confirm that
                            this is safe.  */
3045                     id = iso_charset_table[1][0][c];
3046                   else if (c >= '(' && c <= '/')
3047                     {
3048                       ONE_MORE_BYTE (c1);
3049                       if (c1 < ' ' || c1 >= 0x80
3050                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3051                         {
3052                           /* Invalid designation sequence.  Just ignore.  */
3053                           if (c1 >= 0x80)
3054                             rejected |= (CATEGORY_MASK_ISO_7BIT
3055                                          | CATEGORY_MASK_ISO_7_ELSE);
3056                           break;
3057                         }
3058                     }
3059                   else
3060                     {
3061                       /* Invalid designation sequence.  Just ignore it.  */
3062                       if (c >= 0x80)
3063                         rejected |= (CATEGORY_MASK_ISO_7BIT
3064                                      | CATEGORY_MASK_ISO_7_ELSE);
3065                       break;
3066                     }
3067                 }
3068               else
3069                 {
3070                   /* Invalid escape sequence.  Just ignore it.  */
3071                   if (c >= 0x80)
3072                     rejected |= (CATEGORY_MASK_ISO_7BIT
3073                                  | CATEGORY_MASK_ISO_7_ELSE);
3074                   break;
3075                 }
3076
3077               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found if the designated charset is safe for it, and
                      rejected otherwise.  */
3078               rejected |= CATEGORY_MASK_ISO_8BIT;
3079               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3080                                   id))
3081                 found |= CATEGORY_MASK_ISO_7;
3082               else
3083                 rejected |= CATEGORY_MASK_ISO_7;
3084               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3085                                   id))
3086                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3087               else
3088                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3089               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3090                                   id))
3091                 found |= CATEGORY_MASK_ISO_7_ELSE;
3092               else
3093                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3094               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3095                                   id))
3096                 found |= CATEGORY_MASK_ISO_8_ELSE;
3097               else
3098                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3099             }
3100           break;
3101
3102         case ISO_CODE_SO:
3103         case ISO_CODE_SI:
3104           /* Locking shift out/in.  */
3105           if (inhibit_iso_escape_detection)
3106             break;
3107           single_shifting = 0;
3108           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3109           break;
3110
3111         case ISO_CODE_CSI:
3112           /* Control sequence introducer.  */
3113           single_shifting = 0;
3114           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3115           found |= CATEGORY_MASK_ISO_8_ELSE;
3116           goto check_extra_latin;
3117
3118         case ISO_CODE_SS2:
3119         case ISO_CODE_SS3:
3120           /* Single shift.   */
3121           if (inhibit_iso_escape_detection)
3122             break;
3123           single_shifting = 0;
3124           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3125           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3126               & CODING_ISO_FLAG_SINGLE_SHIFT)
3127             {
3128               found |= CATEGORY_MASK_ISO_8_1;
3129               single_shifting = 1;
3130             }
3131           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3132               & CODING_ISO_FLAG_SINGLE_SHIFT)
3133             {
3134               found |= CATEGORY_MASK_ISO_8_2;
3135               single_shifting = 1;
3136             }
3137           if (single_shifting)
3138             break;
               /* Neither 8-bit category uses single shift, so treat the
                  byte as a possible Latin extra code.  */
3139           goto check_extra_latin;
3140
3141         default:
3142           if (c < 0)
3143             continue;
3144           if (c < 0x80)
3145             {
3146               if (composition_count >= 0)
3147                 composition_count++;
3148               single_shifting = 0;
3149               break;
3150             }
3151           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3152           if (c >= 0xA0)
3153             {
3154               found |= CATEGORY_MASK_ISO_8_1;
3155               /* Check the length of succeeding codes of the range
3156                  0xA0..0xFF.  If the byte length is even, we include
3157                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3158                  only when we are not single shifting.  */
3159               if (! single_shifting
3160                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3161                 {
3162                   ptrdiff_t len = 1;
3163                   while (src < src_end)
3164                     {
3165                       src_base = src;
3166                       ONE_MORE_BYTE (c);
3167                       if (c < 0xA0)
3168                         {
                               /* Not part of the run: un-read the byte so
                                  the outer loop sees it again.  */
3169                           src = src_base;
3170                           break;
3171                         }
3172                       len++;
3173                     }
3174
                       /* An odd run that did not end at the end of the
                          source rules out the two-byte category.  */
3175                   if (len & 1 && src < src_end)
3176                     {
3177                       rejected |= CATEGORY_MASK_ISO_8_2;
3178                       if (composition_count >= 0)
3179                         composition_count += len;
3180                     }
3181                   else
3182                     {
3183                       found |= CATEGORY_MASK_ISO_8_2;
3184                       if (composition_count >= 0)
3185                         composition_count += len / 2;
3186                     }
3187                 }
3188               break;
3189             }
               /* Reached by falling through for the remaining bytes in
                  0x80..0x9F, and by `goto' from the CSI and single-shift
                  cases above.  The byte is acceptable only if
                  Vlatin_extra_code_table is a vector with a non-nil entry
                  for it; otherwise every ISO category is rejected.  */
3190         check_extra_latin:
3191           if (! VECTORP (Vlatin_extra_code_table)
3192               || NILP (AREF (Vlatin_extra_code_table, c)))
3193             {
3194               rejected = CATEGORY_MASK_ISO;
3195               break;
3196             }
3197           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3198               & CODING_ISO_FLAG_LATIN_EXTRA)
3199             found |= CATEGORY_MASK_ISO_8_1;
3200           else
3201             rejected |= CATEGORY_MASK_ISO_8_1;
3202           rejected |= CATEGORY_MASK_ISO_8_2;
3203           break;
3204         }
3205     }
       /* The loop ended because every ISO category was rejected.  */
3206   detect_info->rejected |= CATEGORY_MASK_ISO;
3207   return 0;
3208
3209  no_more_source:
       /* Report what was rejected, and what was found and not rejected.  */
3210   detect_info->rejected |= rejected;
3211   detect_info->found |= (found & ~rejected);
3212   return 1;
3213 }
3214
3215
3216 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3217    escape sequence should be kept.

        FINAL is accepted only when it lies in '0'..127 and
        ISO_CHARSET_TABLE (DIM, CHARS_96, FINAL) yields a non-negative
        charset ID for which SAFE_CHARSET_P (coding, ID) holds.
        Otherwise the designation of REG is set to -2, CHARS_96 is set
        to -1, and the rest of the macro is skipped.

        On success the ID is stored as the designation of REG, after
        replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        The macro reads `coding' from the expansion site, assigns to
        CHARS_96 (so it must be an lvalue), and evaluates its arguments
        unparenthesized and more than once; pass only simple
        variables.  */
3218 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3219   do {                                                                  \
3220     int id, prev;                                                       \
3221                                                                         \
3222     if (final < '0' || final >= 128                                     \
3223         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3224         || !SAFE_CHARSET_P (coding, id))                                \
3225       {                                                                 \
3226         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3227         chars_96 = -1;                                                  \
3228         break;                                                          \
3229       }                                                                 \
3230     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3231     if (id == charset_jisx0201_roman)                                   \
3232       {                                                                 \
3233         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3234           id = charset_ascii;                                           \
3235       }                                                                 \
3236     else if (id == charset_jisx0208_1978)                               \
3237       {                                                                 \
3238         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3239           id = charset_jisx0208;                                        \
3240       }                                                                 \
3241     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3242     /* If there was an invalid designation to REG previously, and this  \
3243        designation is ASCII to REG, we should keep this designation     \
3244        sequence.  */                                                    \
3245     if (prev == -2 && id == charset_ascii)                              \
3246       chars_96 = -1;                                                    \
3247   } while (0)
3248
3249
3250 /* Handle these composition sequences (ALT: alternate char):
3251
3252    (1) relative composition: ESC 0 CHAR ... ESC 1
3253    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3254    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3255    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3256
3257    When the start sequence (ESC 0/2/3/4) is found, this annotation
3258    header is produced.
3259
3260         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3261
3262    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3263    produced until the end sequence (ESC 1) is found:
3264
3265    (1) CHAR ... CHAR
3266    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3267    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3268    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3269
3270    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3271    annotation header are updated as below:
3272
3273    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3274    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3276    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3277
3278    If an error is found while composing, the annotation header is
3279    changed to:
3280
3281         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3282
3283    and the sequence [ -2 DECODED-RULE ] is changed to the original
3284    byte sequence as below:
3285         o the original byte sequence is B: [ B -1 ]
3286         o the original byte sequence is B1 B2: [ B1 B2 ]
3287    and the sequence [ -1 -1 ] is changed to the original byte
3288    sequence:
3289         [ ESC '0' ]
3290 */
3291
3292 /* Decode a composition rule C1 and maybe one more byte from the
3293    source, and set RULE to the encoded composition rule.  If the rule
3294    is invalid, goto invalid_code.

        C1 is not a macro argument; it is read from the expansion site,
        which must also provide the label invalid_code.

        C1 - 32 < 0 is invalid.  C1 - 32 in 0..80 is the old one-byte
        format: the value is split into GREF = value / 9 and
        NREF = value % 9, a reference of 4 is mapped to 10, and the two
        are combined by COMPOSITION_ENCODE_RULE.  A larger value is the
        new two-byte format: a second byte B is fetched with
        ONE_MORE_BYTE, the pair (C1 - 32 - 81, B - 32) is checked with
        COMPOSITION_ENCODE_RULE_VALID and combined by
        COMPOSITION_ENCODE_RULE, and 0x100 is added so that the result
        can be told apart from an old-format rule.  */
3295
3296 #define DECODE_COMPOSITION_RULE(rule)                                   \
3297   do {                                                                  \
3298     rule = c1 - 32;                                                     \
3299     if (rule < 0)                                                       \
3300       goto invalid_code;                                                \
3301     if (rule < 81)              /* old format (before ver.21) */        \
3302       {                                                                 \
3303         int gref = (rule) / 9;                                          \
3304         int nref = (rule) % 9;                                          \
3305         if (gref == 4) gref = 10;                                       \
3306         if (nref == 4) nref = 10;                                       \
3307         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3308       }                                                                 \
3309     else                        /* new format (after ver.21) */         \
3310       {                                                                 \
3311         int b;                                                          \
3312                                                                         \
3313         ONE_MORE_BYTE (b);                                              \
3314         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3315           goto invalid_code;                                            \
3316         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3317         rule += 0x100;   /* Distinguish it from the old format.  */     \
3318       }                                                                 \
3319   } while (0)
3320
     /* Turn the decoded composition rule RULE back into the byte
        sequence it was decoded from, storing the result in charbuf[idx]
        and charbuf[idx + 1] and counting the bytes in new_chars.  The
        names charbuf, idx and new_chars are taken from the expansion
        site.

        RULE % 0x100 is split into GREF = value / 12 and
        NREF = value % 12.

        RULE < 0x100 (old format): one byte, 32 + GREF * 9 + NREF, with a
        reference of 10 mapped back to 4; charbuf[idx + 1] is set to -1
        and new_chars grows by 1.

        Otherwise (new format): two bytes, 32 + 81 + GREF and 32 + NREF;
        new_chars grows by 2.

        RULE is evaluated several times, but GREF, NREF and the format
        test are all computed before charbuf is written, so RULE may be
        charbuf[idx + 1] itself, as in finish_composition.  */
3321 #define ENCODE_COMPOSITION_RULE(rule)                           \
3322   do {                                                          \
3323     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3324                                                                 \
3325     if (rule < 0x100)           /* old format */                \
3326       {                                                         \
3327         if (gref == 10) gref = 4;                               \
3328         if (nref == 10) nref = 4;                               \
3329         charbuf[idx] = 32 + gref * 9 + nref;                    \
3330         charbuf[idx + 1] = -1;                                  \
3331         new_chars++;                                            \
3332       }                                                         \
3333     else                                /* new format */        \
3334       {                                                         \
3335         charbuf[idx] = 32 + 81 + gref;                          \
3336         charbuf[idx + 1] = 32 + nref;                           \
3337         new_chars += 2;                                         \
3338       }                                                         \
3339   } while (0)
3340
3341 /* Finish the current composition as invalid.

        CHARBUF points just past the elements produced so far for the
        composition; its annotation header starts at
        CHARBUF[-CMP_STATUS->length].  The five header slots are
        overwritten with ISO_CODE_ESC, the digit of the start sequence
        ('0', '2', '3' or '4', chosen from CMP_STATUS->method), -2, 0
        and -1.  For methods >= COMPOSITION_WITH_RULE the remaining
        slots are then scanned: each [ -2 RULE ] pair is rewritten in
        place by ENCODE_COMPOSITION_RULE and each [ -1 -1 ] pair becomes
        [ ISO_CODE_ESC '0' ].

        Set CMP_STATUS->state to COMPOSING_NO.  Return
        CMP_STATUS->nchars plus the counts added for the rewritten
        pairs (1 or 2 per rule, 2 per [ -1 -1 ] pair).

        The local names charbuf, idx and new_chars are used by name
        inside ENCODE_COMPOSITION_RULE and must not be renamed.  */
3342
3343 static int
3344 finish_composition (int *charbuf, struct composition_status *cmp_status)
3345 {
3346   int idx = - cmp_status->length;
3347   int new_chars;
3348
3349   /* Recover the original ESC sequence */
3350   charbuf[idx++] = ISO_CODE_ESC;
3351   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3352                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3353                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3354                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3355                     : '4');
       /* Fill the three remaining header slots.  NOTE(review): how the
          consumer of charbuf interprets -2 0 -1 is not visible in this
          function.  */
3356   charbuf[idx++] = -2;
3357   charbuf[idx++] = 0;
3358   charbuf[idx++] = -1;
3359   new_chars = cmp_status->nchars;
3360   if (cmp_status->method >= COMPOSITION_WITH_RULE)
         /* Presumably this excludes only COMPOSITION_RELATIVE, which
            stores no -2 / -1 markers; the enum order is not visible
            here -- TODO confirm.  IDX now points just past the header,
            so the header's own -2 and -1 are not scanned.  */
3361     for (; idx < 0; idx++)
3362       {
3363         int elt = charbuf[idx];
3364
3365         if (elt == -2)
3366           {
                 /* [ -2 RULE ]: the macro overwrites both slots and
                    updates new_chars; the extra increment skips the
                    second slot.  */
3367             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3368             idx++;
3369           }
3370         else if (elt == -1)
3371           {
                 /* First slot of a [ -1 -1 ] pair: replace the pair by
                    ESC '0'.  */
3372             charbuf[idx++] = ISO_CODE_ESC;
3373             charbuf[idx] = '0';
3374             new_chars += 2;
3375           }
3376       }
3377   cmp_status->state = COMPOSING_NO;
3378   return new_chars;
3379 }
3380
3381 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition (which finishes the composition as invalid)
        and add its return value to char_offset.  cmp_status, charbuf
        and char_offset are taken from the expansion site.  */
3382 #define MAYBE_FINISH_COMPOSITION()                              \
3383   do {                                                          \
3384     if (cmp_status->state != COMPOSING_NO)                      \
3385       char_offset += finish_composition (charbuf, cmp_status);  \
3386   } while (0)
3387
3388 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3389
3390    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3391    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3392    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3393    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3394
3395    Produce this annotation sequence now:
3396
3397    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        (ADD_COMPOSITION_DATA is called with 0, 0 and the method, and
        cmp_status->length is set to MAX_ANNOTATION_LENGTH; presumably
        that is 5, since finish_composition rewrites five header slots
        -- confirm against the definition.)

        Exception: ESC 0 seen while the components of an altchar
        composition are being read (state COMPOSING_COMPONENT_CHAR with
        COMPOSITION_WITH_ALTCHARS, or state COMPOSING_COMPONENT_RULE
        with COMPOSITION_WITH_RULE_ALTCHARS) does not start a new
        composition.  It stores the separator [ -1 -1 ], switches the
        state to COMPOSING_CHAR and adds 2 to cmp_status->length.

        In every other case a composition still in progress is first
        finished by MAYBE_FINISH_COMPOSITION.  The new state is
        COMPOSING_CHAR for C1 <= '2' and COMPOSING_COMPONENT_CHAR
        otherwise; nchars and ncomps are reset to 0 and
        coding->annotated is set.

        The macro uses cmp_status, charbuf, char_offset and coding from
        the expansion site, and evaluates C1 several times.
3398 */
3399
3400 #define DECODE_COMPOSITION_START(c1)                                       \
3401   do {                                                                     \
3402     if (c1 == '0'                                                          \
3403         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3404              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3405             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3406                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3407       {                                                                    \
3408         *charbuf++ = -1;                                                   \
3409         *charbuf++= -1;                                                    \
3410         cmp_status->state = COMPOSING_CHAR;                                \
3411         cmp_status->length += 2;                                           \
3412       }                                                                    \
3413     else                                                                   \
3414       {                                                                    \
3415         MAYBE_FINISH_COMPOSITION ();                                       \
3416         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3417                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3418                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3419                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3420         cmp_status->state                                                  \
3421           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3422         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3423         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3424         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3425         coding->annotated = 1;                                             \
3426       }                                                                    \
3427   } while (0)
3428
3429
3430 /* Handle composition end sequence ESC 1.

        The sequence is invalid when no CHAR has been stored yet
        (nchars == 0), when a COMPOSITION_WITH_RULE composition is in
        state COMPOSING_CHAR, or when a composition of any other method
        is not in state COMPOSING_CHAR.  In that case the composition is
        finished by MAYBE_FINISH_COMPOSITION and control jumps to the
        label invalid_code of the expansion site.

        Otherwise the annotation header at charbuf[-cmp_status->length]
        is completed: its first slot (the negated LENGTH) is decreased
        by ncomps + 2 for COMPOSITION_WITH_ALTCHARS or by ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS, the slot at header + 2 (NCHARS)
        is set to nchars, char_offset advances by nchars, and the state
        becomes COMPOSING_NO.

        cmp_status, charbuf and char_offset are taken from the
        expansion site.  */
3431
3432 #define DECODE_COMPOSITION_END()                                        \
3433   do {                                                                  \
3434     if (cmp_status->nchars == 0                                         \
3435         || ((cmp_status->state == COMPOSING_CHAR)                       \
3436             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3437       {                                                                 \
3438         MAYBE_FINISH_COMPOSITION ();                                    \
3439         goto invalid_code;                                              \
3440       }                                                                 \
3441     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3442       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3443     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3444       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3445     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3446     char_offset += cmp_status->nchars;                                  \
3447     cmp_status->state = COMPOSING_NO;                                   \
3448   } while (0)
3449
3450 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The rule is stored as the pair [ -2 RULE ], cmp_status->length
        grows by 2, and cmp_status->state is decremented; presumably
        that steps from COMPOSING_RULE or COMPOSING_COMPONENT_RULE back
        to the corresponding CHAR state (enum order not visible here --
        TODO confirm).  charbuf and cmp_status are taken from the
        expansion site.  */
3451
3452 #define STORE_COMPOSITION_RULE(rule)    \
3453   do {                                  \
3454     *charbuf++ = -2;                    \
3455     *charbuf++ = rule;                  \
3456     cmp_status->length += 2;            \
3457     cmp_status->state--;                \
3458   } while (0)
3459
3460 /* Store a composed char or a component char C in charbuf, and update
3461    cmp_status.

        cmp_status->length grows by 1.  In state COMPOSING_CHAR the char
        is counted in nchars, in any other state in ncomps.  The state
        is then incremented for COMPOSITION_WITH_RULE, and for
        COMPOSITION_WITH_RULE_ALTCHARS while in state
        COMPOSING_COMPONENT_CHAR; presumably that moves to the matching
        RULE state so that a rule is expected next (enum order not
        visible here -- TODO confirm).  charbuf and cmp_status are taken
        from the expansion site.  */
3462
3463 #define STORE_COMPOSITION_CHAR(c)                                       \
3464   do {                                                                  \
3465     *charbuf++ = (c);                                                   \
3466     cmp_status->length++;                                               \
3467     if (cmp_status->state == COMPOSING_CHAR)                            \
3468       cmp_status->nchars++;                                             \
3469     else                                                                \
3470       cmp_status->ncomps++;                                             \
3471     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3472         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3473             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3474       cmp_status->state++;                                              \
3475   } while (0)
3476
3477
3478 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3479
3480 static void
3481 decode_coding_iso_2022 (struct coding_system *coding)
3482 {
3483   const unsigned char *src = coding->source + coding->consumed;
3484   const unsigned char *src_end = coding->source + coding->src_bytes;
3485   const unsigned char *src_base;
3486   int *charbuf = coding->charbuf + coding->charbuf_used;
3487   /* We may produce two annotations (charset and composition) in one
3488      loop and one more charset annotation at the end.  */
3489   int *charbuf_end
3490     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3491   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3492   bool multibytep = coding->src_multibyte;
3493   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3494   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3495   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3496   int charset_id_2, charset_id_3;
3497   struct charset *charset;
3498   int c;
3499   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3500   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3501   ptrdiff_t char_offset = coding->produced_char;
3502   ptrdiff_t last_offset = char_offset;
3503   int last_id = charset_ascii;
3504   bool eol_dos
3505     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3506   int byte_after_cr = -1;
3507   int i;
3508
3509   setup_iso_safe_charsets (attrs);
3510   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3511
3512   if (cmp_status->state != COMPOSING_NO)
3513     {
3514       if (charbuf_end - charbuf < cmp_status->length)
3515         emacs_abort ();
3516       for (i = 0; i < cmp_status->length; i++)
3517         *charbuf++ = cmp_status->carryover[i];
3518       coding->annotated = 1;
3519     }
3520
3521   while (1)
3522     {
3523       int c1, c2, c3;
3524
3525       src_base = src;
3526       consumed_chars_base = consumed_chars;
3527
3528       if (charbuf >= charbuf_end)
3529         {
3530           if (byte_after_cr >= 0)
3531             src_base--;
3532           break;
3533         }
3534
3535       if (byte_after_cr >= 0)
3536         c1 = byte_after_cr, byte_after_cr = -1;
3537       else
3538         ONE_MORE_BYTE (c1);
3539       if (c1 < 0)
3540         goto invalid_code;
3541
3542       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3543         {
3544           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3545           char_offset++;
3546           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3547           continue;
3548         }
3549
3550       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3551         {
3552           if (c1 == ISO_CODE_ESC)
3553             {
3554               if (src + 1 >= src_end)
3555                 goto no_more_source;
3556               *charbuf++ = ISO_CODE_ESC;
3557               char_offset++;
3558               if (src[0] == '%' && src[1] == '@')
3559                 {
3560                   src += 2;
3561                   consumed_chars += 2;
3562                   char_offset += 2;
3563                   /* We are sure charbuf can contain two more chars. */
3564                   *charbuf++ = '%';
3565                   *charbuf++ = '@';
3566                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3567                 }
3568             }
3569           else
3570             {
3571               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3572               char_offset++;
3573             }
3574           continue;
3575         }
3576
3577       if ((cmp_status->state == COMPOSING_RULE
3578            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3579           && c1 != ISO_CODE_ESC)
3580         {
3581           int rule;
3582
3583           DECODE_COMPOSITION_RULE (rule);
3584           STORE_COMPOSITION_RULE (rule);
3585           continue;
3586         }
3587
3588       /* We produce at most one character.  */
3589       switch (iso_code_class [c1])
3590         {
3591         case ISO_0x20_or_0x7F:
3592           if (charset_id_0 < 0
3593               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3594             /* This is SPACE or DEL.  */
3595             charset = CHARSET_FROM_ID (charset_ascii);
3596           else
3597             charset = CHARSET_FROM_ID (charset_id_0);
3598           break;
3599
3600         case ISO_graphic_plane_0:
3601           if (charset_id_0 < 0)
3602             charset = CHARSET_FROM_ID (charset_ascii);
3603           else
3604             charset = CHARSET_FROM_ID (charset_id_0);
3605           break;
3606
3607         case ISO_0xA0_or_0xFF:
3608           if (charset_id_1 < 0
3609               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3610               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3611             goto invalid_code;
3612           /* This is a graphic character, we fall down ... */
3613
3614         case ISO_graphic_plane_1:
3615           if (charset_id_1 < 0)
3616             goto invalid_code;
3617           charset = CHARSET_FROM_ID (charset_id_1);
3618           break;
3619
3620         case ISO_control_0:
3621           if (eol_dos && c1 == '\r')
3622             ONE_MORE_BYTE (byte_after_cr);
3623           MAYBE_FINISH_COMPOSITION ();
3624           charset = CHARSET_FROM_ID (charset_ascii);
3625           break;
3626
3627         case ISO_control_1:
3628           goto invalid_code;
3629
3630         case ISO_shift_out:
3631           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3632               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3633             goto invalid_code;
3634           CODING_ISO_INVOCATION (coding, 0) = 1;
3635           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3636           continue;
3637
3638         case ISO_shift_in:
3639           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3640             goto invalid_code;
3641           CODING_ISO_INVOCATION (coding, 0) = 0;
3642           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3643           continue;
3644
3645         case ISO_single_shift_2_7:
3646           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3647             goto invalid_code;
3648         case ISO_single_shift_2:
3649           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3650             goto invalid_code;
3651           /* SS2 is handled as an escape sequence of ESC 'N' */
3652           c1 = 'N';
3653           goto label_escape_sequence;
3654
3655         case ISO_single_shift_3:
3656           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3657             goto invalid_code;
3658           /* SS3 is handled as an escape sequence of ESC 'O' */
3659           c1 = 'O';
3660           goto label_escape_sequence;
3661
3662         case ISO_control_sequence_introducer:
3663           /* CSI is handled as an escape sequence of ESC '[' ...  */
3664           c1 = '[';
3665           goto label_escape_sequence;
3666
3667         case ISO_escape:
3668           ONE_MORE_BYTE (c1);
3669         label_escape_sequence:
3670           /* Escape sequences handled here are invocation,
3671              designation, direction specification, and character
3672              composition specification.  */
3673           switch (c1)
3674             {
3675             case '&':           /* revision of following character set */
3676               ONE_MORE_BYTE (c1);
3677               if (!(c1 >= '@' && c1 <= '~'))
3678                 goto invalid_code;
3679               ONE_MORE_BYTE (c1);
3680               if (c1 != ISO_CODE_ESC)
3681                 goto invalid_code;
3682               ONE_MORE_BYTE (c1);
3683               goto label_escape_sequence;
3684
3685             case '$':           /* designation of 2-byte character set */
3686               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3687                 goto invalid_code;
3688               {
3689                 int reg, chars96;
3690
3691                 ONE_MORE_BYTE (c1);
3692                 if (c1 >= '@' && c1 <= 'B')
3693                   {     /* designation of JISX0208.1978, GB2312.1980,
3694                            or JISX0208.1980 */
3695                     reg = 0, chars96 = 0;
3696                   }
3697                 else if (c1 >= 0x28 && c1 <= 0x2B)
3698                   { /* designation of DIMENSION2_CHARS94 character set */
3699                     reg = c1 - 0x28, chars96 = 0;
3700                     ONE_MORE_BYTE (c1);
3701                   }
3702                 else if (c1 >= 0x2C && c1 <= 0x2F)
3703                   { /* designation of DIMENSION2_CHARS96 character set */
3704                     reg = c1 - 0x2C, chars96 = 1;
3705                     ONE_MORE_BYTE (c1);
3706                   }
3707                 else
3708                   goto invalid_code;
3709                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3710                 /* We must update these variables now.  */
3711                 if (reg == 0)
3712                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3713                 else if (reg == 1)
3714                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3715                 if (chars96 < 0)
3716                   goto invalid_code;
3717               }
3718               continue;
3719
3720             case 'n':           /* invocation of locking-shift-2 */
3721               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3722                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3723                 goto invalid_code;
3724               CODING_ISO_INVOCATION (coding, 0) = 2;
3725               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3726               continue;
3727
3728             case 'o':           /* invocation of locking-shift-3 */
3729               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3730                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3731                 goto invalid_code;
3732               CODING_ISO_INVOCATION (coding, 0) = 3;
3733               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3734               continue;
3735
3736             case 'N':           /* invocation of single-shift-2 */
3737               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3738                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3739                 goto invalid_code;
3740               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3741               if (charset_id_2 < 0)
3742                 charset = CHARSET_FROM_ID (charset_ascii);
3743               else
3744                 charset = CHARSET_FROM_ID (charset_id_2);
3745               ONE_MORE_BYTE (c1);
3746               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3747                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3748                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3749                           ? c1 >= 0x80 : c1 < 0x80)))
3750                 goto invalid_code;
3751               break;
3752
3753             case 'O':           /* invocation of single-shift-3 */
3754               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3755                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3756                 goto invalid_code;
3757               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3758               if (charset_id_3 < 0)
3759                 charset = CHARSET_FROM_ID (charset_ascii);
3760               else
3761                 charset = CHARSET_FROM_ID (charset_id_3);
3762               ONE_MORE_BYTE (c1);
3763               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3764                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3765                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3766                           ? c1 >= 0x80 : c1 < 0x80)))
3767                 goto invalid_code;
3768               break;
3769
3770             case '0': case '2': case '3': case '4': /* start composition */
3771               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3772                 goto invalid_code;
3773               if (last_id != charset_ascii)
3774                 {
3775                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3776                   last_id = charset_ascii;
3777                   last_offset = char_offset;
3778                 }
3779               DECODE_COMPOSITION_START (c1);
3780               continue;
3781
3782             case '1':           /* end composition */
3783               if (cmp_status->state == COMPOSING_NO)
3784                 goto invalid_code;
3785               DECODE_COMPOSITION_END ();
3786               continue;
3787
3788             case '[':           /* specification of direction */
3789               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3790                 goto invalid_code;
3791               /* For the moment, nested direction is not supported.
3792                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3793                  left-to-right, and nonzero means right-to-left.  */
3794               ONE_MORE_BYTE (c1);
3795               switch (c1)
3796                 {
3797                 case ']':       /* end of the current direction */
3798                   coding->mode &= ~CODING_MODE_DIRECTION;
3799
3800                 case '0':       /* end of the current direction */
3801                 case '1':       /* start of left-to-right direction */
3802                   ONE_MORE_BYTE (c1);
3803                   if (c1 == ']')
3804                     coding->mode &= ~CODING_MODE_DIRECTION;
3805                   else
3806                     goto invalid_code;
3807                   break;
3808
3809                 case '2':       /* start of right-to-left direction */
3810                   ONE_MORE_BYTE (c1);
3811                   if (c1 == ']')
3812                     coding->mode |= CODING_MODE_DIRECTION;
3813                   else
3814                     goto invalid_code;
3815                   break;
3816
3817                 default:
3818                   goto invalid_code;
3819                 }
3820               continue;
3821
3822             case '%':
3823               ONE_MORE_BYTE (c1);
3824               if (c1 == '/')
3825                 {
3826                   /* CTEXT extended segment:
3827                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3828                      We keep these bytes as is for the moment.
3829                      They may be decoded by post-read-conversion.  */
3830                   int dim, M, L;
3831                   int size;
3832
3833                   ONE_MORE_BYTE (dim);
3834                   if (dim < '0' || dim > '4')
3835                     goto invalid_code;
3836                   ONE_MORE_BYTE (M);
3837                   if (M < 128)
3838                     goto invalid_code;
3839                   ONE_MORE_BYTE (L);
3840                   if (L < 128)
3841                     goto invalid_code;
3842                   size = ((M - 128) * 128) + (L - 128);
3843                   if (charbuf + 6 > charbuf_end)
3844                     goto break_loop;
3845                   *charbuf++ = ISO_CODE_ESC;
3846                   *charbuf++ = '%';
3847                   *charbuf++ = '/';
3848                   *charbuf++ = dim;
3849                   *charbuf++ = BYTE8_TO_CHAR (M);
3850                   *charbuf++ = BYTE8_TO_CHAR (L);
3851                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3852                 }
3853               else if (c1 == 'G')
3854                 {
3855                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3856                      ESC % G --UTF-8-BYTES-- ESC % @
3857                      We keep these bytes as is for the moment.
3858                      They may be decoded by post-read-conversion.  */
3859                   if (charbuf + 3 > charbuf_end)
3860                     goto break_loop;
3861                   *charbuf++ = ISO_CODE_ESC;
3862                   *charbuf++ = '%';
3863                   *charbuf++ = 'G';
3864                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3865                 }
3866               else
3867                 goto invalid_code;
3868               continue;
3869               break;
3870
3871             default:
3872               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3873                 goto invalid_code;
3874               {
3875                 int reg, chars96;
3876
3877                 if (c1 >= 0x28 && c1 <= 0x2B)
3878                   { /* designation of DIMENSION1_CHARS94 character set */
3879                     reg = c1 - 0x28, chars96 = 0;
3880                     ONE_MORE_BYTE (c1);
3881                   }
3882                 else if (c1 >= 0x2C && c1 <= 0x2F)
3883                   { /* designation of DIMENSION1_CHARS96 character set */
3884                     reg = c1 - 0x2C, chars96 = 1;
3885                     ONE_MORE_BYTE (c1);
3886                   }
3887                 else
3888                   goto invalid_code;
3889                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3890                 /* We must update these variables now.  */
3891                 if (reg == 0)
3892                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3893                 else if (reg == 1)
3894                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3895                 if (chars96 < 0)
3896                   goto invalid_code;
3897               }
3898               continue;
3899             }
3900           break;
3901
3902         default:
3903           emacs_abort ();
3904         }
3905
3906       if (cmp_status->state == COMPOSING_NO
3907           && charset->id != charset_ascii
3908           && last_id != charset->id)
3909         {
3910           if (last_id != charset_ascii)
3911             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3912           last_id = charset->id;
3913           last_offset = char_offset;
3914         }
3915
3916       /* Now we know CHARSET and 1st position code C1 of a character.
3917          Produce a decoded character while getting 2nd and 3rd
3918          position codes C2, C3 if necessary.  */
3919       if (CHARSET_DIMENSION (charset) > 1)
3920         {
3921           ONE_MORE_BYTE (c2);
3922           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3923               || ((c1 & 0x80) != (c2 & 0x80)))
3924             /* C2 is not in a valid range.  */
3925             goto invalid_code;
3926           if (CHARSET_DIMENSION (charset) == 2)
3927             c1 = (c1 << 8) | c2;
3928           else
3929             {
3930               ONE_MORE_BYTE (c3);
3931               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3932                   || ((c1 & 0x80) != (c3 & 0x80)))
3933                 /* C3 is not in a valid range.  */
3934                 goto invalid_code;
3935               c1 = (c1 << 16) | (c2 << 8) | c2;
3936             }
3937         }
3938       c1 &= 0x7F7F7F;
3939       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3940       if (c < 0)
3941         {
3942           MAYBE_FINISH_COMPOSITION ();
3943           for (; src_base < src; src_base++, char_offset++)
3944             {
3945               if (ASCII_CHAR_P (*src_base))
3946                 *charbuf++ = *src_base;
3947               else
3948                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3949             }
3950         }
3951       else if (cmp_status->state == COMPOSING_NO)
3952         {
3953           *charbuf++ = c;
3954           char_offset++;
3955         }
3956       else if ((cmp_status->state == COMPOSING_CHAR
3957                 ? cmp_status->nchars
3958                 : cmp_status->ncomps)
3959                >= MAX_COMPOSITION_COMPONENTS)
3960         {
3961           /* Too long composition.  */
3962           MAYBE_FINISH_COMPOSITION ();
3963           *charbuf++ = c;
3964           char_offset++;
3965         }
3966       else
3967         STORE_COMPOSITION_CHAR (c);
3968       continue;
3969
3970     invalid_code:
3971       MAYBE_FINISH_COMPOSITION ();
3972       src = src_base;
3973       consumed_chars = consumed_chars_base;
3974       ONE_MORE_BYTE (c);
3975       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3976       char_offset++;
3977       /* Reset the invocation and designation status to the safest
3978          one; i.e. designate ASCII to the graphic register 0, and
3979          invoke that register to the graphic plane 0.  This typically
3980          helps the case that a designation sequence for ASCII "ESC (
3981          B" is somehow broken (e.g. broken by a newline).  */
3982       CODING_ISO_INVOCATION (coding, 0) = 0;
3983       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3984       charset_id_0 = charset_ascii;
3985       continue;
3986
3987     break_loop:
3988       break;
3989     }
3990
3991  no_more_source:
3992   if (cmp_status->state != COMPOSING_NO)
3993     {
3994       if (coding->mode & CODING_MODE_LAST_BLOCK)
3995         MAYBE_FINISH_COMPOSITION ();
3996       else
3997         {
3998           charbuf -= cmp_status->length;
3999           for (i = 0; i < cmp_status->length; i++)
4000             cmp_status->carryover[i] = charbuf[i];
4001         }
4002     }
4003   else if (last_id != charset_ascii)
4004     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4005   coding->consumed_char += consumed_chars_base;
4006   coding->consumed = src_base - coding->source;
4007   coding->charbuf_used = charbuf - coding->charbuf;
4008 }
4009
4010
4011 /* ISO2022 encoding stuff.  */
4012
4013 /*
4014    When encoding, it is not enough to say just "ISO2022"; we have to
4015    specify more details.  In Emacs, each coding system of ISO2022
4016    variant has the following specifications:
4017         1. Initial designation to G0 thru G3.
4018         2. Allows short-form designation?
4019         3. ASCII should be designated to G0 before control characters?
4020         4. ASCII should be designated to G0 at end of line?
4021         5. 7-bit environment or 8-bit environment?
4022         6. Use locking-shift?
4023         7. Use Single-shift?
4024    And the following two are only for Japanese:
4025         8. Use ASCII in place of JIS0201-1976-Roman?
4026         9. Use JISX0208-1983 in place of JISX0208-1978?
4027    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4028    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4029    details.
4030 */
4031
/* Emit the escape sequence that designates CHARSET to graphic register
   REG, and record CHARSET as the current designation of REG in CODING.

   When CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision number, "ESC & <'@' + revision>" is emitted
   first.  For a charset whose dimension is not 1, "ESC $" is followed
   by the intermediate byte for REG, except that a 94-char set whose
   <final-char> is '@', 'A', or 'B' gets the short form (no
   intermediate byte) when REG is 0 and CODING_ISO_FLAG_LONG_FORM is
   not set.

   All output goes through the EMIT_* macros, so whatever variables
   those macros use must be in scope where this macro is expanded.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate bytes indexed by graphic register number.  */       \
    const char *lead_94 = "()*+";                                       \
    const char *lead_96 = ",-./";                                       \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int rev = -1;                                                       \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      rev = CHARSET_ISO_REVISION (charset);                             \
    if (rev >= 0)                                                       \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev);                                      \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (lead_96[reg]);                           \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || final_byte < '@' || final_byte > 'B')               \
          EMIT_ONE_ASCII_BYTE (lead_94[reg]);                           \
        /* Otherwise: short form, no intermediate byte.  */             \
      }                                                                 \
    else if (CHARSET_ISO_CHARS_96 (charset))                            \
      EMIT_ONE_ASCII_BYTE (lead_96[reg]);                               \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (lead_94[reg]);                               \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4079
4080
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 emit the ISO2022
   single-shift functions (single-shift-2 and single-shift-3), either
   as a two-byte escape sequence or as a single control byte, and mark
   CODING as being in the single-shifting state.  */

/* Single-shift-2: "ESC N" when CODING_ISO_FLAG_SEVEN_BITS is set,
   the byte ISO_CODE_SS2 otherwise.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4093
4094
/* Single-shift-3: "ESC O" when CODING_ISO_FLAG_SEVEN_BITS is set,
   the byte ISO_CODE_SS3 otherwise.  Afterwards CODING is marked as
   being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4103
4104
/* ENCODE_SHIFT_IN, ENCODE_SHIFT_OUT, ENCODE_LOCKING_SHIFT_2 and
   ENCODE_LOCKING_SHIFT_3 emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) as a
   control character or an escape sequence, and record in CODING
   which graphic register is now invoked to graphic plane 0.  */

/* Shift-in: emit ISO_CODE_SI; graphic register 0 becomes the one
   invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)
4114
4115
/* Shift-out: emit ISO_CODE_SO; graphic register 1 becomes the one
   invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)
4121
4122
/* Locking-shift-2: emit "ESC n"; graphic register 2 becomes the one
   invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)
4128
4129
/* Locking-shift-3: emit "ESC o"; graphic register 3 becomes the one
   invoked to graphic plane 0.  The final byte must be 'o': "ESC n"
   is locking-shift-2, so emitting it here would make the byte stream
   invoke register 2 while CODING records register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4135
4136
/* Emit the single byte for position-code C1 of a DIMENSION1 character
   belonging to CHARSET, first producing whatever designation and
   invocation sequences are needed.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_ROMAN is set
   and CHARSET is charset_ascii, it is replaced by the charset for
   charset_jisx0201_roman.  The expansion uses the variables `coding',
   `dst' and `produced_chars' of the enclosing function.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_ascii                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        cs_id = charset_jisx0201_roman;                                 \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    /* A single-shift is pending: it applies to this one character      \
       only, so emit the byte and leave the single-shifting state.  */  \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
        else                                                            \
          EMIT_ONE_BYTE (c1 | 0x80);                                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to graphic plane 0: 7-bit form.  */           \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to graphic plane 1: 8th bit set.  */          \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_ONE_BYTE (c1 | 0x80);                                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke      \
       it (designating it to a graphic register first if needed),       \
       then go around the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4179
4180
/* Emit the two bytes for position-codes C1 and C2 of a DIMENSION2
   character belonging to CHARSET, first producing whatever
   designation and invocation sequences are needed.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_OLDJIS is set
   and CHARSET is charset_jisx0208, it is replaced by the charset for
   charset_jisx0208_1978.  The expansion uses the variables `coding',
   `dst' and `produced_chars' of the enclosing function.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    /* A single-shift is pending: it applies to this one character      \
       only, so emit the bytes and leave the single-shifting state.  */ \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to graphic plane 0: 7-bit form.  */           \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to graphic plane 1: 8th bit set.  */          \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke      \
       it (designating it to a graphic register first if needed),       \
       then go around the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4223
4224
/* Encode character C of CHARSET: obtain its code with
   CODING_ENCODE_CHAR, then hand it to the DIMENSION1 emitter when
   CHARSET has dimension 1, or split it into a high part (code >> 8)
   and a low byte (code & 0xFF) for the DIMENSION2 emitter otherwise.
   Uses the variables `coding', `dst' and `dst_end' of the enclosing
   function.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned iso_code;                                                     \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), iso_code);   \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), iso_code >> 8,           \
                                       iso_code & 0xFF);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);               \
  } while (0)
4235
4236
4237 /* Produce designation and invocation codes at a place pointed by DST
4238    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4239    Return new DST.  */
4240
4241 static unsigned char *
4242 encode_invocation_designation (struct charset *charset,
4243                                struct coding_system *coding,
4244                                unsigned char *dst, ptrdiff_t *p_nchars)
4245 {
4246   bool multibytep = coding->dst_multibyte;
4247   ptrdiff_t produced_chars = *p_nchars;
4248   int reg;                      /* graphic register number */
4249   int id = CHARSET_ID (charset);
4250
4251   /* At first, check designations.  */
4252   for (reg = 0; reg < 4; reg++)
4253     if (id == CODING_ISO_DESIGNATION (coding, reg))
4254       break;
4255
4256   if (reg >= 4)
4257     {
4258       /* CHARSET is not yet designated to any graphic registers.  */
4259       /* At first check the requested designation.  */
4260       reg = CODING_ISO_REQUEST (coding, id);
4261       if (reg < 0)
4262         /* Since CHARSET requests no special designation, designate it
4263            to graphic register 0.  */
4264         reg = 0;
4265
4266       ENCODE_DESIGNATION (charset, reg, coding);
4267     }
4268
4269   if (CODING_ISO_INVOCATION (coding, 0) != reg
4270       && CODING_ISO_INVOCATION (coding, 1) != reg)
4271     {
4272       /* Since the graphic register REG is not invoked to any graphic
4273          planes, invoke it to graphic plane 0.  */
4274       switch (reg)
4275         {
4276         case 0:                 /* graphic register 0 */
4277           ENCODE_SHIFT_IN;
4278           break;
4279
4280         case 1:                 /* graphic register 1 */
4281           ENCODE_SHIFT_OUT;
4282           break;
4283
4284         case 2:                 /* graphic register 2 */
4285           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4286             ENCODE_SINGLE_SHIFT_2;
4287           else
4288             ENCODE_LOCKING_SHIFT_2;
4289           break;
4290
4291         case 3:                 /* graphic register 3 */
4292           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4293             ENCODE_SINGLE_SHIFT_3;
4294           else
4295             ENCODE_LOCKING_SHIFT_3;
4296           break;
4297         }
4298     }
4299
4300   *p_nchars = produced_chars;
4301   return dst;
4302 }
4303
4304
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: shift in if graphic plane 0 does not
   currently have register 0 invoked, then re-designate every register
   that has an initial charset (a non-negative CODING_ISO_INITIAL
   value) different from its current designation.  Uses the variable
   `coding' of the enclosing function.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int gr;                                                             \
    struct charset *initial_charset;                                    \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (gr = 0; gr < 4; gr++)                                          \
      {                                                                 \
        /* Skip registers with no initial charset, and those that       \
           still hold it.  */                                           \
        if (CODING_ISO_INITIAL (coding, gr) < 0                         \
            || (CODING_ISO_DESIGNATION (coding, gr)                     \
                == CODING_ISO_INITIAL (coding, gr)))                    \
          continue;                                                     \
        initial_charset                                                 \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr));          \
        ENCODE_DESIGNATION (initial_charset, gr, coding);               \
      }                                                                 \
  } while (0)
4323
4324
4325 /* Produce designation sequences of charsets in the line started from
4326    CHARBUF to a place pointed by DST, and return the number of
4327    produced bytes.  DST should not directly point a buffer text area
4328    which may be relocated by char_charset call.
4329
4330    If the current block ends before any end-of-line, we may fail to
4331    find all the necessary designations.  */
4332
4333 static ptrdiff_t
4334 encode_designation_at_bol (struct coding_system *coding,
4335                            int *charbuf, int *charbuf_end,
4336                            unsigned char *dst)
4337 {
4338   unsigned char *orig = dst;
4339   struct charset *charset;
4340   /* Table of charsets to be designated to each graphic register.  */
4341   int r[4];
4342   int c, found = 0, reg;
4343   ptrdiff_t produced_chars = 0;
4344   bool multibytep = coding->dst_multibyte;
4345   Lisp_Object attrs;
4346   Lisp_Object charset_list;
4347
4348   attrs = CODING_ID_ATTRS (coding->id);
4349   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4350   if (EQ (charset_list, Qiso_2022))
4351     charset_list = Viso_2022_charset_list;
4352
4353   for (reg = 0; reg < 4; reg++)
4354     r[reg] = -1;
4355
4356   while (charbuf < charbuf_end && found < 4)
4357     {
4358       int id;
4359
4360       c = *charbuf++;
4361       if (c == '\n')
4362         break;
4363       charset = char_charset (c, charset_list, NULL);
4364       id = CHARSET_ID (charset);
4365       reg = CODING_ISO_REQUEST (coding, id);
4366       if (reg >= 0 && r[reg] < 0)
4367         {
4368           found++;
4369           r[reg] = id;
4370         }
4371     }
4372
4373   if (found)
4374     {
4375       for (reg = 0; reg < 4; reg++)
4376         if (r[reg] >= 0
4377             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4378           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4379     }
4380
4381   return dst - orig;
4382 }
4383
4384 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4385
4386 static bool
4387 encode_coding_iso_2022 (struct coding_system *coding)
4388 {
4389   bool multibytep = coding->dst_multibyte;
4390   int *charbuf = coding->charbuf;
4391   int *charbuf_end = charbuf + coding->charbuf_used;
4392   unsigned char *dst = coding->destination + coding->produced;
4393   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4394   int safe_room = 16;
4395   bool bol_designation
4396     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4397        && CODING_ISO_BOL (coding));
4398   ptrdiff_t produced_chars = 0;
4399   Lisp_Object attrs, eol_type, charset_list;
4400   bool ascii_compatible;
4401   int c;
4402   int preferred_charset_id = -1;
4403
4404   CODING_GET_INFO (coding, attrs, charset_list);
4405   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4406   if (VECTORP (eol_type))
4407     eol_type = Qunix;
4408
4409   setup_iso_safe_charsets (attrs);
4410   /* Charset list may have been changed.  */
4411   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4412   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4413
4414   ascii_compatible
4415     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4416        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4417                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4418
4419   while (charbuf < charbuf_end)
4420     {
4421       ASSURE_DESTINATION (safe_room);
4422
4423       if (bol_designation)
4424         {
4425           /* We have to produce designation sequences if any now.  */
4426           unsigned char desig_buf[16];
4427           ptrdiff_t nbytes;
4428           ptrdiff_t offset;
4429
4430           charset_map_loaded = 0;
4431           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4432                                               desig_buf);
4433           if (charset_map_loaded
4434               && (offset = coding_change_destination (coding)))
4435             {
4436               dst += offset;
4437               dst_end += offset;
4438             }
4439           memcpy (dst, desig_buf, nbytes);
4440           dst += nbytes;
4441           /* We are sure that designation sequences are all ASCII bytes.  */
4442           produced_chars += nbytes;
4443           bol_designation = 0;
4444           ASSURE_DESTINATION (safe_room);
4445         }
4446
4447       c = *charbuf++;
4448
4449       if (c < 0)
4450         {
4451           /* Handle an annotation.  */
4452           switch (*charbuf)
4453             {
4454             case CODING_ANNOTATE_COMPOSITION_MASK:
4455               /* Not yet implemented.  */
4456               break;
4457             case CODING_ANNOTATE_CHARSET_MASK:
4458               preferred_charset_id = charbuf[2];
4459               if (preferred_charset_id >= 0
4460                   && NILP (Fmemq (make_number (preferred_charset_id),
4461                                   charset_list)))
4462                 preferred_charset_id = -1;
4463               break;
4464             default:
4465               emacs_abort ();
4466             }
4467           charbuf += -c - 1;
4468           continue;
4469         }
4470
4471       /* Now encode the character C.  */
4472       if (c < 0x20 || c == 0x7F)
4473         {
4474           if (c == '\n'
4475               || (c == '\r' && EQ (eol_type, Qmac)))
4476             {
4477               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4478                 ENCODE_RESET_PLANE_AND_REGISTER ();
4479               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4480                 {
4481                   int i;
4482
4483                   for (i = 0; i < 4; i++)
4484                     CODING_ISO_DESIGNATION (coding, i)
4485                       = CODING_ISO_INITIAL (coding, i);
4486                 }
4487               bol_designation = ((CODING_ISO_FLAGS (coding)
4488                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4489                                  != 0);
4490             }
4491           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4492             ENCODE_RESET_PLANE_AND_REGISTER ();
4493           EMIT_ONE_ASCII_BYTE (c);
4494         }
4495       else if (ASCII_CHAR_P (c))
4496         {
4497           if (ascii_compatible)
4498             EMIT_ONE_ASCII_BYTE (c);
4499           else
4500             {
4501               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4502               ENCODE_ISO_CHARACTER (charset, c);
4503             }
4504         }
4505       else if (CHAR_BYTE8_P (c))
4506         {
4507           c = CHAR_TO_BYTE8 (c);
4508           EMIT_ONE_BYTE (c);
4509         }
4510       else
4511         {
4512           struct charset *charset;
4513
4514           if (preferred_charset_id >= 0)
4515             {
4516               bool result;
4517
4518               charset = CHARSET_FROM_ID (preferred_charset_id);
4519               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4520               if (! result)
4521                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4522                                      NULL, charset);
4523             }
4524           else
4525             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4526                                  NULL, charset);
4527           if (!charset)
4528             {
4529               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4530                 {
4531                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4532                   charset = CHARSET_FROM_ID (charset_ascii);
4533                 }
4534               else
4535                 {
4536                   c = coding->default_char;
4537                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4538                                        charset_list, NULL, charset);
4539                 }
4540             }
4541           ENCODE_ISO_CHARACTER (charset, c);
4542         }
4543     }
4544
4545   if (coding->mode & CODING_MODE_LAST_BLOCK
4546       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4547     {
4548       ASSURE_DESTINATION (safe_room);
4549       ENCODE_RESET_PLANE_AND_REGISTER ();
4550     }
4551   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4552   CODING_ISO_BOL (coding) = bol_designation;
4553   coding->produced_char += produced_chars;
4554   coding->produced = dst - coding->destination;
4555   return 0;
4556 }
4557
4558 \f
/*** 8. SJIS and BIG5 handlers ***/
4560
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  In the future, however, they may be supported only by CCL.  */
4564
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.
4571
4572    --- CODE RANGE of SJIS ---
4573    (character set)      (range)
4574    ASCII                0x00 .. 0x7F
4575    KATAKANA-JISX0201    0xA0 .. 0xDF
4576    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4577             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4578    -------------------------------
4579
4580 */
4581
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.
4585
4586    --- CODE RANGE of BIG5 ---
4587    (character set)      (range)
4588    ASCII                0x00 .. 0x7F
4589    Big5 (1st byte)      0xA1 .. 0xFE
4590         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4591    --------------------------
4592
4593   */
4594
4595 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4596    Return true if a text is encoded in SJIS.  */
4597
4598 static bool
4599 detect_coding_sjis (struct coding_system *coding,
4600                     struct coding_detection_info *detect_info)
4601 {
4602   const unsigned char *src = coding->source, *src_base;
4603   const unsigned char *src_end = coding->source + coding->src_bytes;
4604   bool multibytep = coding->src_multibyte;
4605   ptrdiff_t consumed_chars = 0;
4606   int found = 0;
4607   int c;
4608   Lisp_Object attrs, charset_list;
4609   int max_first_byte_of_2_byte_code;
4610
4611   CODING_GET_INFO (coding, attrs, charset_list);
4612   max_first_byte_of_2_byte_code
4613     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4614
4615   detect_info->checked |= CATEGORY_MASK_SJIS;
4616   /* A coding system of this category is always ASCII compatible.  */
4617   src += coding->head_ascii;
4618
4619   while (1)
4620     {
4621       src_base = src;
4622       ONE_MORE_BYTE (c);
4623       if (c < 0x80)
4624         continue;
4625       if ((c >= 0x81 && c <= 0x9F)
4626           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4627         {
4628           ONE_MORE_BYTE (c);
4629           if (c < 0x40 || c == 0x7F || c > 0xFC)
4630             break;
4631           found = CATEGORY_MASK_SJIS;
4632         }
4633       else if (c >= 0xA0 && c < 0xE0)
4634         found = CATEGORY_MASK_SJIS;
4635       else
4636         break;
4637     }
4638   detect_info->rejected |= CATEGORY_MASK_SJIS;
4639   return 0;
4640
4641  no_more_source:
4642   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4643     {
4644       detect_info->rejected |= CATEGORY_MASK_SJIS;
4645       return 0;
4646     }
4647   detect_info->found |= found;
4648   return 1;
4649 }
4650
4651 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4652    Return true if a text is encoded in BIG5.  */
4653
4654 static bool
4655 detect_coding_big5 (struct coding_system *coding,
4656                     struct coding_detection_info *detect_info)
4657 {
4658   const unsigned char *src = coding->source, *src_base;
4659   const unsigned char *src_end = coding->source + coding->src_bytes;
4660   bool multibytep = coding->src_multibyte;
4661   ptrdiff_t consumed_chars = 0;
4662   int found = 0;
4663   int c;
4664
4665   detect_info->checked |= CATEGORY_MASK_BIG5;
4666   /* A coding system of this category is always ASCII compatible.  */
4667   src += coding->head_ascii;
4668
4669   while (1)
4670     {
4671       src_base = src;
4672       ONE_MORE_BYTE (c);
4673       if (c < 0x80)
4674         continue;
4675       if (c >= 0xA1)
4676         {
4677           ONE_MORE_BYTE (c);
4678           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4679             return 0;
4680           found = CATEGORY_MASK_BIG5;
4681         }
4682       else
4683         break;
4684     }
4685   detect_info->rejected |= CATEGORY_MASK_BIG5;
4686   return 0;
4687
4688  no_more_source:
4689   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4690     {
4691       detect_info->rejected |= CATEGORY_MASK_BIG5;
4692       return 0;
4693     }
4694   detect_info->found |= found;
4695   return 1;
4696 }
4697
4698 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4699
4700 static void
4701 decode_coding_sjis (struct coding_system *coding)
4702 {
4703   const unsigned char *src = coding->source + coding->consumed;
4704   const unsigned char *src_end = coding->source + coding->src_bytes;
4705   const unsigned char *src_base;
4706   int *charbuf = coding->charbuf + coding->charbuf_used;
4707   /* We may produce one charset annotation in one loop and one more at
4708      the end.  */
4709   int *charbuf_end
4710     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4711   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4712   bool multibytep = coding->src_multibyte;
4713   struct charset *charset_roman, *charset_kanji, *charset_kana;
4714   struct charset *charset_kanji2;
4715   Lisp_Object attrs, charset_list, val;
4716   ptrdiff_t char_offset = coding->produced_char;
4717   ptrdiff_t last_offset = char_offset;
4718   int last_id = charset_ascii;
4719   bool eol_dos
4720     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4721   int byte_after_cr = -1;
4722
4723   CODING_GET_INFO (coding, attrs, charset_list);
4724
4725   val = charset_list;
4726   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4727   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4728   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4729   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4730
4731   while (1)
4732     {
4733       int c, c1;
4734       struct charset *charset;
4735
4736       src_base = src;
4737       consumed_chars_base = consumed_chars;
4738
4739       if (charbuf >= charbuf_end)
4740         {
4741           if (byte_after_cr >= 0)
4742             src_base--;
4743           break;
4744         }
4745
4746       if (byte_after_cr >= 0)
4747         c = byte_after_cr, byte_after_cr = -1;
4748       else
4749         ONE_MORE_BYTE (c);
4750       if (c < 0)
4751         goto invalid_code;
4752       if (c < 0x80)
4753         {
4754           if (eol_dos && c == '\r')
4755             ONE_MORE_BYTE (byte_after_cr);
4756           charset = charset_roman;
4757         }
4758       else if (c == 0x80 || c == 0xA0)
4759         goto invalid_code;
4760       else if (c >= 0xA1 && c <= 0xDF)
4761         {
4762           /* SJIS -> JISX0201-Kana */
4763           c &= 0x7F;
4764           charset = charset_kana;
4765         }
4766       else if (c <= 0xEF)
4767         {
4768           /* SJIS -> JISX0208 */
4769           ONE_MORE_BYTE (c1);
4770           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4771             goto invalid_code;
4772           c = (c << 8) | c1;
4773           SJIS_TO_JIS (c);
4774           charset = charset_kanji;
4775         }
4776       else if (c <= 0xFC && charset_kanji2)
4777         {
4778           /* SJIS -> JISX0213-2 */
4779           ONE_MORE_BYTE (c1);
4780           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4781             goto invalid_code;
4782           c = (c << 8) | c1;
4783           SJIS_TO_JIS2 (c);
4784           charset = charset_kanji2;
4785         }
4786       else
4787         goto invalid_code;
4788       if (charset->id != charset_ascii
4789           && last_id != charset->id)
4790         {
4791           if (last_id != charset_ascii)
4792             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4793           last_id = charset->id;
4794           last_offset = char_offset;
4795         }
4796       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4797       *charbuf++ = c;
4798       char_offset++;
4799       continue;
4800
4801     invalid_code:
4802       src = src_base;
4803       consumed_chars = consumed_chars_base;
4804       ONE_MORE_BYTE (c);
4805       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4806       char_offset++;
4807     }
4808
4809  no_more_source:
4810   if (last_id != charset_ascii)
4811     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4812   coding->consumed_char += consumed_chars_base;
4813   coding->consumed = src_base - coding->source;
4814   coding->charbuf_used = charbuf - coding->charbuf;
4815 }
4816
4817 static void
4818 decode_coding_big5 (struct coding_system *coding)
4819 {
4820   const unsigned char *src = coding->source + coding->consumed;
4821   const unsigned char *src_end = coding->source + coding->src_bytes;
4822   const unsigned char *src_base;
4823   int *charbuf = coding->charbuf + coding->charbuf_used;
4824   /* We may produce one charset annotation in one loop and one more at
4825      the end.  */
4826   int *charbuf_end
4827     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4828   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4829   bool multibytep = coding->src_multibyte;
4830   struct charset *charset_roman, *charset_big5;
4831   Lisp_Object attrs, charset_list, val;
4832   ptrdiff_t char_offset = coding->produced_char;
4833   ptrdiff_t last_offset = char_offset;
4834   int last_id = charset_ascii;
4835   bool eol_dos
4836     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4837   int byte_after_cr = -1;
4838
4839   CODING_GET_INFO (coding, attrs, charset_list);
4840   val = charset_list;
4841   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4842   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4843
4844   while (1)
4845     {
4846       int c, c1;
4847       struct charset *charset;
4848
4849       src_base = src;
4850       consumed_chars_base = consumed_chars;
4851
4852       if (charbuf >= charbuf_end)
4853         {
4854           if (byte_after_cr >= 0)
4855             src_base--;
4856           break;
4857         }
4858
4859       if (byte_after_cr >= 0)
4860         c = byte_after_cr, byte_after_cr = -1;
4861       else
4862         ONE_MORE_BYTE (c);
4863
4864       if (c < 0)
4865         goto invalid_code;
4866       if (c < 0x80)
4867         {
4868           if (eol_dos && c == '\r')
4869             ONE_MORE_BYTE (byte_after_cr);
4870           charset = charset_roman;
4871         }
4872       else
4873         {
4874           /* BIG5 -> Big5 */
4875           if (c < 0xA1 || c > 0xFE)
4876             goto invalid_code;
4877           ONE_MORE_BYTE (c1);
4878           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4879             goto invalid_code;
4880           c = c << 8 | c1;
4881           charset = charset_big5;
4882         }
4883       if (charset->id != charset_ascii
4884           && last_id != charset->id)
4885         {
4886           if (last_id != charset_ascii)
4887             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4888           last_id = charset->id;
4889           last_offset = char_offset;
4890         }
4891       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4892       *charbuf++ = c;
4893       char_offset++;
4894       continue;
4895
4896     invalid_code:
4897       src = src_base;
4898       consumed_chars = consumed_chars_base;
4899       ONE_MORE_BYTE (c);
4900       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4901       char_offset++;
4902     }
4903
4904  no_more_source:
4905   if (last_id != charset_ascii)
4906     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4907   coding->consumed_char += consumed_chars_base;
4908   coding->consumed = src_base - coding->source;
4909   coding->charbuf_used = charbuf - coding->charbuf;
4910 }
4911
4912 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4913    This function can encode charsets `ascii', `katakana-jisx0201',
4914    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4915    are sure that all these charsets are registered as official charset
4916    (i.e. do not have extended leading-codes).  Characters of other
4917    charsets are produced without any encoding.  */
4918
4919 static bool
4920 encode_coding_sjis (struct coding_system *coding)
4921 {
4922   bool multibytep = coding->dst_multibyte;
4923   int *charbuf = coding->charbuf;
4924   int *charbuf_end = charbuf + coding->charbuf_used;
4925   unsigned char *dst = coding->destination + coding->produced;
4926   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4927   int safe_room = 4;
4928   ptrdiff_t produced_chars = 0;
4929   Lisp_Object attrs, charset_list, val;
4930   bool ascii_compatible;
4931   struct charset *charset_kanji, *charset_kana;
4932   struct charset *charset_kanji2;
4933   int c;
4934
4935   CODING_GET_INFO (coding, attrs, charset_list);
4936   val = XCDR (charset_list);
4937   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4938   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4939   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4940
4941   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4942
4943   while (charbuf < charbuf_end)
4944     {
4945       ASSURE_DESTINATION (safe_room);
4946       c = *charbuf++;
4947       /* Now encode the character C.  */
4948       if (ASCII_CHAR_P (c) && ascii_compatible)
4949         EMIT_ONE_ASCII_BYTE (c);
4950       else if (CHAR_BYTE8_P (c))
4951         {
4952           c = CHAR_TO_BYTE8 (c);
4953           EMIT_ONE_BYTE (c);
4954         }
4955       else
4956         {
4957           unsigned code;
4958           struct charset *charset;
4959           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4960                                &code, charset);
4961
4962           if (!charset)
4963             {
4964               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4965                 {
4966                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4967                   charset = CHARSET_FROM_ID (charset_ascii);
4968                 }
4969               else
4970                 {
4971                   c = coding->default_char;
4972                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4973                                        charset_list, &code, charset);
4974                 }
4975             }
4976           if (code == CHARSET_INVALID_CODE (charset))
4977             emacs_abort ();
4978           if (charset == charset_kanji)
4979             {
4980               int c1, c2;
4981               JIS_TO_SJIS (code);
4982               c1 = code >> 8, c2 = code & 0xFF;
4983               EMIT_TWO_BYTES (c1, c2);
4984             }
4985           else if (charset == charset_kana)
4986             EMIT_ONE_BYTE (code | 0x80);
4987           else if (charset_kanji2 && charset == charset_kanji2)
4988             {
4989               int c1, c2;
4990
4991               c1 = code >> 8;
4992               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4993                   || c1 == 0x28
4994                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4995                 {
4996                   JIS_TO_SJIS2 (code);
4997                   c1 = code >> 8, c2 = code & 0xFF;
4998                   EMIT_TWO_BYTES (c1, c2);
4999                 }
5000               else
5001                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5002             }
5003           else
5004             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5005         }
5006     }
5007   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5008   coding->produced_char += produced_chars;
5009   coding->produced = dst - coding->destination;
5010   return 0;
5011 }
5012
5013 static bool
5014 encode_coding_big5 (struct coding_system *coding)
5015 {
5016   bool multibytep = coding->dst_multibyte;
5017   int *charbuf = coding->charbuf;
5018   int *charbuf_end = charbuf + coding->charbuf_used;
5019   unsigned char *dst = coding->destination + coding->produced;
5020   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5021   int safe_room = 4;
5022   ptrdiff_t produced_chars = 0;
5023   Lisp_Object attrs, charset_list, val;
5024   bool ascii_compatible;
5025   struct charset *charset_big5;
5026   int c;
5027
5028   CODING_GET_INFO (coding, attrs, charset_list);
5029   val = XCDR (charset_list);
5030   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5031   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5032
5033   while (charbuf < charbuf_end)
5034     {
5035       ASSURE_DESTINATION (safe_room);
5036       c = *charbuf++;
5037       /* Now encode the character C.  */
5038       if (ASCII_CHAR_P (c) && ascii_compatible)
5039         EMIT_ONE_ASCII_BYTE (c);
5040       else if (CHAR_BYTE8_P (c))
5041         {
5042           c = CHAR_TO_BYTE8 (c);
5043           EMIT_ONE_BYTE (c);
5044         }
5045       else
5046         {
5047           unsigned code;
5048           struct charset *charset;
5049           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5050                                &code, charset);
5051
5052           if (! charset)
5053             {
5054               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5055                 {
5056                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5057                   charset = CHARSET_FROM_ID (charset_ascii);
5058                 }
5059               else
5060                 {
5061                   c = coding->default_char;
5062                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5063                                        charset_list, &code, charset);
5064                 }
5065             }
5066           if (code == CHARSET_INVALID_CODE (charset))
5067             emacs_abort ();
5068           if (charset == charset_big5)
5069             {
5070               int c1, c2;
5071
5072               c1 = code >> 8, c2 = code & 0xFF;
5073               EMIT_TWO_BYTES (c1, c2);
5074             }
5075           else
5076             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5077         }
5078     }
5079   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5080   coding->produced_char += produced_chars;
5081   coding->produced = dst - coding->destination;
5082   return 0;
5083 }
5084
5085 \f
/*** 9. CCL handlers ***/
5087
5088 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5089    Return true if a text is encoded in a coding system of which
5090    encoder/decoder are written in CCL program.  */
5091
5092 static bool
5093 detect_coding_ccl (struct coding_system *coding,
5094                    struct coding_detection_info *detect_info)
5095 {
5096   const unsigned char *src = coding->source, *src_base;
5097   const unsigned char *src_end = coding->source + coding->src_bytes;
5098   bool multibytep = coding->src_multibyte;
5099   ptrdiff_t consumed_chars = 0;
5100   int found = 0;
5101   unsigned char *valids;
5102   ptrdiff_t head_ascii = coding->head_ascii;
5103   Lisp_Object attrs;
5104
5105   detect_info->checked |= CATEGORY_MASK_CCL;
5106
5107   coding = &coding_categories[coding_category_ccl];
5108   valids = CODING_CCL_VALIDS (coding);
5109   attrs = CODING_ID_ATTRS (coding->id);
5110   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5111     src += head_ascii;
5112
5113   while (1)
5114     {
5115       int c;
5116
5117       src_base = src;
5118       ONE_MORE_BYTE (c);
5119       if (c < 0 || ! valids[c])
5120         break;
5121       if ((valids[c] > 1))
5122         found = CATEGORY_MASK_CCL;
5123     }
5124   detect_info->rejected |= CATEGORY_MASK_CCL;
5125   return 0;
5126
5127  no_more_source:
5128   detect_info->found |= found;
5129   return 1;
5130 }
5131
5132 static void
5133 decode_coding_ccl (struct coding_system *coding)
5134 {
5135   const unsigned char *src = coding->source + coding->consumed;
5136   const unsigned char *src_end = coding->source + coding->src_bytes;
5137   int *charbuf = coding->charbuf + coding->charbuf_used;
5138   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5139   ptrdiff_t consumed_chars = 0;
5140   bool multibytep = coding->src_multibyte;
5141   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5142   int source_charbuf[1024];
5143   int source_byteidx[1025];
5144   Lisp_Object attrs, charset_list;
5145
5146   CODING_GET_INFO (coding, attrs, charset_list);
5147
5148   while (1)
5149     {
5150       const unsigned char *p = src;
5151       ptrdiff_t offset;
5152       int i = 0;
5153
5154       if (multibytep)
5155         {
5156           while (i < 1024 && p < src_end)
5157             {
5158               source_byteidx[i] = p - src;
5159               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5160             }
5161           source_byteidx[i] = p - src;
5162         }
5163       else
5164         while (i < 1024 && p < src_end)
5165           source_charbuf[i++] = *p++;
5166
5167       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5168         ccl->last_block = true;
5169       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5170       charset_map_loaded = 0;
5171       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5172                   charset_list);
5173       if (charset_map_loaded
5174           && (offset = coding_change_source (coding)))
5175         {
5176           p += offset;
5177           src += offset;
5178           src_end += offset;
5179         }
5180       charbuf += ccl->produced;
5181       if (multibytep)
5182         src += source_byteidx[ccl->consumed];
5183       else
5184         src += ccl->consumed;
5185       consumed_chars += ccl->consumed;
5186       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5187         break;
5188     }
5189
5190   switch (ccl->status)
5191     {
5192     case CCL_STAT_SUSPEND_BY_SRC:
5193       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5194       break;
5195     case CCL_STAT_SUSPEND_BY_DST:
5196       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5197       break;
5198     case CCL_STAT_QUIT:
5199     case CCL_STAT_INVALID_CMD:
5200       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5201       break;
5202     default:
5203       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5204       break;
5205     }
5206   coding->consumed_char += consumed_chars;
5207   coding->consumed = src - coding->source;
5208   coding->charbuf_used = charbuf - coding->charbuf;
5209 }
5210
5211 static bool
5212 encode_coding_ccl (struct coding_system *coding)
5213 {
5214   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5215   bool multibytep = coding->dst_multibyte;
5216   int *charbuf = coding->charbuf;
5217   int *charbuf_end = charbuf + coding->charbuf_used;
5218   unsigned char *dst = coding->destination + coding->produced;
5219   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5220   int destination_charbuf[1024];
5221   ptrdiff_t produced_chars = 0;
5222   int i;
5223   Lisp_Object attrs, charset_list;
5224
5225   CODING_GET_INFO (coding, attrs, charset_list);
5226   if (coding->consumed_char == coding->src_chars
5227       && coding->mode & CODING_MODE_LAST_BLOCK)
5228     ccl->last_block = true;
5229
5230   do
5231     {
5232       ptrdiff_t offset;
5233
5234       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5235       charset_map_loaded = 0;
5236       ccl_driver (ccl, charbuf, destination_charbuf,
5237                   charbuf_end - charbuf, 1024, charset_list);
5238       if (charset_map_loaded
5239           && (offset = coding_change_destination (coding)))
5240         dst += offset;
5241       if (multibytep)
5242         {
5243           ASSURE_DESTINATION (ccl->produced * 2);
5244           for (i = 0; i < ccl->produced; i++)
5245             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5246         }
5247       else
5248         {
5249           ASSURE_DESTINATION (ccl->produced);
5250           for (i = 0; i < ccl->produced; i++)
5251             *dst++ = destination_charbuf[i] & 0xFF;
5252           produced_chars += ccl->produced;
5253         }
5254       charbuf += ccl->consumed;
5255       if (ccl->status == CCL_STAT_QUIT
5256           || ccl->status == CCL_STAT_INVALID_CMD)
5257         break;
5258     }
5259   while (charbuf < charbuf_end);
5260
5261   switch (ccl->status)
5262     {
5263     case CCL_STAT_SUSPEND_BY_SRC:
5264       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5265       break;
5266     case CCL_STAT_SUSPEND_BY_DST:
5267       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5268       break;
5269     case CCL_STAT_QUIT:
5270     case CCL_STAT_INVALID_CMD:
5271       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5272       break;
5273     default:
5274       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5275       break;
5276     }
5277
5278   coding->produced_char += produced_chars;
5279   coding->produced = dst - coding->destination;
5280   return 0;
5281 }
5282
5283 \f
5284 /*** 10, 11. no-conversion handlers ***/
5285
5286 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5287
5288 static void
5289 decode_coding_raw_text (struct coding_system *coding)
5290 {
5291   bool eol_dos
5292     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5293
5294   coding->chars_at_source = 1;
5295   coding->consumed_char = coding->src_chars;
5296   coding->consumed = coding->src_bytes;
5297   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5298     {
5299       coding->consumed_char--;
5300       coding->consumed--;
5301       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5302     }
5303   else
5304     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5305 }
5306
5307 static bool
5308 encode_coding_raw_text (struct coding_system *coding)
5309 {
5310   bool multibytep = coding->dst_multibyte;
5311   int *charbuf = coding->charbuf;
5312   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5313   unsigned char *dst = coding->destination + coding->produced;
5314   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5315   ptrdiff_t produced_chars = 0;
5316   int c;
5317
5318   if (multibytep)
5319     {
5320       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5321
5322       if (coding->src_multibyte)
5323         while (charbuf < charbuf_end)
5324           {
5325             ASSURE_DESTINATION (safe_room);
5326             c = *charbuf++;
5327             if (ASCII_CHAR_P (c))
5328               EMIT_ONE_ASCII_BYTE (c);
5329             else if (CHAR_BYTE8_P (c))
5330               {
5331                 c = CHAR_TO_BYTE8 (c);
5332                 EMIT_ONE_BYTE (c);
5333               }
5334             else
5335               {
5336                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5337
5338                 CHAR_STRING_ADVANCE (c, p1);
5339                 do
5340                   {
5341                     EMIT_ONE_BYTE (*p0);
5342                     p0++;
5343                   }
5344                 while (p0 < p1);
5345               }
5346           }
5347       else
5348         while (charbuf < charbuf_end)
5349           {
5350             ASSURE_DESTINATION (safe_room);
5351             c = *charbuf++;
5352             EMIT_ONE_BYTE (c);
5353           }
5354     }
5355   else
5356     {
5357       if (coding->src_multibyte)
5358         {
5359           int safe_room = MAX_MULTIBYTE_LENGTH;
5360
5361           while (charbuf < charbuf_end)
5362             {
5363               ASSURE_DESTINATION (safe_room);
5364               c = *charbuf++;
5365               if (ASCII_CHAR_P (c))
5366                 *dst++ = c;
5367               else if (CHAR_BYTE8_P (c))
5368                 *dst++ = CHAR_TO_BYTE8 (c);
5369               else
5370                 CHAR_STRING_ADVANCE (c, dst);
5371             }
5372         }
5373       else
5374         {
5375           ASSURE_DESTINATION (charbuf_end - charbuf);
5376           while (charbuf < charbuf_end && dst < dst_end)
5377             *dst++ = *charbuf++;
5378         }
5379       produced_chars = dst - (coding->destination + coding->produced);
5380     }
5381   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5382   coding->produced_char += produced_chars;
5383   coding->produced = dst - coding->destination;
5384   return 0;
5385 }
5386
5387 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5388    Return true if a text is encoded in a charset-based coding system.  */
5389
5390 static bool
5391 detect_coding_charset (struct coding_system *coding,
5392                        struct coding_detection_info *detect_info)
5393 {
5394   const unsigned char *src = coding->source, *src_base;
5395   const unsigned char *src_end = coding->source + coding->src_bytes;
5396   bool multibytep = coding->src_multibyte;
5397   ptrdiff_t consumed_chars = 0;
5398   Lisp_Object attrs, valids, name;
5399   int found = 0;
5400   ptrdiff_t head_ascii = coding->head_ascii;
5401   bool check_latin_extra = 0;
5402
5403   detect_info->checked |= CATEGORY_MASK_CHARSET;
5404
5405   coding = &coding_categories[coding_category_charset];
5406   attrs = CODING_ID_ATTRS (coding->id);
5407   valids = AREF (attrs, coding_attr_charset_valids);
5408   name = CODING_ID_NAME (coding->id);
5409   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5410                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5411       || strncmp (SSDATA (SYMBOL_NAME (name)),
5412                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5413     check_latin_extra = 1;
5414
5415   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5416     src += head_ascii;
5417
5418   while (1)
5419     {
5420       int c;
5421       Lisp_Object val;
5422       struct charset *charset;
5423       int dim, idx;
5424
5425       src_base = src;
5426       ONE_MORE_BYTE (c);
5427       if (c < 0)
5428         continue;
5429       val = AREF (valids, c);
5430       if (NILP (val))
5431         break;
5432       if (c >= 0x80)
5433         {
5434           if (c < 0xA0
5435               && check_latin_extra
5436               && (!VECTORP (Vlatin_extra_code_table)
5437                   || NILP (AREF (Vlatin_extra_code_table, c))))
5438             break;
5439           found = CATEGORY_MASK_CHARSET;
5440         }
5441       if (INTEGERP (val))
5442         {
5443           charset = CHARSET_FROM_ID (XFASTINT (val));
5444           dim = CHARSET_DIMENSION (charset);
5445           for (idx = 1; idx < dim; idx++)
5446             {
5447               if (src == src_end)
5448                 goto too_short;
5449               ONE_MORE_BYTE (c);
5450               if (c < charset->code_space[(dim - 1 - idx) * 4]
5451                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5452                 break;
5453             }
5454           if (idx < dim)
5455             break;
5456         }
5457       else
5458         {
5459           idx = 1;
5460           for (; CONSP (val); val = XCDR (val))
5461             {
5462               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5463               dim = CHARSET_DIMENSION (charset);
5464               while (idx < dim)
5465                 {
5466                   if (src == src_end)
5467                     goto too_short;
5468                   ONE_MORE_BYTE (c);
5469                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5470                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5471                     break;
5472                   idx++;
5473                 }
5474               if (idx == dim)
5475                 {
5476                   val = Qnil;
5477                   break;
5478                 }
5479             }
5480           if (CONSP (val))
5481             break;
5482         }
5483     }
5484  too_short:
5485   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5486   return 0;
5487
5488  no_more_source:
5489   detect_info->found |= found;
5490   return 1;
5491 }
5492
/* Decode the source bytes of CODING as text of a charset-based coding
   system and append the resulting character codes to CODING->charbuf.

   The first byte of each sequence is looked up in the coding system's
   `coding_attr_charset_valids' table.  An integer entry names a single
   charset ID, a cons entry is a list of candidate charset IDs, and any
   other entry makes the byte invalid.  A sequence that cannot be
   decoded is replaced by one element derived from its first byte (see
   the `invalid_code' label) and decoding resumes with the next byte.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used record how far decoding progressed.

   NOTE(review): ONE_MORE_BYTE, CODING_DECODE_CHAR and ADD_CHARSET_DATA
   are macros defined outside this function.  ONE_MORE_BYTE presumably
   jumps to `no_more_source' when the source runs out and uses the
   locals `src', `src_end', `multibytep' and `consumed_chars' by name;
   confirm before renaming or reordering anything here.  */
static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the byte sequence currently being decoded; used to rewind
     on an invalid code and to report how much source was consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  /* Not referenced directly below; presumably read by ONE_MORE_BYTE.  */
  bool multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  ptrdiff_t char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of characters and
     LAST_OFFSET the character offset at which that run started; they
     drive the ADD_CHARSET_DATA annotations below.  */
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_DOS is set, or -1 if no such
     byte is pending.  */
  int byte_after_cr = -1;

  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      /* Number of source bytes already folded into CODE.  */
      int len = 1;
      unsigned code;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The output buffer is full.  If a look-ahead byte is still
             pending, step SRC_BASE back over it so that it is not
             reported as consumed.
             NOTE(review): CONSUMED_CHARS_BASE is not adjusted here --
             confirm that this is intended.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Use the byte that was read ahead in the previous
             iteration.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* Exactly one charset can start with this byte: read the
             remaining bytes of its dimension and decode.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          /* Try each candidate in turn.  Bytes already read stay in
             CODE; more are read only when a candidate's dimension
             exceeds LEN.  The first candidate that leaves C
             non-negative wins.  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* A run of a different non-ASCII charset starts here: close
             the previous run with an annotation (unless it was ASCII)
             and remember where the new one begins.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the first byte of the failed sequence, re-read it
         and emit it on its own: a negative value is negated, an ASCII
         value is kept, anything else goes through BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
    }

 no_more_source:
  /* Close the pending charset run, then report consumption only up to
     the start of the last sequence that was begun (SRC_BASE and
     CONSUMED_CHARS_BASE), not up to SRC.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5619
5620 static bool
5621 encode_coding_charset (struct coding_system *coding)
5622 {
5623   bool multibytep = coding->dst_multibyte;
5624   int *charbuf = coding->charbuf;
5625   int *charbuf_end = charbuf + coding->charbuf_used;
5626   unsigned char *dst = coding->destination + coding->produced;
5627   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5628   int safe_room = MAX_MULTIBYTE_LENGTH;
5629   ptrdiff_t produced_chars = 0;
5630   Lisp_Object attrs, charset_list;
5631   bool ascii_compatible;
5632   int c;
5633
5634   CODING_GET_INFO (coding, attrs, charset_list);
5635   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5636
5637   while (charbuf < charbuf_end)
5638     {
5639       struct charset *charset;
5640       unsigned code;
5641
5642       ASSURE_DESTINATION (safe_room);
5643       c = *charbuf++;
5644       if (ascii_compatible && ASCII_CHAR_P (c))
5645         EMIT_ONE_ASCII_BYTE (c);
5646       else if (CHAR_BYTE8_P (c))
5647         {
5648           c = CHAR_TO_BYTE8 (c);
5649           EMIT_ONE_BYTE (c);
5650         }
5651       else
5652         {
5653           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5654                                &code, charset);
5655
5656           if (charset)
5657             {
5658               if (CHARSET_DIMENSION (charset) == 1)
5659                 EMIT_ONE_BYTE (code);
5660               else if (CHARSET_DIMENSION (charset) == 2)
5661                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5662               else if (CHARSET_DIMENSION (charset) == 3)
5663                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5664               else
5665                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5666                                  (code >> 8) & 0xFF, code & 0xFF);
5667             }
5668           else
5669             {
5670               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5671                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5672               else
5673                 c = coding->default_char;
5674               EMIT_ONE_BYTE (c);
5675             }
5676         }
5677     }
5678
5679   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5680   coding->produced_char += produced_chars;
5681   coding->produced = dst - coding->destination;
5682   return 0;
5683 }
5684
5685 \f
/*** 10. C library functions ***/
5687
5688 /* Setup coding context CODING from information about CODING_SYSTEM.
5689    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5690    CODING_SYSTEM is invalid, signal an error.  */
5691
5692 void
5693 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5694 {
5695   Lisp_Object attrs;
5696   Lisp_Object eol_type;
5697   Lisp_Object coding_type;
5698   Lisp_Object val;
5699
5700   if (NILP (coding_system))
5701     coding_system = Qundecided;
5702
5703   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5704
5705   attrs = CODING_ID_ATTRS (coding->id);
5706   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5707
5708   coding->mode = 0;
5709   if (VECTORP (eol_type))
5710     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5711                             | CODING_REQUIRE_DETECTION_MASK);
5712   else if (! EQ (eol_type, Qunix))
5713     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5714                             | CODING_REQUIRE_ENCODING_MASK);
5715   else
5716     coding->common_flags = 0;
5717   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5718     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5719   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5720     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5721   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5722     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5723
5724   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5725   coding->max_charset_id = SCHARS (val) - 1;
5726   coding->safe_charsets = SDATA (val);
5727   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5728   coding->carryover_bytes = 0;
5729   coding->raw_destination = 0;
5730
5731   coding_type = CODING_ATTR_TYPE (attrs);
5732   if (EQ (coding_type, Qundecided))
5733     {
5734       coding->detector = NULL;
5735       coding->decoder = decode_coding_raw_text;
5736       coding->encoder = encode_coding_raw_text;
5737       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5738       coding->spec.undecided.inhibit_nbd
5739         = (encode_inhibit_flag
5740            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5741       coding->spec.undecided.inhibit_ied
5742         = (encode_inhibit_flag
5743            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5744       coding->spec.undecided.prefer_utf_8
5745         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5746     }
5747   else if (EQ (coding_type, Qiso_2022))
5748     {
5749       int i;
5750       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5751
5752       /* Invoke graphic register 0 to plane 0.  */
5753       CODING_ISO_INVOCATION (coding, 0) = 0;
5754       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5755       CODING_ISO_INVOCATION (coding, 1)
5756         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5757       /* Setup the initial status of designation.  */
5758       for (i = 0; i < 4; i++)
5759         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5760       /* Not single shifting initially.  */
5761       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5762       /* Beginning of buffer should also be regarded as bol. */
5763       CODING_ISO_BOL (coding) = 1;
5764       coding->detector = detect_coding_iso_2022;
5765       coding->decoder = decode_coding_iso_2022;
5766       coding->encoder = encode_coding_iso_2022;
5767       if (flags & CODING_ISO_FLAG_SAFE)
5768         coding->mode |= CODING_MODE_SAFE_ENCODING;
5769       coding->common_flags
5770         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5771             | CODING_REQUIRE_FLUSHING_MASK);
5772       if (flags & CODING_ISO_FLAG_COMPOSITION)
5773         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5774       if (flags & CODING_ISO_FLAG_DESIGNATION)
5775         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5776       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5777         {
5778           setup_iso_safe_charsets (attrs);
5779           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5780           coding->max_charset_id = SCHARS (val) - 1;
5781           coding->safe_charsets = SDATA (val);
5782         }
5783       CODING_ISO_FLAGS (coding) = flags;
5784       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5785       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5786       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5787       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5788     }
5789   else if (EQ (coding_type, Qcharset))
5790     {
5791       coding->detector = detect_coding_charset;
5792       coding->decoder = decode_coding_charset;
5793       coding->encoder = encode_coding_charset;
5794       coding->common_flags
5795         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5796     }
5797   else if (EQ (coding_type, Qutf_8))
5798     {
5799       val = AREF (attrs, coding_attr_utf_bom);
5800       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5801                                    : EQ (val, Qt) ? utf_with_bom
5802                                    : utf_without_bom);
5803       coding->detector = detect_coding_utf_8;
5804       coding->decoder = decode_coding_utf_8;
5805       coding->encoder = encode_coding_utf_8;
5806       coding->common_flags
5807         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5808       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5809         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5810     }
5811   else if (EQ (coding_type, Qutf_16))
5812     {
5813       val = AREF (attrs, coding_attr_utf_bom);
5814       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5815                                     : EQ (val, Qt) ? utf_with_bom
5816                                     : utf_without_bom);
5817       val = AREF (attrs, coding_attr_utf_16_endian);
5818       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5819                                        : utf_16_little_endian);
5820       CODING_UTF_16_SURROGATE (coding) = 0;
5821       coding->detector = detect_coding_utf_16;
5822       coding->decoder = decode_coding_utf_16;
5823       coding->encoder = encode_coding_utf_16;
5824       coding->common_flags
5825         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5826       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5827         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5828     }
5829   else if (EQ (coding_type, Qccl))
5830     {
5831       coding->detector = detect_coding_ccl;
5832       coding->decoder = decode_coding_ccl;
5833       coding->encoder = encode_coding_ccl;
5834       coding->common_flags
5835         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5836             | CODING_REQUIRE_FLUSHING_MASK);
5837     }
5838   else if (EQ (coding_type, Qemacs_mule))
5839     {
5840       coding->detector = detect_coding_emacs_mule;
5841       coding->decoder = decode_coding_emacs_mule;
5842       coding->encoder = encode_coding_emacs_mule;
5843       coding->common_flags
5844         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5845       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5846           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5847         {
5848           Lisp_Object tail, safe_charsets;
5849           int max_charset_id = 0;
5850
5851           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5852                tail = XCDR (tail))
5853             if (max_charset_id < XFASTINT (XCAR (tail)))
5854               max_charset_id = XFASTINT (XCAR (tail));
5855           safe_charsets = make_uninit_string (max_charset_id + 1);
5856           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5857           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5858                tail = XCDR (tail))
5859             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5860           coding->max_charset_id = max_charset_id;
5861           coding->safe_charsets = SDATA (safe_charsets);
5862         }
5863       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5864       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5865     }
5866   else if (EQ (coding_type, Qshift_jis))
5867     {
5868       coding->detector = detect_coding_sjis;
5869       coding->decoder = decode_coding_sjis;
5870       coding->encoder = encode_coding_sjis;
5871       coding->common_flags
5872         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5873     }
5874   else if (EQ (coding_type, Qbig5))
5875     {
5876       coding->detector = detect_coding_big5;
5877       coding->decoder = decode_coding_big5;
5878       coding->encoder = encode_coding_big5;
5879       coding->common_flags
5880         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5881     }
5882   else                          /* EQ (coding_type, Qraw_text) */
5883     {
5884       coding->detector = NULL;
5885       coding->decoder = decode_coding_raw_text;
5886       coding->encoder = encode_coding_raw_text;
5887       if (! EQ (eol_type, Qunix))
5888         {
5889           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5890           if (! VECTORP (eol_type))
5891             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5892         }
5893
5894     }
5895
5896   return;
5897 }
5898
5899 /* Return a list of charsets supported by CODING.  */
5900
5901 Lisp_Object
5902 coding_charset_list (struct coding_system *coding)
5903 {
5904   Lisp_Object attrs, charset_list;
5905
5906   CODING_GET_INFO (coding, attrs, charset_list);
5907   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5908     {
5909       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5910
5911       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5912         charset_list = Viso_2022_charset_list;
5913     }
5914   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5915     {
5916       charset_list = Vemacs_mule_charset_list;
5917     }
5918   return charset_list;
5919 }
5920
5921
5922 /* Return a list of charsets supported by CODING-SYSTEM.  */
5923
5924 Lisp_Object
5925 coding_system_charset_list (Lisp_Object coding_system)
5926 {
5927   ptrdiff_t id;
5928   Lisp_Object attrs, charset_list;
5929
5930   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5931   attrs = CODING_ID_ATTRS (id);
5932
5933   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5934     {
5935       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5936
5937       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5938         charset_list = Viso_2022_charset_list;
5939       else
5940         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5941     }
5942   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5943     {
5944       charset_list = Vemacs_mule_charset_list;
5945     }
5946   else
5947     {
5948       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5949     }
5950   return charset_list;
5951 }
5952
5953
5954 /* Return raw-text or one of its subsidiaries that has the same
5955    eol_type as CODING-SYSTEM.  */
5956
5957 Lisp_Object
5958 raw_text_coding_system (Lisp_Object coding_system)
5959 {
5960   Lisp_Object spec, attrs;
5961   Lisp_Object eol_type, raw_text_eol_type;
5962
5963   if (NILP (coding_system))
5964     return Qraw_text;
5965   spec = CODING_SYSTEM_SPEC (coding_system);
5966   attrs = AREF (spec, 0);
5967
5968   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5969     return coding_system;
5970
5971   eol_type = AREF (spec, 2);
5972   if (VECTORP (eol_type))
5973     return Qraw_text;
5974   spec = CODING_SYSTEM_SPEC (Qraw_text);
5975   raw_text_eol_type = AREF (spec, 2);
5976   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5977           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5978           : AREF (raw_text_eol_type, 2));
5979 }
5980
5981
5982 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5983    the subsidiary that has the same eol-spec as PARENT (if it is not
5984    nil and specifies end-of-line format) or the system's setting
5985    (system_eol_type).  */
5986
5987 Lisp_Object
5988 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5989 {
5990   Lisp_Object spec, eol_type;
5991
5992   if (NILP (coding_system))
5993     coding_system = Qraw_text;
5994   spec = CODING_SYSTEM_SPEC (coding_system);
5995   eol_type = AREF (spec, 2);
5996   if (VECTORP (eol_type))
5997     {
5998       Lisp_Object parent_eol_type;
5999
6000       if (! NILP (parent))
6001         {
6002           Lisp_Object parent_spec;
6003
6004           parent_spec = CODING_SYSTEM_SPEC (parent);
6005           parent_eol_type = AREF (parent_spec, 2);
6006           if (VECTORP (parent_eol_type))
6007             parent_eol_type = system_eol_type;
6008         }
6009       else
6010         parent_eol_type = system_eol_type;
6011       if (EQ (parent_eol_type, Qunix))
6012         coding_system = AREF (eol_type, 0);
6013       else if (EQ (parent_eol_type, Qdos))
6014         coding_system = AREF (eol_type, 1);
6015       else if (EQ (parent_eol_type, Qmac))
6016         coding_system = AREF (eol_type, 2);
6017     }
6018   return coding_system;
6019 }
6020
6021
6022 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6023    decided for writing to a process.  If not, complement them, and
6024    return a new coding system.  */
6025
6026 Lisp_Object
6027 complement_process_encoding_system (Lisp_Object coding_system)
6028 {
6029   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6030   Lisp_Object spec, attrs;
6031   int i;
6032
6033   for (i = 0; i < 3; i++)
6034     {
6035       if (i == 1)
6036         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6037       else if (i == 2)
6038         coding_system = preferred_coding_system ();
6039       spec = CODING_SYSTEM_SPEC (coding_system);
6040       if (NILP (spec))
6041         continue;
6042       attrs = AREF (spec, 0);
6043       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6044         coding_base = CODING_ATTR_BASE_NAME (attrs);
6045       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6046         eol_base = coding_system;
6047       if (! NILP (coding_base) && ! NILP (eol_base))
6048         break;
6049     }
6050
6051   if (i > 0)
6052     /* The original CODING_SYSTEM didn't specify text-conversion or
6053        eol-conversion.  Be sure that we return a fully complemented
6054        coding system.  */
6055     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6056   return coding_system;
6057 }
6058
6059
6060 /* Emacs has a mechanism to automatically detect a coding system if it
6061    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6062    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are classified into the following categories:
6065
6066    o coding-category-emacs-mule
6067
6068         The category for a coding system which has the same code range
6069         as Emacs' internal format.  Assigned the coding-system (Lisp
6070         symbol) `emacs-mule' by default.
6071
6072    o coding-category-sjis
6073
6074         The category for a coding system which has the same code range
6075         as SJIS.  Assigned the coding-system (Lisp
6076         symbol) `japanese-shift-jis' by default.
6077
6078    o coding-category-iso-7
6079
6080         The category for a coding system which has the same code range
6081         as ISO2022 of 7-bit environment.  This doesn't use any locking
6082         shift and single shift functions.  This can encode/decode all
6083         charsets.  Assigned the coding-system (Lisp symbol)
6084         `iso-2022-7bit' by default.
6085
6086    o coding-category-iso-7-tight
6087
6088         Same as coding-category-iso-7 except that this can
6089         encode/decode only the specified charsets.
6090
6091    o coding-category-iso-8-1
6092
6093         The category for a coding system which has the same code range
6094         as ISO2022 of 8-bit environment and graphic plane 1 used only
6095         for DIMENSION1 charset.  This doesn't use any locking shift
6096         and single shift functions.  Assigned the coding-system (Lisp
6097         symbol) `iso-latin-1' by default.
6098
6099    o coding-category-iso-8-2
6100
6101         The category for a coding system which has the same code range
6102         as ISO2022 of 8-bit environment and graphic plane 1 used only
6103         for DIMENSION2 charset.  This doesn't use any locking shift
6104         and single shift functions.  Assigned the coding-system (Lisp
6105         symbol) `japanese-iso-8bit' by default.
6106
6107    o coding-category-iso-7-else
6108
6109         The category for a coding system which has the same code range
6110         as ISO2022 of 7-bit environment but uses locking shift or
6111         single shift functions.  Assigned the coding-system (Lisp
6112         symbol) `iso-2022-7bit-lock' by default.
6113
6114    o coding-category-iso-8-else
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 8-bit environment but uses locking shift or
6118         single shift functions.  Assigned the coding-system (Lisp
6119         symbol) `iso-2022-8bit-ss2' by default.
6120
6121    o coding-category-big5
6122
6123         The category for a coding system which has the same code range
6124         as BIG5.  Assigned the coding-system (Lisp symbol)
6125         `cn-big5' by default.
6126
6127    o coding-category-utf-8
6128
6129         The category for a coding system which has the same code range
6130         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6131         symbol) `utf-8' by default.
6132
6133    o coding-category-utf-16-be
6134
6135         The category for a coding system in which a text has an
6136         Unicode signature (cf. Unicode Standard) in the order of BIG
6137         endian at the head.  Assigned the coding-system (Lisp symbol)
6138         `utf-16-be' by default.
6139
6140    o coding-category-utf-16-le
6141
6142         The category for a coding system in which a text has an
6143         Unicode signature (cf. Unicode Standard) in the order of
6144         LITTLE endian at the head.  Assigned the coding-system (Lisp
6145         symbol) `utf-16-le' by default.
6146
6147    o coding-category-ccl
6148
6149         The category for a coding system of which encoder/decoder is
6150         written in CCL programs.  The default value is nil, i.e., no
6151         coding system is assigned.
6152
6153    o coding-category-binary
6154
6155         The category for a coding system not categorized in any of the
6156         above.  Assigned the coding-system (Lisp symbol)
6157         `no-conversion' by default.
6158
6159    Each of them is a Lisp symbol and the value is an actual
6160    `coding-system's (this is also a Lisp symbol) assigned by a user.
6161    What Emacs does actually is to detect a category of coding system.
6162    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6163    decide only one possible category, it selects a category of the
6164    highest priority.  Priorities of categories are also specified by a
6165    user in a Lisp variable `coding-category-list'.
6166
6167 */
6168
/* Forward declaration of a file-local function; its definition is not
   in this part of the file.  */
static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
                                           int eol_seen);
6171
6172
6173 /* Return the number of ASCII characters at the head of the source.
6174    By side effects, set coding->head_ascii and update
6175    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6176    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6177    reliable only when all the source bytes are ASCII.  */
6178
6179 static ptrdiff_t
6180 check_ascii (struct coding_system *coding)
6181 {
6182   const unsigned char *src, *end;
6183   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6184   int eol_seen = coding->eol_seen;
6185
6186   coding_set_source (coding);
6187   src = coding->source;
6188   end = src + coding->src_bytes;
6189
6190   if (inhibit_eol_conversion
6191       || SYMBOLP (eol_type))
6192     {
6193       /* We don't have to check EOL format.  */
6194       while (src < end && !( *src & 0x80))
6195         {
6196           if (*src++ == '\n')
6197             eol_seen |= EOL_SEEN_LF;
6198         }
6199     }
6200   else
6201     {
6202       end--;                /* We look ahead one byte for "CR LF".  */
6203       while (src < end)
6204         {
6205           int c = *src;
6206
6207           if (c & 0x80)
6208             break;
6209           src++;
6210           if (c == '\r')
6211             {
6212               if (*src == '\n')
6213                 {
6214                   eol_seen |= EOL_SEEN_CRLF;
6215                   src++;
6216                 }
6217               else
6218                 eol_seen |= EOL_SEEN_CR;
6219             }
6220           else if (c == '\n')
6221             eol_seen |= EOL_SEEN_LF;
6222         }
6223       if (src == end)
6224         {
6225           int c = *src;
6226
6227           /* All bytes but the last one C are ASCII.  */
6228           if (! (c & 0x80))
6229             {
6230               if (c == '\r')
6231                 eol_seen |= EOL_SEEN_CR;
6232               else if (c  == '\n')
6233                 eol_seen |= EOL_SEEN_LF;
6234               src++;
6235             }
6236         }
6237     }
6238   coding->head_ascii = src - coding->source;
6239   coding->eol_seen = eol_seen;
6240   return (coding->head_ascii);
6241 }
6242
6243
6244 /* Return the number of characters at the source if all the bytes are
6245    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6246    effects, update coding->eol_seen.  The value of coding->eol_seen is
6247    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6248    the value is reliable only when all the source bytes are valid
6249    UTF-8.  */
6250
6251 static ptrdiff_t
6252 check_utf_8 (struct coding_system *coding)
6253 {
6254   const unsigned char *src, *end;
6255   int eol_seen;
6256   ptrdiff_t nchars = coding->head_ascii;
6257
6258   if (coding->head_ascii < 0)
6259     check_ascii (coding);
6260   else
6261     coding_set_source (coding);
6262   src = coding->source + coding->head_ascii;
6263   /* We look ahead one byte for CR LF.  */
6264   end = coding->source + coding->src_bytes - 1;
6265   eol_seen = coding->eol_seen;
6266   while (src < end)
6267     {
6268       int c = *src;
6269
6270       if (UTF_8_1_OCTET_P (*src))
6271         {
6272           src++;
6273           if (c < 0x20)
6274             {
6275               if (c == '\r')
6276                 {
6277                   if (*src == '\n')
6278                     {
6279                       eol_seen |= EOL_SEEN_CRLF;
6280                       src++;
6281                       nchars++;
6282                     }
6283                   else
6284                     eol_seen |= EOL_SEEN_CR;
6285                 }
6286               else if (c == '\n')
6287                 eol_seen |= EOL_SEEN_LF;
6288             }
6289         }
6290       else if (UTF_8_2_OCTET_LEADING_P (c))
6291         {
6292           if (c < 0xC2          /* overlong sequence */
6293               || src + 1 >= end
6294               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6295             return -1;
6296           src += 2;
6297         }
6298       else if (UTF_8_3_OCTET_LEADING_P (c))
6299         {
6300           if (src + 2 >= end
6301               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6302                     && UTF_8_EXTRA_OCTET_P (src[2])))
6303             return -1;
6304           c = (((c & 0xF) << 12)
6305                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6306           if (c < 0x800                       /* overlong sequence */
6307               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6308             return -1;
6309           src += 3;
6310         }
6311       else if (UTF_8_4_OCTET_LEADING_P (c))
6312         {
6313           if (src + 3 >= end
6314               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6315                     && UTF_8_EXTRA_OCTET_P (src[2])
6316                     && UTF_8_EXTRA_OCTET_P (src[3])))
6317             return -1;
6318           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6319                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6320           if (c < 0x10000       /* overlong sequence */
6321               || c >= 0x110000) /* non-Unicode character  */
6322             return -1;
6323           src += 4;
6324         }
6325       else
6326         return -1;
6327       nchars++;
6328     }
6329
6330   if (src == end)
6331     {
6332       if (! UTF_8_1_OCTET_P (*src))
6333         return -1;
6334       nchars++;
6335       if (*src == '\r')
6336         eol_seen |= EOL_SEEN_CR;
6337       else if (*src  == '\n')
6338         eol_seen |= EOL_SEEN_LF;
6339     }
6340   coding->eol_seen = eol_seen;
6341   return nchars;
6342 }
6343
6344
6345 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6346    SOURCE is encoded.  If CATEGORY is one of
6347    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6348    two-byte, else they are encoded by one-byte.
6349
6350    Return one of EOL_SEEN_XXX.  */
6351
6352 #define MAX_EOL_CHECK_COUNT 3
6353
6354 static int
6355 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6356             enum coding_category category)
6357 {
6358   const unsigned char *src = source, *src_end = src + src_bytes;
6359   unsigned char c;
6360   int total  = 0;
6361   int eol_seen = EOL_SEEN_NONE;
6362
6363   if ((1 << category) & CATEGORY_MASK_UTF_16)
6364     {
6365       bool msb = category == (coding_category_utf_16_le
6366                               | coding_category_utf_16_le_nosig);
6367       bool lsb = !msb;
6368
6369       while (src + 1 < src_end)
6370         {
6371           c = src[lsb];
6372           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6373             {
6374               int this_eol;
6375
6376               if (c == '\n')
6377                 this_eol = EOL_SEEN_LF;
6378               else if (src + 3 >= src_end
6379                        || src[msb + 2] != 0
6380                        || src[lsb + 2] != '\n')
6381                 this_eol = EOL_SEEN_CR;
6382               else
6383                 {
6384                   this_eol = EOL_SEEN_CRLF;
6385                   src += 2;
6386                 }
6387
6388               if (eol_seen == EOL_SEEN_NONE)
6389                 /* This is the first end-of-line.  */
6390                 eol_seen = this_eol;
6391               else if (eol_seen != this_eol)
6392                 {
6393                   /* The found type is different from what found before.
6394                      Allow for stray ^M characters in DOS EOL files.  */
6395                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6396                       || (eol_seen == EOL_SEEN_CRLF
6397                           && this_eol == EOL_SEEN_CR))
6398                     eol_seen = EOL_SEEN_CRLF;
6399                   else
6400                     {
6401                       eol_seen = EOL_SEEN_LF;
6402                       break;
6403                     }
6404                 }
6405               if (++total == MAX_EOL_CHECK_COUNT)
6406                 break;
6407             }
6408           src += 2;
6409         }
6410     }
6411   else
6412     while (src < src_end)
6413       {
6414         c = *src++;
6415         if (c == '\n' || c == '\r')
6416           {
6417             int this_eol;
6418
6419             if (c == '\n')
6420               this_eol = EOL_SEEN_LF;
6421             else if (src >= src_end || *src != '\n')
6422               this_eol = EOL_SEEN_CR;
6423             else
6424               this_eol = EOL_SEEN_CRLF, src++;
6425
6426             if (eol_seen == EOL_SEEN_NONE)
6427               /* This is the first end-of-line.  */
6428               eol_seen = this_eol;
6429             else if (eol_seen != this_eol)
6430               {
6431                 /* The found type is different from what found before.
6432                    Allow for stray ^M characters in DOS EOL files.  */
6433                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6434                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6435                   eol_seen = EOL_SEEN_CRLF;
6436                 else
6437                   {
6438                     eol_seen = EOL_SEEN_LF;
6439                     break;
6440                   }
6441               }
6442             if (++total == MAX_EOL_CHECK_COUNT)
6443               break;
6444           }
6445       }
6446   return eol_seen;
6447 }
6448
6449
6450 static Lisp_Object
6451 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6452 {
6453   Lisp_Object eol_type;
6454
6455   eol_type = CODING_ID_EOL_TYPE (coding->id);
6456   if (! VECTORP (eol_type))
6457     /* Already adjusted.  */
6458     return eol_type;
6459   if (eol_seen & EOL_SEEN_LF)
6460     {
6461       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6462       eol_type = Qunix;
6463     }
6464   else if (eol_seen & EOL_SEEN_CRLF)
6465     {
6466       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6467       eol_type = Qdos;
6468     }
6469   else if (eol_seen & EOL_SEEN_CR)
6470     {
6471       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6472       eol_type = Qmac;
6473     }
6474   return eol_type;
6475 }
6476
6477 /* Detect how a text specified in CODING is encoded.  If a coding
6478    system is detected, update fields of CODING by the detected coding
6479    system.  */
6480
6481 static void
6482 detect_coding (struct coding_system *coding)
6483 {
6484   const unsigned char *src, *src_end;
6485   unsigned int saved_mode = coding->mode;
6486   Lisp_Object found = Qnil;
6487   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6488
6489   coding->consumed = coding->consumed_char = 0;
6490   coding->produced = coding->produced_char = 0;
6491   coding_set_source (coding);
6492
6493   src_end = coding->source + coding->src_bytes;
6494
6495   coding->eol_seen = EOL_SEEN_NONE;
6496   /* If we have not yet decided the text encoding type, detect it
6497      now.  */
6498   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6499     {
6500       int c, i;
6501       struct coding_detection_info detect_info;
6502       bool null_byte_found = 0, eight_bit_found = 0;
6503       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6504                                        inhibit_null_byte_detection);
6505       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6506                                        inhibit_iso_escape_detection);
6507       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6508
6509       coding->head_ascii = 0;
6510       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6511       for (src = coding->source; src < src_end; src++)
6512         {
6513           c = *src;
6514           if (c & 0x80)
6515             {
6516               eight_bit_found = 1;
6517               if (null_byte_found)
6518                 break;
6519             }
6520           else if (c < 0x20)
6521             {
6522               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6523                   && ! inhibit_ied
6524                   && ! detect_info.checked)
6525                 {
6526                   if (detect_coding_iso_2022 (coding, &detect_info))
6527                     {
6528                       /* We have scanned the whole data.  */
6529                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6530                         {
6531                           /* We didn't find an 8-bit code.  We may
6532                              have found a null-byte, but it's very
6533                              rare that a binary file conforms to
6534                              ISO-2022.  */
6535                           src = src_end;
6536                           coding->head_ascii = src - coding->source;
6537                         }
6538                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6539                       break;
6540                     }
6541                 }
6542               else if (! c && !inhibit_nbd)
6543                 {
6544                   null_byte_found = 1;
6545                   if (eight_bit_found)
6546                     break;
6547                 }
6548               else if (! disable_ascii_optimization
6549                        && ! inhibit_eol_conversion)
6550                 {
6551                   if (c == '\r')
6552                     {
6553                       if (src < src_end && src[1] == '\n')
6554                         {
6555                           coding->eol_seen |= EOL_SEEN_CRLF;
6556                           src++;
6557                           if (! eight_bit_found)
6558                             coding->head_ascii++;
6559                         }
6560                       else
6561                         coding->eol_seen |= EOL_SEEN_CR;
6562                     }
6563                   else if (c == '\n')
6564                     {
6565                       coding->eol_seen |= EOL_SEEN_LF;
6566                     }
6567                 }
6568
6569               if (! eight_bit_found)
6570                 coding->head_ascii++;
6571             }
6572           else if (! eight_bit_found)
6573             coding->head_ascii++;
6574         }
6575
6576       if (null_byte_found || eight_bit_found
6577           || coding->head_ascii < coding->src_bytes
6578           || detect_info.found)
6579         {
6580           enum coding_category category;
6581           struct coding_system *this;
6582
6583           if (coding->head_ascii == coding->src_bytes)
6584             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6585             for (i = 0; i < coding_category_raw_text; i++)
6586               {
6587                 category = coding_priorities[i];
6588                 this = coding_categories + category;
6589                 if (detect_info.found & (1 << category))
6590                   break;
6591               }
6592           else
6593             {
6594               if (null_byte_found)
6595                 {
6596                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6597                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6598                 }
6599               else if (prefer_utf_8
6600                        && detect_coding_utf_8 (coding, &detect_info))
6601                 {
6602                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6603                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6604                 }
6605               for (i = 0; i < coding_category_raw_text; i++)
6606                 {
6607                   category = coding_priorities[i];
6608                   this = coding_categories + category;
6609                   /* Some of this->detector (e.g. detect_coding_sjis)
6610                      require this information.  */
6611                   coding->id = this->id;
6612                   if (this->id < 0)
6613                     {
6614                       /* No coding system of this category is defined.  */
6615                       detect_info.rejected |= (1 << category);
6616                     }
6617                   else if (category >= coding_category_raw_text)
6618                     continue;
6619                   else if (detect_info.checked & (1 << category))
6620                     {
6621                       if (detect_info.found & (1 << category))
6622                         break;
6623                     }
6624                   else if ((*(this->detector)) (coding, &detect_info)
6625                            && detect_info.found & (1 << category))
6626                     break;
6627                 }
6628             }
6629
6630           if (i < coding_category_raw_text)
6631             {
6632               if (category == coding_category_utf_8_auto)
6633                 {
6634                   Lisp_Object coding_systems;
6635
6636                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6637                                          coding_attr_utf_bom);
6638                   if (CONSP (coding_systems))
6639                     {
6640                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6641                         found = XCAR (coding_systems);
6642                       else
6643                         found = XCDR (coding_systems);
6644                     }
6645                   else
6646                     found = CODING_ID_NAME (this->id);
6647                 }
6648               else if (category == coding_category_utf_16_auto)
6649                 {
6650                   Lisp_Object coding_systems;
6651
6652                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6653                                          coding_attr_utf_bom);
6654                   if (CONSP (coding_systems))
6655                     {
6656                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6657                         found = XCAR (coding_systems);
6658                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6659                         found = XCDR (coding_systems);
6660                     }
6661                   else
6662                     found = CODING_ID_NAME (this->id);
6663                 }
6664               else
6665                 found = CODING_ID_NAME (this->id);
6666             }
6667           else if (null_byte_found)
6668             found = Qno_conversion;
6669           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6670                    == CATEGORY_MASK_ANY)
6671             found = Qraw_text;
6672           else if (detect_info.rejected)
6673             for (i = 0; i < coding_category_raw_text; i++)
6674               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6675                 {
6676                   this = coding_categories + coding_priorities[i];
6677                   found = CODING_ID_NAME (this->id);
6678                   break;
6679                 }
6680         }
6681     }
6682   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6683            == coding_category_utf_8_auto)
6684     {
6685       Lisp_Object coding_systems;
6686       struct coding_detection_info detect_info;
6687
6688       coding_systems
6689         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6690       detect_info.found = detect_info.rejected = 0;
6691       if (check_ascii (coding) == coding->src_bytes)
6692         {
6693           if (CONSP (coding_systems))
6694             found = XCDR (coding_systems);
6695         }
6696       else
6697         {
6698           if (CONSP (coding_systems)
6699               && detect_coding_utf_8 (coding, &detect_info))
6700             {
6701               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6702                 found = XCAR (coding_systems);
6703               else
6704                 found = XCDR (coding_systems);
6705             }
6706         }
6707     }
6708   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6709            == coding_category_utf_16_auto)
6710     {
6711       Lisp_Object coding_systems;
6712       struct coding_detection_info detect_info;
6713
6714       coding_systems
6715         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6716       detect_info.found = detect_info.rejected = 0;
6717       coding->head_ascii = 0;
6718       if (CONSP (coding_systems)
6719           && detect_coding_utf_16 (coding, &detect_info))
6720         {
6721           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6722             found = XCAR (coding_systems);
6723           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6724             found = XCDR (coding_systems);
6725         }
6726     }
6727
6728   if (! NILP (found))
6729     {
6730       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6731                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6732                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6733                            : EOL_SEEN_LF);
6734
6735       setup_coding_system (found, coding);
6736       if (specified_eol != EOL_SEEN_NONE)
6737         adjust_coding_eol_type (coding, specified_eol);
6738     }
6739
6740   coding->mode = saved_mode;
6741 }
6742
6743
6744 static void
6745 decode_eol (struct coding_system *coding)
6746 {
6747   Lisp_Object eol_type;
6748   unsigned char *p, *pbeg, *pend;
6749
6750   eol_type = CODING_ID_EOL_TYPE (coding->id);
6751   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6752     return;
6753
6754   if (NILP (coding->dst_object))
6755     pbeg = coding->destination;
6756   else
6757     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6758   pend = pbeg + coding->produced;
6759
6760   if (VECTORP (eol_type))
6761     {
6762       int eol_seen = EOL_SEEN_NONE;
6763
6764       for (p = pbeg; p < pend; p++)
6765         {
6766           if (*p == '\n')
6767             eol_seen |= EOL_SEEN_LF;
6768           else if (*p == '\r')
6769             {
6770               if (p + 1 < pend && *(p + 1) == '\n')
6771                 {
6772                   eol_seen |= EOL_SEEN_CRLF;
6773                   p++;
6774                 }
6775               else
6776                 eol_seen |= EOL_SEEN_CR;
6777             }
6778         }
6779       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6780       if ((eol_seen & EOL_SEEN_CRLF) != 0
6781           && (eol_seen & EOL_SEEN_CR) != 0
6782           && (eol_seen & EOL_SEEN_LF) == 0)
6783         eol_seen = EOL_SEEN_CRLF;
6784       else if (eol_seen != EOL_SEEN_NONE
6785           && eol_seen != EOL_SEEN_LF
6786           && eol_seen != EOL_SEEN_CRLF
6787           && eol_seen != EOL_SEEN_CR)
6788         eol_seen = EOL_SEEN_LF;
6789       if (eol_seen != EOL_SEEN_NONE)
6790         eol_type = adjust_coding_eol_type (coding, eol_seen);
6791     }
6792
6793   if (EQ (eol_type, Qmac))
6794     {
6795       for (p = pbeg; p < pend; p++)
6796         if (*p == '\r')
6797           *p = '\n';
6798     }
6799   else if (EQ (eol_type, Qdos))
6800     {
6801       ptrdiff_t n = 0;
6802
6803       if (NILP (coding->dst_object))
6804         {
6805           /* Start deleting '\r' from the tail to minimize the memory
6806              movement.  */
6807           for (p = pend - 2; p >= pbeg; p--)
6808             if (*p == '\r')
6809               {
6810                 memmove (p, p + 1, pend-- - p - 1);
6811                 n++;
6812               }
6813         }
6814       else
6815         {
6816           ptrdiff_t pos_byte = coding->dst_pos_byte;
6817           ptrdiff_t pos = coding->dst_pos;
6818           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6819
6820           while (pos < pos_end)
6821             {
6822               p = BYTE_POS_ADDR (pos_byte);
6823               if (*p == '\r' && p[1] == '\n')
6824                 {
6825                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6826                   n++;
6827                   pos_end--;
6828                 }
6829               pos++;
6830               if (coding->dst_multibyte)
6831                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6832               else
6833                 pos_byte++;
6834             }
6835         }
6836       coding->produced -= n;
6837       coding->produced_char -= n;
6838     }
6839 }
6840
6841
6842 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6843    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6844    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6845 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6846
6847 /* Return a translation table (or list of them) from coding system
6848    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6849    not ENCODEP). */
6850
6851 static Lisp_Object
6852 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6853 {
6854   Lisp_Object standard, translation_table;
6855   Lisp_Object val;
6856
6857   if (NILP (Venable_character_translation))
6858     {
6859       if (max_lookup)
6860         *max_lookup = 0;
6861       return Qnil;
6862     }
6863   if (encodep)
6864     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6865       standard = Vstandard_translation_table_for_encode;
6866   else
6867     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6868       standard = Vstandard_translation_table_for_decode;
6869   if (NILP (translation_table))
6870     translation_table = standard;
6871   else
6872     {
6873       if (SYMBOLP (translation_table))
6874         translation_table = Fget (translation_table, Qtranslation_table);
6875       else if (CONSP (translation_table))
6876         {
6877           translation_table = Fcopy_sequence (translation_table);
6878           for (val = translation_table; CONSP (val); val = XCDR (val))
6879             if (SYMBOLP (XCAR (val)))
6880               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6881         }
6882       if (CHAR_TABLE_P (standard))
6883         {
6884           if (CONSP (translation_table))
6885             translation_table = nconc2 (translation_table, list1 (standard));
6886           else
6887             translation_table = list2 (translation_table, standard);
6888         }
6889     }
6890
6891   if (max_lookup)
6892     {
6893       *max_lookup = 1;
6894       if (CHAR_TABLE_P (translation_table)
6895           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6896         {
6897           val = XCHAR_TABLE (translation_table)->extras[1];
6898           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6899             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6900         }
6901       else if (CONSP (translation_table))
6902         {
6903           Lisp_Object tail;
6904
6905           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6906             if (CHAR_TABLE_P (XCAR (tail))
6907                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6908               {
6909                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6910                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6911                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6912               }
6913         }
6914     }
6915   return translation_table;
6916 }
6917
/* Look up character C in TABLE, which is a char-table, a list of
   char-tables, or anything else (in which case nothing is looked
   up).

   If an entry is a character, C is replaced by it; with a list, the
   lookup then continues in the following tables using the new C.  If
   an entry is non-nil and not a character, the lookup stops there and
   TRANS is set to that entry.  Otherwise TRANS ends up Qnil.

   C and TRANS must be lvalues.  Every argument is evaluated more
   than once, so none of them may have side effects.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (table, c);                            \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            (c) = XFASTINT (trans);                                     \
            (trans) = Qnil;                                             \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);              \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), c);        \
              if (CHARACTERP (trans))                                   \
                {                                                       \
                  (c) = XFASTINT (trans);                               \
                  (trans) = Qnil;                                       \
                }                                                       \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6942
6943
6944 /* Return a translation of character(s) at BUF according to TRANS.
6945    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6946    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6947    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6948    translation is found, and Qnil if not found..
6949    If BUF is too short to lookup characters in FROM, return Qt.  */
6950
6951 static Lisp_Object
6952 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6953 {
6954
6955   if (INTEGERP (trans))
6956     return trans;
6957   for (; CONSP (trans); trans = XCDR (trans))
6958     {
6959       Lisp_Object val = XCAR (trans);
6960       Lisp_Object from = XCAR (val);
6961       ptrdiff_t len = ASIZE (from);
6962       ptrdiff_t i;
6963
6964       for (i = 0; i < len; i++)
6965         {
6966           if (buf + i == buf_end)
6967             return Qt;
6968           if (XINT (AREF (from, i)) != buf[i])
6969             break;
6970         }
6971       if (i == len)
6972         return val;
6973     }
6974   return Qnil;
6975 }
6976
6977
6978 static int
6979 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6980                bool last_block)
6981 {
6982   unsigned char *dst = coding->destination + coding->produced;
6983   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6984   ptrdiff_t produced;
6985   ptrdiff_t produced_chars = 0;
6986   int carryover = 0;
6987
6988   if (! coding->chars_at_source)
6989     {
6990       /* Source characters are in coding->charbuf.  */
6991       int *buf = coding->charbuf;
6992       int *buf_end = buf + coding->charbuf_used;
6993
6994       if (EQ (coding->src_object, coding->dst_object)
6995           && ! NILP (coding->dst_object))
6996         {
6997           eassert (growable_destination (coding));
6998           coding_set_source (coding);
6999           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7000         }
7001
7002       while (buf < buf_end)
7003         {
7004           int c = *buf;
7005           ptrdiff_t i;
7006
7007           if (c >= 0)
7008             {
7009               ptrdiff_t from_nchars = 1, to_nchars = 1;
7010               Lisp_Object trans = Qnil;
7011
7012               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7013               if (! NILP (trans))
7014                 {
7015                   trans = get_translation (trans, buf, buf_end);
7016                   if (INTEGERP (trans))
7017                     c = XINT (trans);
7018                   else if (CONSP (trans))
7019                     {
7020                       from_nchars = ASIZE (XCAR (trans));
7021                       trans = XCDR (trans);
7022                       if (INTEGERP (trans))
7023                         c = XINT (trans);
7024                       else
7025                         {
7026                           to_nchars = ASIZE (trans);
7027                           c = XINT (AREF (trans, 0));
7028                         }
7029                     }
7030                   else if (EQ (trans, Qt) && ! last_block)
7031                     break;
7032                 }
7033
7034               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7035                 {
7036                   eassert (growable_destination (coding));
7037                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7038                        / MAX_MULTIBYTE_LENGTH)
7039                       < to_nchars)
7040                     memory_full (SIZE_MAX);
7041                   dst = alloc_destination (coding,
7042                                            buf_end - buf
7043                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7044                                            dst);
7045                   if (EQ (coding->src_object, coding->dst_object))
7046                     {
7047                       coding_set_source (coding);
7048                       dst_end = (((unsigned char *) coding->source)
7049                                  + coding->consumed);
7050                     }
7051                   else
7052                     dst_end = coding->destination + coding->dst_bytes;
7053                 }
7054
7055               for (i = 0; i < to_nchars; i++)
7056                 {
7057                   if (i > 0)
7058                     c = XINT (AREF (trans, i));
7059                   if (coding->dst_multibyte
7060                       || ! CHAR_BYTE8_P (c))
7061                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7062                   else
7063                     *dst++ = CHAR_TO_BYTE8 (c);
7064                 }
7065               produced_chars += to_nchars;
7066               buf += from_nchars;
7067             }
7068           else
7069             /* This is an annotation datum.  (-C) is the length.  */
7070             buf += -c;
7071         }
7072       carryover = buf_end - buf;
7073     }
7074   else
7075     {
7076       /* Source characters are at coding->source.  */
7077       const unsigned char *src = coding->source;
7078       const unsigned char *src_end = src + coding->consumed;
7079
7080       if (EQ (coding->dst_object, coding->src_object))
7081         {
7082           eassert (growable_destination (coding));
7083           dst_end = (unsigned char *) src;
7084         }
7085       if (coding->src_multibyte != coding->dst_multibyte)
7086         {
7087           if (coding->src_multibyte)
7088             {
7089               bool multibytep = 1;
7090               ptrdiff_t consumed_chars = 0;
7091
7092               while (1)
7093                 {
7094                   const unsigned char *src_base = src;
7095                   int c;
7096
7097                   ONE_MORE_BYTE (c);
7098                   if (dst == dst_end)
7099                     {
7100                       eassert (growable_destination (coding));
7101                       if (EQ (coding->src_object, coding->dst_object))
7102                         dst_end = (unsigned char *) src;
7103                       if (dst == dst_end)
7104                         {
7105                           ptrdiff_t offset = src - coding->source;
7106
7107                           dst = alloc_destination (coding, src_end - src + 1,
7108                                                    dst);
7109                           dst_end = coding->destination + coding->dst_bytes;
7110                           coding_set_source (coding);
7111                           src = coding->source + offset;
7112                           src_end = coding->source + coding->consumed;
7113                           if (EQ (coding->src_object, coding->dst_object))
7114                             dst_end = (unsigned char *) src;
7115                         }
7116                     }
7117                   *dst++ = c;
7118                   produced_chars++;
7119                 }
7120             no_more_source:
7121               ;
7122             }
7123           else
7124             while (src < src_end)
7125               {
7126                 bool multibytep = 1;
7127                 int c = *src++;
7128
7129                 if (dst >= dst_end - 1)
7130                   {
7131                     eassert (growable_destination (coding));
7132                     if (EQ (coding->src_object, coding->dst_object))
7133                       dst_end = (unsigned char *) src;
7134                     if (dst >= dst_end - 1)
7135                       {
7136                         ptrdiff_t offset = src - coding->source;
7137                         ptrdiff_t more_bytes;
7138
7139                         if (EQ (coding->src_object, coding->dst_object))
7140                           more_bytes = ((src_end - src) / 2) + 2;
7141                         else
7142                           more_bytes = src_end - src + 2;
7143                         dst = alloc_destination (coding, more_bytes, dst);
7144                         dst_end = coding->destination + coding->dst_bytes;
7145                         coding_set_source (coding);
7146                         src = coding->source + offset;
7147                         src_end = coding->source + coding->consumed;
7148                         if (EQ (coding->src_object, coding->dst_object))
7149                           dst_end = (unsigned char *) src;
7150                       }
7151                   }
7152                 EMIT_ONE_BYTE (c);
7153               }
7154         }
7155       else
7156         {
7157           if (!EQ (coding->src_object, coding->dst_object))
7158             {
7159               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7160
7161               if (require > 0)
7162                 {
7163                   ptrdiff_t offset = src - coding->source;
7164
7165                   dst = alloc_destination (coding, require, dst);
7166                   coding_set_source (coding);
7167                   src = coding->source + offset;
7168                   src_end = coding->source + coding->consumed;
7169                 }
7170             }
7171           produced_chars = coding->consumed_char;
7172           while (src < src_end)
7173             *dst++ = *src++;
7174         }
7175     }
7176
7177   produced = dst - (coding->destination + coding->produced);
7178   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7179     insert_from_gap (produced_chars, produced, 0);
7180   coding->produced += produced;
7181   coding->produced_char += produced_chars;
7182   return carryover;
7183 }
7184
7185 /* Compose text in CODING->object according to the annotation data at
7186    CHARBUF.  CHARBUF is an array:
7187      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7188  */
7189
7190 static void
7191 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7192 {
7193   int len;
7194   ptrdiff_t to;
7195   enum composition_method method;
7196   Lisp_Object components;
7197
7198   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7199   to = pos + charbuf[2];
7200   method = (enum composition_method) (charbuf[4]);
7201
7202   if (method == COMPOSITION_RELATIVE)
7203     components = Qnil;
7204   else
7205     {
7206       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7207       int i, j;
7208
7209       if (method == COMPOSITION_WITH_RULE)
7210         len = charbuf[2] * 3 - 2;
7211       charbuf += MAX_ANNOTATION_LENGTH;
7212       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7213       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7214         {
7215           if (charbuf[i] >= 0)
7216             args[j] = make_number (charbuf[i]);
7217           else
7218             {
7219               i++;
7220               args[j] = make_number (charbuf[i] % 0x100);
7221             }
7222         }
7223       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7224     }
7225   compose_text (pos, to, components, Qnil, coding->dst_object);
7226 }
7227
7228
7229 /* Put `charset' property on text in CODING->object according to
7230    the annotation data at CHARBUF.  CHARBUF is an array:
7231      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7232  */
7233
7234 static void
7235 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7236 {
7237   ptrdiff_t from = pos - charbuf[2];
7238   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7239
7240   Fput_text_property (make_number (from), make_number (pos),
7241                       Qcharset, CHARSET_NAME (charset),
7242                       coding->dst_object);
7243 }
7244
/* Upper bound on the number of ints ALLOC_CONVERSION_WORK_AREA
   allocates for a charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Give CODING a work area of SIZE + MAX_CHARBUF_EXTRA_SIZE ints,
   capped at MAX_CHARBUF_SIZE ints: the memory comes from SAFE_ALLOCA
   and is stored in CODING->charbuf, the element count in
   CODING->charbuf_size.  Both users in this file declare
   USE_SAFE_ALLOCA and call SAFE_FREE before returning.

   CODING is parenthesized in the expansion so that any pointer
   expression (not just a plain identifier) may be passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7258
7259 static void
7260 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7261 {
7262   int *charbuf = coding->charbuf;
7263   int *charbuf_end = charbuf + coding->charbuf_used;
7264
7265   if (NILP (coding->dst_object))
7266     return;
7267
7268   while (charbuf < charbuf_end)
7269     {
7270       if (*charbuf >= 0)
7271         pos++, charbuf++;
7272       else
7273         {
7274           int len = -*charbuf;
7275
7276           if (len > 2)
7277             switch (charbuf[1])
7278               {
7279               case CODING_ANNOTATE_COMPOSITION_MASK:
7280                 produce_composition (coding, charbuf, pos);
7281                 break;
7282               case CODING_ANNOTATE_CHARSET_MASK:
7283                 produce_charset (coding, charbuf, pos);
7284                 break;
7285               }
7286           charbuf += len;
7287         }
7288     }
7289 }
7290
7291 /* Decode the data at CODING->src_object into CODING->dst_object.
7292    CODING->src_object is a buffer, a string, or nil.
7293    CODING->dst_object is a buffer.
7294
7295    If CODING->src_object is a buffer, it must be the current buffer.
7296    In this case, if CODING->src_pos is positive, it is a position of
7297    the source text in the buffer, otherwise, the source text is in the
7298    gap area of the buffer, and CODING->src_pos specifies the offset of
7299    the text from GPT (which must be the same as PT).  If this is the
7300    same buffer as CODING->dst_object, CODING->src_pos must be
7301    negative.
7302
7303    If CODING->src_object is a string, CODING->src_pos is an index to
7304    that string.
7305
7306    If CODING->src_object is nil, CODING->source must already point to
7307    the non-relocatable memory area.  In this case, CODING->src_pos is
7308    an offset from CODING->source.
7309
7310    The decoded data is inserted at the current point of the buffer
7311    CODING->dst_object.

        Note that every buffer-specific step below is guarded by
        BUFFERP (CODING->dst_object), so the body also runs with a
        non-buffer destination.

        Outline:
        1. If the source is buffer text that straddles the gap, the gap
           is moved to the start of the source.
        2. For a buffer destination, that buffer is made current, the
           gap is moved to point, and undo recording is switched off
           until the very end.
        3. CODING->decoder and produce_chars are called in a loop; the
           value returned by produce_chars is the number of trailing
           charbuf elements that are moved to the front of charbuf for
           the next round.
        4. Source bytes still unconsumed after the loop are emitted as
           raw bytes if CODING_MODE_LAST_BLOCK is set, and saved in
           CODING->carryover otherwise.
        5. decode_eol is run unless the eol type is Qunix or
           inhibit_eol_conversion is set.
7312 */
7313
7314 static void
7315 decode_coding (struct coding_system *coding)
7316 {
7317   Lisp_Object attrs;
7318   Lisp_Object undo_list;
7319   Lisp_Object translation_table;
7320   struct ccl_spec cclspec;
7321   int carryover;
7322   int i;
7323
7324   USE_SAFE_ALLOCA;
7325
       /* Source text in the buffer proper (positive src_pos) that
          starts before the gap and ends after it: move the gap to the
          start of the source.  */
7326   if (BUFFERP (coding->src_object)
7327       && coding->src_pos > 0
7328       && coding->src_pos < GPT
7329       && coding->src_pos + coding->src_chars > GPT)
7330     move_gap_both (coding->src_pos, coding->src_pos_byte);
7331
       /* Placeholder value; UNDO_LIST is only read back in the
          BUFFERP branch at the end of this function.  */
7332   undo_list = Qt;
7333   if (BUFFERP (coding->dst_object))
7334     {
7335       set_buffer_internal (XBUFFER (coding->dst_object));
7336       if (GPT != PT)
7337         move_gap_both (PT, PT_BYTE);
7338
7339       /* We must disable undo_list in order to record the whole insert
7340          transaction via record_insert at the end.  But doing so also
7341          disables the recording of the first change to the undo_list.
7342          Therefore we check for first change here and record it via
7343          record_first_change if needed.  */
7344       if (MODIFF <= SAVE_MODIFF)
7345         record_first_change ();
7346
7347       undo_list = BVAR (current_buffer, undo_list);
7348       bset_undo_list (current_buffer, Qt);
7349     }
7350
       /* Reset the progress counters and the result code.  */
7351   coding->consumed = coding->consumed_char = 0;
7352   coding->produced = coding->produced_char = 0;
7353   coding->chars_at_source = 0;
7354   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7355
7356   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7357
7358   attrs = CODING_ID_ATTRS (coding->id);
7359   translation_table = get_translation_table (attrs, 0, NULL);
7360
7361   carryover = 0;
       /* The CCL decoder gets its spec from the local CCLSPEC, which
          therefore lives only for the duration of this call.  */
7362   if (coding->decoder == decode_coding_ccl)
7363     {
7364       coding->spec.ccl = &cclspec;
7365       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7366     }
7367   do
7368     {
           /* Destination position of the first character produced in
              this round; used for the annotations.  */
7369       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7370
7371       coding_set_source (coding);
7372       coding->annotated = 0;
           /* The first CARRYOVER elements of charbuf were put there at
              the end of the previous round.  */
7373       coding->charbuf_used = carryover;
7374       (*(coding->decoder)) (coding);
7375       coding_set_destination (coding);
7376       carryover = produce_chars (coding, translation_table, 0);
7377       if (coding->annotated)
7378         produce_annotation (coding, pos);
           /* Move the last CARRYOVER elements to the front of charbuf.  */
7379       for (i = 0; i < carryover; i++)
7380         coding->charbuf[i]
7381           = coding->charbuf[coding->charbuf_used - carryover + i];
7382     }
       /* Go round again if the destination was too small, or if source
          bytes remain and the last round ended with success or with an
          invalid-source result.  */
7383   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7384          || (coding->consumed < coding->src_bytes
7385              && (coding->result == CODING_RESULT_SUCCESS
7386                  || coding->result == CODING_RESULT_INVALID_SRC)));
7387
       /* Flush what is still carried over.  The third argument is 1
          here and 0 inside the loop -- presumably the "last block"
          flag of produce_chars; confirm against its definition.  */
7388   if (carryover > 0)
7389     {
7390       coding_set_destination (coding);
7391       coding->charbuf_used = carryover;
7392       produce_chars (coding, translation_table, 1);
7393     }
7394
7395   coding->carryover_bytes = 0;
7396   if (coding->consumed < coding->src_bytes)
7397     {
7398       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7399       const unsigned char *src;
7400
7401       coding_set_source (coding);
7402       coding_set_destination (coding);
7403       src = coding->source + coding->consumed;
7404
7405       if (coding->mode & CODING_MODE_LAST_BLOCK)
7406         {
7407           /* Flush out unprocessed data as binary chars.  We are sure
7408              that the number of data is less than the size of
7409              coding->charbuf.  */
7410           coding->charbuf_used = 0;
7411           coding->chars_at_source = 0;
7412
7413           while (nbytes-- > 0)
7414             {
7415               int c = *src++;
7416
                   /* Bytes with the high bit set are mapped through
                      BYTE8_TO_CHAR; the others are stored unchanged.  */
7417               if (c & 0x80)
7418                 c = BYTE8_TO_CHAR (c);
7419               coding->charbuf[coding->charbuf_used++] = c;
7420             }
               /* No translation table is applied to these bytes.  */
7421           produce_chars (coding, Qnil, 1);
7422         }
7423       else
7424         {
7425           /* Record unprocessed bytes in coding->carryover.  We are
7426              sure that the number of data is less than the size of
7427              coding->carryover.  */
7428           unsigned char *p = coding->carryover;
7429
               /* Clamp anyway, so the copy below cannot overrun
                  coding->carryover.  */
7430           if (nbytes > sizeof coding->carryover)
7431             nbytes = sizeof coding->carryover;
7432           coding->carryover_bytes = nbytes;
7433           while (nbytes-- > 0)
7434             *p++ = *src++;
7435         }
           /* Either way, the whole source now counts as consumed.  */
7436       coding->consumed = coding->src_bytes;
7437     }
7438
7439   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7440       && !inhibit_eol_conversion)
7441     decode_eol (coding);
7442   if (BUFFERP (coding->dst_object))
7443     {
           /* Put the saved undo list back and record everything
              produced as a single insertion.  */
7444       bset_undo_list (current_buffer, undo_list);
7445       record_insert (coding->dst_pos, coding->produced_char);
7446     }
7447
7448   SAFE_FREE ();
7449 }
7450
7451
7452 /* Extract an annotation datum from a composition starting at POS and
7453    ending before LIMIT of CODING->src_object (buffer or string), store
7454    the data in BUF, set *STOP to a starting position of the next
7455    composition (if any) or to LIMIT, and return the address of the
7456    next element of BUF.
7457
7458    If such an annotation is not found, set *STOP to a starting
7459    position of a composition after POS (if any) or to LIMIT, and
7460    return BUF.  */
7461
7462 static int *
7463 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7464                                struct coding_system *coding, int *buf,
7465                                ptrdiff_t *stop)
7466 {
7467   ptrdiff_t start, end;
7468   Lisp_Object prop;
7469
7470   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7471       || end > limit)
7472     *stop = limit;
7473   else if (start > pos)
7474     *stop = start;
7475   else
7476     {
7477       if (start == pos)
7478         {
7479           /* We found a composition.  Store the corresponding
7480              annotation data in BUF.  */
7481           int *head = buf;
7482           enum composition_method method = composition_method (prop);
7483           int nchars = COMPOSITION_LENGTH (prop);
7484
7485           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7486           if (method != COMPOSITION_RELATIVE)
7487             {
7488               Lisp_Object components;
7489               ptrdiff_t i, len, i_byte;
7490
7491               components = COMPOSITION_COMPONENTS (prop);
7492               if (VECTORP (components))
7493                 {
7494                   len = ASIZE (components);
7495                   for (i = 0; i < len; i++)
7496                     *buf++ = XINT (AREF (components, i));
7497                 }
7498               else if (STRINGP (components))
7499                 {
7500                   len = SCHARS (components);
7501                   i = i_byte = 0;
7502                   while (i < len)
7503                     {
7504                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7505                       buf++;
7506                     }
7507                 }
7508               else if (INTEGERP (components))
7509                 {
7510                   len = 1;
7511                   *buf++ = XINT (components);
7512                 }
7513               else if (CONSP (components))
7514                 {
7515                   for (len = 0; CONSP (components);
7516                        len++, components = XCDR (components))
7517                     *buf++ = XINT (XCAR (components));
7518                 }
7519               else
7520                 emacs_abort ();
7521               *head -= len;
7522             }
7523         }
7524
7525       if (find_composition (end, limit, &start, &end, &prop,
7526                             coding->src_object)
7527           && end <= limit)
7528         *stop = start;
7529       else
7530         *stop = limit;
7531     }
7532   return buf;
7533 }
7534
7535
7536 /* Extract an annotation datum from a text property `charset' at POS of
7537    CODING->src_object (buffer of string), store the data in BUF, set
7538    *STOP to the position where the value of `charset' property changes
7539    (limiting by LIMIT), and return the address of the next element of
7540    BUF.
7541
7542    If the property value is nil, set *STOP to the position where the
7543    property value is non-nil (limiting by LIMIT), and return BUF.  */
7544
7545 static int *
7546 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7547                            struct coding_system *coding, int *buf,
7548                            ptrdiff_t *stop)
7549 {
7550   Lisp_Object val, next;
7551   int id;
7552
7553   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7554   if (! NILP (val) && CHARSETP (val))
7555     id = XINT (CHARSET_SYMBOL_ID (val));
7556   else
7557     id = -1;
7558   ADD_CHARSET_DATA (buf, 0, id);
7559   next = Fnext_single_property_change (make_number (pos), Qcharset,
7560                                        coding->src_object,
7561                                        make_number (limit));
7562   *stop = XINT (next);
7563   return buf;
7564 }
7565
7566
/* Read characters from the source of CODING, starting CODING->consumed
   bytes into CODING->source, and store them as ints in
   CODING->charbuf, preceded where requested by annotation data.

   Each character goes through EOL conversion (according to the eol
   type of the coding system, unless inhibit_eol_conversion is set)
   and, if TRANSLATION_TABLE is non-nil, through that table.
   MAX_LOOKUP is the number of ints allocated for the buffer that
   holds the characters examined for one translation.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated, and CODING->chars_at_source is
   zero.  */

7567 static void
7568 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7569                int max_lookup)
7570 {
7571   int *buf = coding->charbuf;
7572   int *buf_end = coding->charbuf + coding->charbuf_size;
7573   const unsigned char *src = coding->source + coding->consumed;
7574   const unsigned char *src_end = coding->source + coding->src_bytes;
7575   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7576   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7577   bool multibytep = coding->src_multibyte;
7578   Lisp_Object eol_type;
7579   int c;
7580   ptrdiff_t stop, stop_composition, stop_charset;
7581   int *lookup_buf = NULL;
7582
       /* The lookup buffer is only needed (and only allocated) when
          there is a translation table.  */
7583   if (! NILP (translation_table))
7584     lookup_buf = alloca (sizeof (int) * max_lookup);
7585
7586   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector eol type (presumably: not decided yet -- confirm) is
          handled like Qunix, i.e. no EOL conversion.  */
7587   if (VECTORP (eol_type))
7588     eol_type = Qunix;
7589
7590   /* Note: composition handling is not yet implemented.  */
7591   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7592
       /* STOP is the next position at which annotations have to be
          looked at.  Because the composition flag was cleared just
          above, the composition test below is always false here.  */
7593   if (NILP (coding->src_object))
7594     stop = stop_composition = stop_charset = end_pos;
7595   else
7596     {
7597       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7598         stop = stop_composition = pos;
7599       else
7600         stop = stop_composition = end_pos;
7601       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7602         stop = stop_charset = pos;
7603       else
7604         stop_charset = end_pos;
7605     }
7606
7607   /* Compensate for CRLF and conversion.  */
7608   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7609   while (buf < buf_end)
7610     {
7611       Lisp_Object trans;
7612
7613       if (pos == stop)
7614         {
               /* End of the source: done.  Otherwise gather the
                  annotations that start here and work out the next
                  stop position.  */
7615           if (pos == end_pos)
7616             break;
7617           if (pos == stop_composition)
7618             buf = handle_composition_annotation (pos, end_pos, coding,
7619                                                  buf, &stop_composition);
7620           if (pos == stop_charset)
7621             buf = handle_charset_annotation (pos, end_pos, coding,
7622                                              buf, &stop_charset);
7623           stop = (stop_composition < stop_charset
7624                   ? stop_composition : stop_charset);
7625         }
7626
           /* Fetch the next character C and advance SRC and POS.  */
7627       if (! multibytep)
7628         {
7629           int bytes;
7630
               /* For the raw-text and CCL encoders each byte is taken
                  as is.  Otherwise a multibyte sequence is read if
                  MULTIBYTE_LENGTH finds one, and a lone byte is mapped
                  with BYTE8_TO_CHAR.  */
7631           if (coding->encoder == encode_coding_raw_text
7632               || coding->encoder == encode_coding_ccl)
7633             c = *src++, pos++;
7634           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7635             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7636           else
7637             c = BYTE8_TO_CHAR (*src), src++, pos++;
7638         }
7639       else
7640         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
           /* With CODING_MODE_SELECTIVE_DISPLAY a CR is treated as a
              newline.  */
7641       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7642         c = '\n';
7643       if (! EQ (eol_type, Qunix))
7644         {
7645           if (c == '\n')
7646             {
                   /* Qdos: emit a CR in front of the LF.  Any other
                      non-unix eol type: turn the LF into a CR.  */
7647               if (EQ (eol_type, Qdos))
7648                 *buf++ = '\r';
7649               else
7650                 c = '\r';
7651             }
7652         }
7653
7654       trans = Qnil;
7655       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7656       if (NILP (trans))
7657         *buf++ = c;
7658       else
7659         {
7660           ptrdiff_t from_nchars = 1, to_nchars = 1;
7661           int *lookup_buf_end;
7662           const unsigned char *p = src;
7663           int i;
7664
               /* Collect C plus up to MAX_LOOKUP - 1 following
                  characters; they are read through P, so SRC itself is
                  not advanced by this look-ahead.  */
7665           lookup_buf[0] = c;
7666           for (i = 1; i < max_lookup && p < src_end; i++)
7667             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7668           lookup_buf_end = lookup_buf + i;
7669           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* An integer result replaces C.  A cons result is
                  (FROM . TO): the length of FROM is the number of
                  source characters covered, and TO is an integer or a
                  vector of replacement characters.
                  NOTE(review): on both `break' paths below SRC and POS
                  have already been advanced past C -- confirm that the
                  character is not meant to be read again.  */
7670           if (INTEGERP (trans))
7671             c = XINT (trans);
7672           else if (CONSP (trans))
7673             {
7674               from_nchars = ASIZE (XCAR (trans));
7675               trans = XCDR (trans);
7676               if (INTEGERP (trans))
7677                 c = XINT (trans);
7678               else
7679                 {
7680                   to_nchars = ASIZE (trans);
7681                   if (buf_end - buf < to_nchars)
7682                     break;
7683                   c = XINT (AREF (trans, 0));
7684                 }
7685             }
7686           else
7687             break;
7688           *buf++ = c;
               /* Store the remaining replacement characters, then skip
                  the remaining source characters that were covered.  */
7689           for (i = 1; i < to_nchars; i++)
7690             *buf++ = XINT (AREF (trans, i));
7691           for (i = 1; i < from_nchars; i++, pos++)
7692             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7693         }
7694     }
7695
       /* Publish how far we got.  */
7696   coding->consumed = src - coding->source;
7697   coding->consumed_char = pos - coding->src_pos;
7698   coding->charbuf_used = buf - coding->charbuf;
7699   coding->chars_at_source = 0;
7700 }
7701
7702
7703 /* Encode the text at CODING->src_object into CODING->dst_object.
7704    CODING->src_object is a buffer or a string.
7705    CODING->dst_object is a buffer or nil.
7706
7707    If CODING->src_object is a buffer, it must be the current buffer.
7708    In this case, if CODING->src_pos is positive, it is a position of
7709    the source text in the buffer, otherwise. the source text is in the
7710    gap area of the buffer, and coding->src_pos specifies the offset of
7711    the text from GPT (which must be the same as PT).  If this is the
7712    same buffer as CODING->dst_object, CODING->src_pos must be
7713    negative and CODING should not have `pre-write-conversion'.
7714
7715    If CODING->src_object is a string, CODING should not have
7716    `pre-write-conversion'.
7717
7718    If CODING->dst_object is a buffer, the encoded data is inserted at
7719    the current point of that buffer.
7720
7721    If CODING->dst_object is nil, the encoded data is placed at the
7722    memory area specified by CODING->destination.  */
7723
7724 static void
7725 encode_coding (struct coding_system *coding)
7726 {
7727   Lisp_Object attrs;
7728   Lisp_Object translation_table;
7729   int max_lookup;
7730   struct ccl_spec cclspec;
7731
7732   USE_SAFE_ALLOCA;
7733
7734   attrs = CODING_ID_ATTRS (coding->id);
7735   if (coding->encoder == encode_coding_raw_text)
7736     translation_table = Qnil, max_lookup = 0;
7737   else
7738     translation_table = get_translation_table (attrs, 1, &max_lookup);
7739
7740   if (BUFFERP (coding->dst_object))
7741     {
7742       set_buffer_internal (XBUFFER (coding->dst_object));
7743       coding->dst_multibyte
7744         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7745     }
7746
7747   coding->consumed = coding->consumed_char = 0;
7748   coding->produced = coding->produced_char = 0;
7749   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7750
7751   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7752
7753   if (coding->encoder == encode_coding_ccl)
7754     {
7755       coding->spec.ccl = &cclspec;
7756       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7757     }
7758   do {
7759     coding_set_source (coding);
7760     consume_chars (coding, translation_table, max_lookup);
7761     coding_set_destination (coding);
7762     (*(coding->encoder)) (coding);
7763   } while (coding->consumed_char < coding->src_chars);
7764
7765   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7766     insert_from_gap (coding->produced_char, coding->produced, 0);
7767
7768   SAFE_FREE ();
7769 }
7770
7771
7772 /* Name (or base name) of work buffer for code conversion.  */
7773 static Lisp_Object Vcode_conversion_workbuf_name;
7774
7775 /* A working buffer used by the top level conversion.  Once it is
7776    created, it is never destroyed.  It has the name
7777    Vcode_conversion_workbuf_name.  The other working buffers are
7778    destroyed after the use is finished, and their names are modified
7779    versions of Vcode_conversion_workbuf_name.
        make_conversion_work_buffer creates this buffer anew whenever
        it finds that it is not live.  */
7780 static Lisp_Object Vcode_conversion_reused_workbuf;
7781
7782 /* True iff Vcode_conversion_reused_workbuf is already in use.
        Set by make_conversion_work_buffer when it hands that buffer
        out, and cleared by code_conversion_restore.  */
7783 static bool reused_workbuf_in_use;
7784
7785
7786 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7787    multibyteness of returning buffer.  */
7788
7789 static Lisp_Object
7790 make_conversion_work_buffer (bool multibyte)
7791 {
7792   Lisp_Object name, workbuf;
7793   struct buffer *current;
7794
7795   if (reused_workbuf_in_use)
7796     {
7797       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7798       workbuf = Fget_buffer_create (name);
7799     }
7800   else
7801     {
7802       reused_workbuf_in_use = 1;
7803       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7804         Vcode_conversion_reused_workbuf
7805           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7806       workbuf = Vcode_conversion_reused_workbuf;
7807     }
7808   current = current_buffer;
7809   set_buffer_internal (XBUFFER (workbuf));
7810   /* We can't allow modification hooks to run in the work buffer.  For
7811      instance, directory_files_internal assumes that file decoding
7812      doesn't compile new regexps.  */
7813   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7814   Ferase_buffer ();
7815   bset_undo_list (current_buffer, Qt);
7816   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7817   set_buffer_internal (current);
7818   return workbuf;
7819 }
7820
7821
7822 static void
7823 code_conversion_restore (Lisp_Object arg)
7824 {
7825   Lisp_Object current, workbuf;
7826   struct gcpro gcpro1;
7827
7828   GCPRO1 (arg);
7829   current = XCAR (arg);
7830   workbuf = XCDR (arg);
7831   if (! NILP (workbuf))
7832     {
7833       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7834         reused_workbuf_in_use = 0;
7835       else
7836         Fkill_buffer (workbuf);
7837     }
7838   set_buffer_internal (XBUFFER (current));
7839   UNGCPRO;
7840 }
7841
7842 Lisp_Object
7843 code_conversion_save (bool with_work_buf, bool multibyte)
7844 {
7845   Lisp_Object workbuf = Qnil;
7846
7847   if (with_work_buf)
7848     workbuf = make_conversion_work_buffer (multibyte);
7849   record_unwind_protect (code_conversion_restore,
7850                          Fcons (Fcurrent_buffer (), workbuf));
7851   return workbuf;
7852 }
7853
/* Decode, with CODING, CHARS characters (BYTES bytes) of source text
   of the current buffer, which is made both CODING->src_object and
   CODING->dst_object.  CODING->src_pos and CODING->src_pos_byte are
   set to -CHARS and -BYTES; the fast path below treats the source as
   the bytes that end at GAP_END_ADDR.

   Fast path: if ASCII optimization is not disabled, the source is
   unibyte (CHARS == BYTES), the coding system is ASCII compatible and
   has neither a post-read function nor a decoding translation table,
   and the text turns out to be all ASCII (or, for a utf-8 coding
   system, is detected as UTF-8 all the way through), then only EOL
   conversion is done, in place, and the text is inserted with
   insert_from_gap.

   Otherwise the text is decoded by decode_coding with
   CODING_MODE_LAST_BLOCK set, and the post-read function of the
   coding system, if any, is called afterwards.

   In both cases CODING->produced and CODING->produced_char are set.  */

7854 void
7855 decode_coding_gap (struct coding_system *coding,
7856                    ptrdiff_t chars, ptrdiff_t bytes)
7857 {
7858   ptrdiff_t count = SPECPDL_INDEX ();
7859   Lisp_Object attrs;
7860
7861   coding->src_object = Fcurrent_buffer ();
7862   coding->src_chars = chars;
7863   coding->src_bytes = bytes;
7864   coding->src_pos = -chars;
7865   coding->src_pos_byte = -bytes;
7866   coding->src_multibyte = chars < bytes;
7867   coding->dst_object = coding->src_object;
7868   coding->dst_pos = PT;
7869   coding->dst_pos_byte = PT_BYTE;
7870   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7871
       /* -1 and EOL_SEEN_NONE are the "nothing known yet" values that
          the tests further down check for (head_ascii < 0,
          detected_utf8_chars >= 0, eol_seen != EOL_SEEN_NONE).  */
7872   coding->head_ascii = -1;
7873   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7874   coding->eol_seen = EOL_SEEN_NONE;
7875   if (CODING_REQUIRE_DETECTION (coding))
7876     detect_coding (coding);
7877   attrs = CODING_ID_ATTRS (coding->id);
7878   if (! disable_ascii_optimization
7879       && ! coding->src_multibyte
7880       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7881       && NILP (CODING_ATTR_POST_READ (attrs))
7882       && NILP (get_translation_table (attrs, 0, NULL)))
7883     {
           /* From here on CHARS is the character count of the text if
              it can be inserted without real decoding, or negative if
              it cannot.  */
7884       chars = coding->head_ascii;
7885       if (chars < 0)
7886         chars = check_ascii (coding);
7887       if (chars != bytes)
7888         {
7889           /* There exists a non-ASCII byte.  */
7890           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7891               && coding->detected_utf8_bytes == coding->src_bytes)
7892             {
7893               if (coding->detected_utf8_chars >= 0)
7894                 chars = coding->detected_utf8_chars;
7895               else
7896                 chars = check_utf_8 (coding);
                   /* Unless the coding system is the variant without
                      BOM, a BOM at the very start of the source is
                      left out of the character and byte counts.  */
7897               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7898                   && coding->head_ascii == 0
7899                   && coding->source[0] == UTF_8_BOM_1
7900                   && coding->source[1] == UTF_8_BOM_2
7901                   && coding->source[2] == UTF_8_BOM_3)
7902                 {
7903                   chars--;
7904                   bytes -= 3;
7905                   coding->src_bytes -= 3;
7906                 }
7907             }
7908           else
7909             chars = -1;
7910         }
7911       if (chars >= 0)
7912         {
7913           Lisp_Object eol_type;
7914
7915           eol_type = CODING_ID_EOL_TYPE (coding->id);
               /* A vector eol type is replaced by a concrete one only
                  if some EOL format has been seen.  */
7916           if (VECTORP (eol_type))
7917             {
7918               if (coding->eol_seen != EOL_SEEN_NONE)
7919                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7920             }
7921           if (EQ (eol_type, Qmac))
7922             {
7923               unsigned char *src_end = GAP_END_ADDR;
7924               unsigned char *src = src_end - coding->src_bytes;
7925
                   /* Replace every CR by LF, in place.  */
7926               while (src < src_end)
7927                 {
7928                   if (*src++ == '\r')
7929                     src[-1] = '\n';
7930                 }
7931             }
7932           else if (EQ (eol_type, Qdos))
7933             {
7934               unsigned char *src = GAP_END_ADDR;
7935               unsigned char *src_beg = src - coding->src_bytes;
7936               unsigned char *dst = src;
7937               ptrdiff_t diff;
7938
                   /* Copy backwards from the end of the source; when
                      the byte just copied is an LF preceded by a CR,
                      step SRC over the CR so that it is dropped.  */
7939               while (src_beg < src)
7940                 {
7941                   *--dst = *--src;
7942                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7943                     src--;
7944                 }
                   /* DIFF is the number of CRs dropped.  */
7945               diff = dst - src;
7946               bytes -= diff;
7947               chars -= diff;
7948             }
7949           coding->produced = bytes;
7950           coding->produced_char = chars;
               /* The third argument is 1 here, while the other calls
                  of insert_from_gap in this part of the file pass 0 --
                  presumably it says that the text lies at the end of
                  the gap; confirm against insert_from_gap.  */
7951           insert_from_gap (chars, bytes, 1);
7952           return;
7953         }
7954     }
       /* Slow path: full decoding.  No work buffer is requested; the
          call only registers the unwind handler that restores the
          current buffer.  */
7955   code_conversion_save (0, 0);
7956
7957   coding->mode |= CODING_MODE_LAST_BLOCK;
7958   current_buffer->text->inhibit_shrinking = 1;
7959   decode_coding (coding);
7960   current_buffer->text->inhibit_shrinking = 0;
7961
7962   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7963     {
7964       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7965       Lisp_Object val;
7966
           /* Call the post-read function with point at the start of
              the decoded text and the number of decoded characters as
              its argument, then correct the produced counts by the
              amount the buffer size changed.  */
7967       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7968       val = call1 (CODING_ATTR_POST_READ (attrs),
7969                    make_number (coding->produced_char));
7970       CHECK_NATNUM (val);
7971       coding->produced_char += Z - prev_Z;
7972       coding->produced += Z_BYTE - prev_Z_BYTE;
7973     }
7974
7975   unbind_to (count, Qnil);
7976 }
7977
7978
7979 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7980    SRC_OBJECT into DST_OBJECT by coding context CODING.
7981
7982    SRC_OBJECT is a buffer, a string, or Qnil.
7983
7984    If it is a buffer, the text is at point of the buffer.  FROM and TO
7985    are positions in the buffer.
7986
7987    If it is a string, the text is at the beginning of the string.
7988    FROM and TO are indices to the string.
7989
7990    If it is nil, the text is at coding->source.  FROM and TO are
7991    indices to coding->source.
7992
7993    DST_OBJECT is a buffer, Qt, or Qnil.
7994
7995    If it is a buffer, the decoded text is inserted at point of the
7996    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7997    is deleted.
7998
7999    If it is Qt, a string is made from the decoded text, and
8000    set in CODING->dst_object.
8001
8002    If it is Qnil, the decoded text is stored at CODING->destination.
8003    The caller must allocate CODING->dst_bytes bytes at
8004    CODING->destination by xmalloc.  If the decoded text is longer than
8005    CODING->dst_bytes, CODING->destination is relocated by xrealloc.

        After the conversion, if the coding system has a post-read
        function, it is called with the number of decoded characters,
        and CODING->produced / CODING->produced_char are adjusted by
        the change in buffer size that the call causes.

        Vdeactivate_mark is restored to its value on entry before
        returning.
8006  */
8007
8008 void
8009 decode_coding_object (struct coding_system *coding,
8010                       Lisp_Object src_object,
8011                       ptrdiff_t from, ptrdiff_t from_byte,
8012                       ptrdiff_t to, ptrdiff_t to_byte,
8013                       Lisp_Object dst_object)
8014 {
8015   ptrdiff_t count = SPECPDL_INDEX ();
8016   unsigned char *destination IF_LINT (= NULL);
8017   ptrdiff_t dst_bytes IF_LINT (= 0);
8018   ptrdiff_t chars = to - from;
8019   ptrdiff_t bytes = to_byte - from_byte;
8020   Lisp_Object attrs;
       /* saved_pt stays negative unless we decode a buffer into itself;
          the restoration code near the end tests it for that.  */
8021   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8022   bool need_marker_adjustment = 0;
8023   Lisp_Object old_deactivate_mark;
8024
       /* Remembered here, restored just before returning.  */
8025   old_deactivate_mark = Vdeactivate_mark;
8026
       /* Only a nil DST_OBJECT uses the caller-supplied output area;
          these two locals are not read in the other cases.  */
8027   if (NILP (dst_object))
8028     {
8029       destination = coding->destination;
8030       dst_bytes = coding->dst_bytes;
8031     }
8032
8033   coding->src_object = src_object;
8034   coding->src_chars = chars;
8035   coding->src_bytes = bytes;
       /* The source counts as multibyte when it has fewer characters
          than bytes.  */
8036   coding->src_multibyte = chars < bytes;
8037
8038   if (STRINGP (src_object))
8039     {
8040       coding->src_pos = from;
8041       coding->src_pos_byte = from_byte;
8042     }
8043   else if (BUFFERP (src_object))
8044     {
           /* Make the source buffer current and put the gap at FROM.  */
8045       set_buffer_internal (XBUFFER (src_object));
8046       if (from != GPT)
8047         move_gap_both (from, from_byte);
8048       if (EQ (src_object, dst_object))
8049         {
8050           struct Lisp_Marker *tail;
8051
               /* Decoding a buffer in place.  Flag the markers that sit
                  exactly on a boundary of the range (FROM for
                  insertion-type markers, TO for the others) so that they
                  can be repositioned once the new text is in place.  */
8052           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8053             {
8054               tail->need_adjustment
8055                 = tail->charpos == (tail->insertion_type ? from : to);
8056               need_marker_adjustment |= tail->need_adjustment;
8057             }
               /* Save point, then delete the source range.
                  inhibit_shrinking is set before the deletion and is
                  cleared again in the `saved_pt >= 0' block below.
                  NOTE(review): the negative src_pos / src_pos_byte
                  presumably tell the decoder that the source is the text
                  just deleted (still lying in the gap) -- confirm against
                  the code that consumes coding->src_pos.  */
8058           saved_pt = PT, saved_pt_byte = PT_BYTE;
8059           TEMP_SET_PT_BOTH (from, from_byte);
8060           current_buffer->text->inhibit_shrinking = 1;
8061           del_range_both (from, from_byte, to, to_byte, 1);
8062           coding->src_pos = -chars;
8063           coding->src_pos_byte = -bytes;
8064         }
8065       else
8066         {
8067           coding->src_pos = from;
8068           coding->src_pos_byte = from_byte;
8069         }
8070     }
8071
       /* Run detection first when the coding system asks for it; ATTRS
          is fetched from coding->id only afterwards.  */
8072   if (CODING_REQUIRE_DETECTION (coding))
8073     detect_coding (coding);
8074   attrs = CODING_ID_ATTRS (coding->id);
8075
       /* Choose where the decoded text goes.  A work object obtained from
          code_conversion_save is used when a string result was requested,
          and also when the caller wants raw bytes but a post-read function
          exists.  */
8076   if (EQ (dst_object, Qt)
8077       || (! NILP (CODING_ATTR_POST_READ (attrs))
8078           && NILP (dst_object)))
8079     {
8080       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8081       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8082       coding->dst_pos = BEG;
8083       coding->dst_pos_byte = BEG_BYTE;
8084     }
8085   else if (BUFFERP (dst_object))
8086     {
           /* Decode at point of the destination buffer, following that
              buffer's own multibyte setting.  */
8087       code_conversion_save (0, 0);
8088       coding->dst_object = dst_object;
8089       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8090       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8091       coding->dst_multibyte
8092         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8093     }
8094   else
8095     {
8096       code_conversion_save (0, 0);
8097       coding->dst_object = Qnil;
8098       /* Most callers presume this will return a multibyte result, and they
8099          won't use `binary' or `raw-text' anyway, so let's not worry about
8100          CODING_FOR_UNIBYTE.  */
8101       coding->dst_multibyte = 1;
8102     }
8103
8104   decode_coding (coding);
8105
       /* From here on work in the buffer that received the text, if any.  */
8106   if (BUFFERP (coding->dst_object))
8107     set_buffer_internal (XBUFFER (coding->dst_object));
8108
8109   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8110     {
8111       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8112       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8113       Lisp_Object val;
8114
           /* Call the post-read function with point at the start of the
              decoded text and the decoded character count as argument.
              Its return value is only checked to be a natural number;
              the produced counts are corrected from the change in Z and
              Z_BYTE across the call.  */
8115       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8116       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8117               old_deactivate_mark);
8118       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8119                         make_number (coding->produced_char));
8120       UNGCPRO;
8121       CHECK_NATNUM (val);
8122       coding->produced_char += Z - prev_Z;
8123       coding->produced += Z_BYTE - prev_Z_BYTE;
8124     }
8125
8126   if (EQ (dst_object, Qt))
8127     {
           /* The result string is the contents of the current buffer.  */
8128       coding->dst_object = Fbuffer_string ();
8129     }
8130   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8131     {
           /* Raw-byte result that was decoded into a work buffer: copy it
              out to the caller's destination area.
              NOTE(review): the copy is made only when the caller's area is
              too small (dst_bytes < coding->produced); when it is large
              enough nothing is copied here -- confirm whether that is
              intended.  */
8132       set_buffer_internal (XBUFFER (coding->dst_object));
8133       if (dst_bytes < coding->produced)
8134         {
8135           eassert (coding->produced > 0);
8136           destination = xrealloc (destination, coding->produced);
               /* Move the gap out of the way if it lies inside the text
                  to be copied.  */
8137           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8138             move_gap_both (BEGV, BEGV_BYTE);
8139           memcpy (destination, BEGV_ADDR, coding->produced);
8140           coding->destination = destination;
8141         }
8142     }
8143
8144   if (saved_pt >= 0)
8145     {
8146       /* This is the case of:
8147          (BUFFERP (src_object) && EQ (src_object, dst_object))
8148          As we have moved PT while replacing the original buffer
8149          contents, we must recover it now.  */
8150       set_buffer_internal (XBUFFER (src_object));
8151       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the range is unchanged; point inside the
              replaced range goes to FROM; point after the range is shifted
              by the size change (the byte delta is used for both
              coordinates in a unibyte buffer).  */
8152       if (saved_pt < from)
8153         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8154       else if (saved_pt < from + chars)
8155         TEMP_SET_PT_BOTH (from, from_byte);
8156       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8157         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8158                           saved_pt_byte + (coding->produced - bytes));
8159       else
8160         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8161                           saved_pt_byte + (coding->produced - bytes));
8162
8163       if (need_marker_adjustment)
8164         {
8165           struct Lisp_Marker *tail;
8166
               /* Reposition the markers flagged before the deletion:
                  insertion-type markers go to FROM, the others to the end
                  of the decoded text.  */
8167           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8168             if (tail->need_adjustment)
8169               {
8170                 tail->need_adjustment = 0;
8171                 if (tail->insertion_type)
8172                   {
8173                     tail->bytepos = from_byte;
8174                     tail->charpos = from;
8175                   }
8176                 else
8177                   {
8178                     tail->bytepos = from_byte + coding->produced;
8179                     tail->charpos
8180                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8181                          ? tail->bytepos : from + coding->produced_char);
8182                   }
8183               }
8184         }
8185     }
8186
8187   Vdeactivate_mark = old_deactivate_mark;
       /* Unwind everything bound since COUNT was taken at entry.  */
8188   unbind_to (count, coding->dst_object);
8189 }
8190
8191
8192 void
8193 encode_coding_object (struct coding_system *coding,
8194                       Lisp_Object src_object,
8195                       ptrdiff_t from, ptrdiff_t from_byte,
8196                       ptrdiff_t to, ptrdiff_t to_byte,
8197                       Lisp_Object dst_object)
8198 {
8199   ptrdiff_t count = SPECPDL_INDEX ();
8200   ptrdiff_t chars = to - from;
8201   ptrdiff_t bytes = to_byte - from_byte;
8202   Lisp_Object attrs;
8203   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8204   bool need_marker_adjustment = 0;
8205   bool kill_src_buffer = 0;
8206   Lisp_Object old_deactivate_mark;
8207
8208   old_deactivate_mark = Vdeactivate_mark;
8209
8210   coding->src_object = src_object;
8211   coding->src_chars = chars;
8212   coding->src_bytes = bytes;
8213   coding->src_multibyte = chars < bytes;
8214
8215   attrs = CODING_ID_ATTRS (coding->id);
8216
8217   if (EQ (src_object, dst_object))
8218     {
8219       struct Lisp_Marker *tail;
8220
8221       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8222         {
8223           tail->need_adjustment
8224             = tail->charpos == (tail->insertion_type ? from : to);
8225           need_marker_adjustment |= tail->need_adjustment;
8226         }
8227     }
8228
8229   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8230     {
8231       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8232       set_buffer_internal (XBUFFER (coding->src_object));
8233       if (STRINGP (src_object))
8234         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8235       else if (BUFFERP (src_object))
8236         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8237       else
8238         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8239
8240       if (EQ (src_object, dst_object))
8241         {
8242           set_buffer_internal (XBUFFER (src_object));
8243           saved_pt = PT, saved_pt_byte = PT_BYTE;
8244           del_range_both (from, from_byte, to, to_byte, 1);
8245           set_buffer_internal (XBUFFER (coding->src_object));
8246         }
8247
8248       {
8249         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8250
8251         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8252                 old_deactivate_mark);
8253         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8254                     make_number (BEG), make_number (Z));
8255         UNGCPRO;
8256       }
8257       if (XBUFFER (coding->src_object) != current_buffer)
8258         kill_src_buffer = 1;
8259       coding->src_object = Fcurrent_buffer ();
8260       if (BEG != GPT)
8261         move_gap_both (BEG, BEG_BYTE);
8262       coding->src_chars = Z - BEG;
8263       coding->src_bytes = Z_BYTE - BEG_BYTE;
8264       coding->src_pos = BEG;
8265       coding->src_pos_byte = BEG_BYTE;
8266       coding->src_multibyte = Z < Z_BYTE;
8267     }
8268   else if (STRINGP (src_object))
8269     {
8270       code_conversion_save (0, 0);
8271       coding->src_pos = from;
8272       coding->src_pos_byte = from_byte;
8273     }
8274   else if (BUFFERP (src_object))
8275     {
8276       code_conversion_save (0, 0);
8277       set_buffer_internal (XBUFFER (src_object));
8278       if (EQ (src_object, dst_object))
8279         {
8280           saved_pt = PT, saved_pt_byte = PT_BYTE;
8281           coding->src_object = del_range_1 (from, to, 1, 1);
8282           coding->src_pos = 0;
8283           coding->src_pos_byte = 0;
8284         }
8285       else
8286         {
8287           if (from < GPT && to >= GPT)
8288             move_gap_both (from, from_byte);
8289           coding->src_pos = from;
8290           coding->src_pos_byte = from_byte;
8291         }
8292     }
8293   else
8294     code_conversion_save (0, 0);
8295
8296   if (BUFFERP (dst_object))
8297     {
8298       coding->dst_object = dst_object;
8299       if (EQ (src_object, dst_object))
8300         {
8301           coding->dst_pos = from;
8302           coding->dst_pos_byte = from_byte;
8303         }
8304       else
8305         {
8306           struct buffer *current = current_buffer;
8307
8308           set_buffer_temp (XBUFFER (dst_object));
8309           coding->dst_pos = PT;
8310           coding->dst_pos_byte = PT_BYTE;
8311           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8312           set_buffer_temp (current);
8313         }
8314       coding->dst_multibyte
8315         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8316     }
8317   else if (EQ (dst_object, Qt))
8318     {
8319       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8320       coding->dst_object = Qnil;
8321       coding->destination = xmalloc (dst_bytes);
8322       coding->dst_bytes = dst_bytes;
8323       coding->dst_multibyte = 0;
8324     }
8325   else
8326     {
8327       coding->dst_object = Qnil;
8328       coding->dst_multibyte = 0;
8329     }
8330
8331   encode_coding (coding);
8332
8333   if (EQ (dst_object, Qt))
8334     {
8335       if (BUFFERP (coding->dst_object))
8336         coding->dst_object = Fbuffer_string ();
8337       else if (coding->raw_destination)
8338         /* This is used to avoid creating huge Lisp string.
8339            NOTE: caller who sets `raw_destination' is also
8340            responsible for freeing `destination' buffer.  */
8341         coding->dst_object = Qnil;
8342       else
8343         {
8344           coding->dst_object
8345             = make_unibyte_string ((char *) coding->destination,
8346                                    coding->produced);
8347           xfree (coding->destination);
8348         }
8349     }
8350
8351   if (saved_pt >= 0)
8352     {
8353       /* This is the case of:
8354          (BUFFERP (src_object) && EQ (src_object, dst_object))
8355          As we have moved PT while replacing the original buffer
8356          contents, we must recover it now.  */
8357       set_buffer_internal (XBUFFER (src_object));
8358       if (saved_pt < from)
8359         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8360       else if (saved_pt < from + chars)
8361         TEMP_SET_PT_BOTH (from, from_byte);
8362       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8363         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8364                           saved_pt_byte + (coding->produced - bytes));
8365       else
8366         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8367                           saved_pt_byte + (coding->produced - bytes));
8368
8369       if (need_marker_adjustment)
8370         {
8371           struct Lisp_Marker *tail;
8372
8373           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8374             if (tail->need_adjustment)
8375               {
8376                 tail->need_adjustment = 0;
8377                 if (tail->insertion_type)
8378                   {
8379                     tail->bytepos = from_byte;
8380                     tail->charpos = from;
8381                   }
8382                 else
8383                   {
8384                     tail->bytepos = from_byte + coding->produced;
8385                     tail->charpos
8386                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8387                          ? tail->bytepos : from + coding->produced_char);
8388                   }
8389               }
8390         }
8391     }
8392
8393   if (kill_src_buffer)
8394     Fkill_buffer (coding->src_object);
8395
8396   Vdeactivate_mark = old_deactivate_mark;
8397   unbind_to (count, Qnil);
8398 }
8399
8400
8401 Lisp_Object
8402 preferred_coding_system (void)
8403 {
8404   int id = coding_categories[coding_priorities[0]].id;
8405
8406   return CODING_ID_NAME (id);
8407 }
8408
8409 #if defined (WINDOWSNT) || defined (CYGWIN)
8410
8411 Lisp_Object
8412 from_unicode (Lisp_Object str)
8413 {
8414   CHECK_STRING (str);
8415   if (!STRING_MULTIBYTE (str) &&
8416       SBYTES (str) & 1)
8417     {
8418       str = Fsubstring (str, make_number (0), make_number (-1));
8419     }
8420
8421   return code_convert_string_norecord (str, Qutf_16le, 0);
8422 }
8423
8424 Lisp_Object
8425 from_unicode_buffer (const wchar_t *wstr)
8426 {
8427     return from_unicode (
8428         make_unibyte_string (
8429             (char *) wstr,
8430             /* we get one of the two final 0 bytes for free. */
8431             1 + sizeof (wchar_t) * wcslen (wstr)));
8432 }
8433
8434 wchar_t *
8435 to_unicode (Lisp_Object str, Lisp_Object *buf)
8436 {
8437   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8438   /* We need to make another copy (in addition to the one made by
8439      code_convert_string_norecord) to ensure that the final string is
8440      _doubly_ zero terminated --- that is, that the string is
8441      terminated by two zero bytes and one utf-16le null character.
8442      Because strings are already terminated with a single zero byte,
8443      we just add one additional zero. */
8444   str = make_uninit_string (SBYTES (*buf) + 1);
8445   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8446   SDATA (str) [SBYTES (*buf)] = '\0';
8447   *buf = str;
8448   return WCSDATA (*buf);
8449 }
8450
8451 #endif /* WINDOWSNT || CYGWIN */
8452
8453 \f
8454 #ifdef emacs
8455 /*** 11. Emacs Lisp library functions ***/
8456
8457 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8458        doc: /* Return t if OBJECT is nil or a coding-system.
8459 See the documentation of `define-coding-system' for information
8460 about coding-system objects.  */)
8461   (Lisp_Object object)
8462 {
8463   if (NILP (object)
8464       || CODING_SYSTEM_ID (object) >= 0)
8465     return Qt;
8466   if (! SYMBOLP (object)
8467       || NILP (Fget (object, Qcoding_system_define_form)))
8468     return Qnil;
8469   return Qt;
8470 }
8471
8472 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8473        Sread_non_nil_coding_system, 1, 1, 0,
8474        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8475   (Lisp_Object prompt)
8476 {
8477   Lisp_Object val;
8478   do
8479     {
8480       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8481                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8482     }
8483   while (SCHARS (val) == 0);
8484   return (Fintern (val, Qnil));
8485 }
8486
8487 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8488        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8489 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8490 Ignores case when completing coding systems (all Emacs coding systems
8491 are lower-case).  */)
8492   (Lisp_Object prompt, Lisp_Object default_coding_system)
8493 {
8494   Lisp_Object val;
8495   ptrdiff_t count = SPECPDL_INDEX ();
8496
8497   if (SYMBOLP (default_coding_system))
8498     default_coding_system = SYMBOL_NAME (default_coding_system);
8499   specbind (Qcompletion_ignore_case, Qt);
8500   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8501                           Qt, Qnil, Qcoding_system_history,
8502                           default_coding_system, Qnil);
8503   unbind_to (count, Qnil);
8504   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8505 }
8506
8507 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8508        1, 1, 0,
8509        doc: /* Check validity of CODING-SYSTEM.
8510 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8511 It is valid if it is nil or a symbol defined as a coding system by the
8512 function `define-coding-system'.  */)
8513   (Lisp_Object coding_system)
8514 {
8515   Lisp_Object define_form;
8516
8517   define_form = Fget (coding_system, Qcoding_system_define_form);
8518   if (! NILP (define_form))
8519     {
8520       Fput (coding_system, Qcoding_system_define_form, Qnil);
8521       safe_eval (define_form);
8522     }
8523   if (!NILP (Fcoding_system_p (coding_system)))
8524     return coding_system;
8525   xsignal1 (Qcoding_system_error, coding_system);
8526 }
8527
8528 \f
8529 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8530    HIGHEST, return the coding system of the highest
8531    priority among the detected coding systems.  Otherwise return a
8532    list of detected coding systems sorted by their priorities.  If
8533    MULTIBYTEP, it is assumed that the bytes are in correct
8534    multibyte form but contains only ASCII and eight-bit chars.
8535    Otherwise, the bytes are raw bytes.
8536
8537    CODING-SYSTEM controls the detection as below:
8538
8539    If it is nil, detect both text-format and eol-format.  If the
8540    text-format part of CODING-SYSTEM is already specified
8541    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8542    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8543    detect only text-format.  */
8544
8545 Lisp_Object
8546 detect_coding_system (const unsigned char *src,
8547                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8548                       bool highest, bool multibytep,
8549                       Lisp_Object coding_system)
8550 {
8551   const unsigned char *src_end = src + src_bytes;
8552   Lisp_Object attrs, eol_type;
       /* While detecting, VAL is a list of coding system IDs; the
          eol-format pass at the end replaces each ID by a name.  */
8553   Lisp_Object val = Qnil;
8554   struct coding_system coding;
8555   ptrdiff_t id;
8556   struct coding_detection_info detect_info;
8557   enum coding_category base_category;
8558   bool null_byte_found = 0, eight_bit_found = 0;
8559
       /* A nil CODING_SYSTEM means `undecided'.  */
8560   if (NILP (coding_system))
8561     coding_system = Qundecided;
8562   setup_coding_system (coding_system, &coding);
8563   attrs = CODING_ID_ATTRS (coding.id);
8564   eol_type = CODING_ID_EOL_TYPE (coding.id);
8565   coding_system = CODING_ATTR_BASE_NAME (attrs);
8566
       /* Describe the whole input to the detectors as one final block.  */
8567   coding.source = src;
8568   coding.src_chars = src_chars;
8569   coding.src_bytes = src_bytes;
8570   coding.src_multibyte = multibytep;
8571   coding.consumed = 0;
8572   coding.mode |= CODING_MODE_LAST_BLOCK;
8573   coding.head_ascii = 0;
8574
8575   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8576
8577   /* At first, detect text-format if necessary.  */
8578   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8579   if (base_category == coding_category_undecided)
8580     {
8581       enum coding_category category IF_LINT (= 0);
8582       struct coding_system *this IF_LINT (= NULL);
8583       int c, i;
8584       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8585                                        inhibit_null_byte_detection);
8586       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8587                                        inhibit_iso_escape_detection);
8588       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8589
8590       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii is incremented only while no 8-bit byte
              has been seen.  The scan stops early once both a null byte
              and an 8-bit byte have been seen, or when the ISO-2022
              detector returns non-zero.  */
8591       for (; src < src_end; src++)
8592         {
8593           c = *src;
8594           if (c & 0x80)
8595             {
8596               eight_bit_found = 1;
8597               if (null_byte_found)
8598                 break;
8599             }
8600           else if (c < 0x20)
8601             {
                   /* ESC, SI and SO trigger the ISO-2022 detector, but
                      only while detect_info.checked is still zero and ISO
                      escape detection is not inhibited.  */
8602               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8603                   && ! inhibit_ied
8604                   && ! detect_info.checked)
8605                 {
8606                   if (detect_coding_iso_2022 (&coding, &detect_info))
8607                     {
8608                       /* We have scanned the whole data.  */
8609                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8610                         {
8611                           /* We didn't find an 8-bit code.  We may
8612                              have found a null-byte, but it's very
8613                              rare that a binary file conforms to
8614                              ISO-2022.  */
8615                           src = src_end;
8616                           coding.head_ascii = src - coding.source;
8617                         }
8618                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8619                       break;
8620                     }
8621                 }
8622               else if (! c && !inhibit_nbd)
8623                 {
8624                   null_byte_found = 1;
8625                   if (eight_bit_found)
8626                     break;
8627                 }
8628               if (! eight_bit_found)
8629                 coding.head_ascii++;
8630             }
8631           else if (! eight_bit_found)
8632             coding.head_ascii++;
8633         }
8634
           /* Consult the per-category information only when the scan
              above saw something beyond plain ASCII or a detector has
              already reported a match.  */
8635       if (null_byte_found || eight_bit_found
8636           || coding.head_ascii < coding.src_bytes
8637           || detect_info.found)
8638         {
8639           if (coding.head_ascii == coding.src_bytes)
8640             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8641             for (i = 0; i < coding_category_raw_text; i++)
8642               {
8643                 category = coding_priorities[i];
8644                 this = coding_categories + category;
8645                 if (detect_info.found & (1 << category))
8646                   break;
8647               }
8648           else
8649             {
8650               if (null_byte_found)
8651                 {
                       /* Mark every category outside the UTF-16 mask as
                          already checked and rejected.  */
8652                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8653                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8654                 }
8655               else if (prefer_utf_8
8656                        && detect_coding_utf_8 (&coding, &detect_info))
8657                 {
                       /* UTF-8 is preferred and its detector succeeded:
                          rule out every category outside the UTF-8
                          mask.  */
8658                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8659                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8660                 }
                   /* Try the categories in priority order.  With HIGHEST
                      the loop stops at the first category found, leaving
                      CATEGORY and THIS pointing at it.  */
8661               for (i = 0; i < coding_category_raw_text; i++)
8662                 {
8663                   category = coding_priorities[i];
8664                   this = coding_categories + category;
8665
8666                   if (this->id < 0)
8667                     {
8668                       /* No coding system of this category is defined.  */
8669                       detect_info.rejected |= (1 << category);
8670                     }
8671                   else if (category >= coding_category_raw_text)
8672                     continue;
8673                   else if (detect_info.checked & (1 << category))
8674                     {
8675                       if (highest
8676                           && (detect_info.found & (1 << category)))
8677                         break;
8678                     }
8679                   else if ((*(this->detector)) (&coding, &detect_info)
8680                            && highest
8681                            && (detect_info.found & (1 << category)))
8682                     {
                           /* Report a concrete byte order in place of the
                              utf-16-auto category.  */
8683                       if (category == coding_category_utf_16_auto)
8684                         {
8685                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8686                             category = coding_category_utf_16_le;
8687                           else
8688                             category = coding_category_utf_16_be;
8689                         }
8690                       break;
8691                     }
8692                 }
8693             }
8694         }
8695
           /* Turn the detection state into the list VAL of IDs.  */
8696       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8697           || null_byte_found)
8698         {
               /* Every category was rejected, or a null byte was seen:
                  answer with the ID of `no-conversion'.  */
8699           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8700           id = CODING_SYSTEM_ID (Qno_conversion);
8701           val = list1 (make_number (id));
8702         }
8703       else if (! detect_info.rejected && ! detect_info.found)
8704         {
               /* Nothing rejected and nothing found: answer with the
                  coding system of the `undecided' category.  */
8705           detect_info.found = CATEGORY_MASK_ANY;
8706           id = coding_categories[coding_category_undecided].id;
8707           val = list1 (make_number (id));
8708         }
8709       else if (highest)
8710         {
8711           if (detect_info.found)
8712             {
                   /* CATEGORY and THIS still refer to the entry at which
                      the priority loop above stopped.  */
8713               detect_info.found = 1 << category;
8714               val = list1 (make_number (this->id));
8715             }
8716           else
                 /* Nothing positively found: take the first category in
                    priority order that was not rejected.  */
8717             for (i = 0; i < coding_category_raw_text; i++)
8718               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8719                 {
8720                   detect_info.found = 1 << coding_priorities[i];
8721                   id = coding_categories[coding_priorities[i]].id;
8722                   val = list1 (make_number (id));
8723                   break;
8724                 }
8725         }
8726       else
8727         {
8728           int mask = detect_info.rejected | detect_info.found;
8729           int found = 0;
8730
               /* Categories neither rejected nor found.  The loop runs
                  from lowest to highest priority and VAL is overwritten
                  each time, so VAL ends up holding only the last such
                  coding system assigned.  */
8731           for (i = coding_category_raw_text - 1; i >= 0; i--)
8732             {
8733               category = coding_priorities[i];
8734               if (! (mask & (1 << category)))
8735                 {
8736                   found |= 1 << category;
8737                   id = coding_categories[category].id;
8738                   if (id >= 0)
8739                     val = list1 (make_number (id));
8740                 }
8741             }
               /* Then push the found categories in front, again from
                  lowest to highest priority, so the highest priority one
                  comes first in VAL.  */
8742           for (i = coding_category_raw_text - 1; i >= 0; i--)
8743             {
8744               category = coding_priorities[i];
8745               if (detect_info.found & (1 << category))
8746                 {
8747                   id = coding_categories[category].id;
8748                   val = Fcons (make_number (id), val);
8749                 }
8750             }
8751           detect_info.found |= found;
8752         }
8753     }
8754   else if (base_category == coding_category_utf_8_auto)
8755     {
           /* Only the signature question is open: pick the with-signature
              or without-signature UTF-8 category.  VAL stays nil if the
              detector fails.  */
8756       if (detect_coding_utf_8 (&coding, &detect_info))
8757         {
8758           struct coding_system *this;
8759
8760           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8761             this = coding_categories + coding_category_utf_8_sig;
8762           else
8763             this = coding_categories + coding_category_utf_8_nosig;
8764           val = list1 (make_number (this->id));
8765         }
8766     }
8767   else if (base_category == coding_category_utf_16_auto)
8768     {
           /* Pick one of the four UTF-16 categories from the found and
              rejected bits.  VAL stays nil if the detector fails.  */
8769       if (detect_coding_utf_16 (&coding, &detect_info))
8770         {
8771           struct coding_system *this;
8772
8773           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8774             this = coding_categories + coding_category_utf_16_le;
8775           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8776             this = coding_categories + coding_category_utf_16_be;
8777           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8778             this = coding_categories + coding_category_utf_16_be_nosig;
8779           else
8780             this = coding_categories + coding_category_utf_16_le_nosig;
8781           val = list1 (make_number (this->id));
8782         }
8783     }
8784   else
8785     {
           /* The text-format is already specified: use it as is.  */
8786       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8787       val = list1 (make_number (coding.id));
8788     }
8789
8790   /* Then, detect eol-format if necessary.  */
8791   {
         /* -1 means that no eol-format is known for that family.  */
8792     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8793     Lisp_Object tail;
8794
8795     if (VECTORP (eol_type))
8796       {
             /* The eol-format is still open.  Detect it from the data,
                separately for ordinary encodings and for each UTF-16 byte
                order among the found categories.  */
8797         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8798           {
                 /* When a null byte was seen, LF is assumed without
                    scanning.  */
8799             if (null_byte_found)
8800               normal_eol = EOL_SEEN_LF;
8801             else
8802               normal_eol = detect_eol (coding.source, src_bytes,
8803                                        coding_category_raw_text);
8804           }
8805         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8806                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8807           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8808                                       coding_category_utf_16_be);
8809         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8810                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8811           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8812                                       coding_category_utf_16_le);
8813       }
8814     else
8815       {
             /* The eol-format was specified by CODING_SYSTEM: use it for
                all three families.  */
8816         if (EQ (eol_type, Qunix))
8817           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8818         else if (EQ (eol_type, Qdos))
8819           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8820         else
8821           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8822       }
8823
         /* Replace each ID in VAL by a coding system name.  When the
            coding system's own eol_type is a vector, elements 0, 1 and 2
            are used for LF, CRLF and CR respectively; otherwise, or when
            no eol-format is known, the name for the ID itself is
            used.  */
8824     for (tail = val; CONSP (tail); tail = XCDR (tail))
8825       {
8826         enum coding_category category;
8827         int this_eol;
8828
8829         id = XINT (XCAR (tail));
8830         attrs = CODING_ID_ATTRS (id);
8831         category = XINT (CODING_ATTR_CATEGORY (attrs));
8832         eol_type = CODING_ID_EOL_TYPE (id);
8833         if (VECTORP (eol_type))
8834           {
8835             if (category == coding_category_utf_16_be
8836                 || category == coding_category_utf_16_be_nosig)
8837               this_eol = utf_16_be_eol;
8838             else if (category == coding_category_utf_16_le
8839                      || category == coding_category_utf_16_le_nosig)
8840               this_eol = utf_16_le_eol;
8841             else
8842               this_eol = normal_eol;
8843
8844             if (this_eol == EOL_SEEN_LF)
8845               XSETCAR (tail, AREF (eol_type, 0));
8846             else if (this_eol == EOL_SEEN_CRLF)
8847               XSETCAR (tail, AREF (eol_type, 1));
8848             else if (this_eol == EOL_SEEN_CR)
8849               XSETCAR (tail, AREF (eol_type, 2));
8850             else
8851               XSETCAR (tail, CODING_ID_NAME (id));
8852           }
8853         else
8854           XSETCAR (tail, CODING_ID_NAME (id));
8855       }
8856   }
8857
       /* With HIGHEST, return only the first element of VAL, or nil when
          VAL is empty.  */
8858   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8859 }
8860
8861
8862 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8863        2, 3, 0,
8864        doc: /* Detect coding system of the text in the region between START and END.
8865 Return a list of possible coding systems ordered by priority.
8866 The coding systems to try and their priorities follows what
8867 the function `coding-system-priority-list' (which see) returns.
8868
8869 If only ASCII characters are found (except for such ISO-2022 control
8870 characters as ESC), it returns a list of single element `undecided'
8871 or its subsidiary coding system according to a detected end-of-line
8872 format.
8873
8874 If optional argument HIGHEST is non-nil, return the coding system of
8875 highest priority.  */)
8876   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8877 {
8878   ptrdiff_t from, to;
8879   ptrdiff_t from_byte, to_byte;
8880
8881   validate_region (&start, &end);
8882   from = XINT (start), to = XINT (end);
8883   from_byte = CHAR_TO_BYTE (from);
8884   to_byte = CHAR_TO_BYTE (to);
8885
8886   if (from < GPT && to >= GPT)
8887     move_gap_both (to, to_byte);
8888
8889   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8890                                to - from, to_byte - from_byte,
8891                                !NILP (highest),
8892                                !NILP (BVAR (current_buffer
8893                                       , enable_multibyte_characters)),
8894                                Qnil);
8895 }
8896
8897 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8898        1, 2, 0,
8899        doc: /* Detect coding system of the text in STRING.
8900 Return a list of possible coding systems ordered by priority.
8901 The coding systems to try and their priorities follows what
8902 the function `coding-system-priority-list' (which see) returns.
8903
8904 If only ASCII characters are found (except for such ISO-2022 control
8905 characters as ESC), it returns a list of single element `undecided'
8906 or its subsidiary coding system according to a detected end-of-line
8907 format.
8908
8909 If optional argument HIGHEST is non-nil, return the coding system of
8910 highest priority.  */)
8911   (Lisp_Object string, Lisp_Object highest)
8912 {
8913   CHECK_STRING (string);
8914
8915   return detect_coding_system (SDATA (string),
8916                                SCHARS (string), SBYTES (string),
8917                                !NILP (highest), STRING_MULTIBYTE (string),
8918                                Qnil);
8919 }
8920
8921
8922 static bool
8923 char_encodable_p (int c, Lisp_Object attrs)
8924 {
8925   Lisp_Object tail;
8926   struct charset *charset;
8927   Lisp_Object translation_table;
8928
8929   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8930   if (! NILP (translation_table))
8931     c = translate_char (translation_table, c);
8932   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8933        CONSP (tail); tail = XCDR (tail))
8934     {
8935       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8936       if (CHAR_CHARSET_P (c, charset))
8937         break;
8938     }
8939   return (! NILP (tail));
8940 }
8941
8942
8943 /* Return a list of coding systems that safely encode the text between
8944    START and END.  If EXCLUDE is non-nil, it is a list of coding
8945    systems not to check.  The returned list doesn't contain any such
8946    coding systems.  In any case, if the text contains only ASCII or is
8947    unibyte, return t.  */
8948
8949 DEFUN ("find-coding-systems-region-internal",
8950        Ffind_coding_systems_region_internal,
8951        Sfind_coding_systems_region_internal, 2, 3, 0,
8952        doc: /* Internal use only.  */)
8953   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8954 {
8955   Lisp_Object coding_attrs_list, safe_codings;
8956   ptrdiff_t start_byte, end_byte;
8957   const unsigned char *p, *pbeg, *pend;
8958   int c;
8959   Lisp_Object tail, elt, work_table;
8960
8961   if (STRINGP (start))
8962     {
8963       if (!STRING_MULTIBYTE (start)
8964           || SCHARS (start) == SBYTES (start))
8965         return Qt;
8966       start_byte = 0;
8967       end_byte = SBYTES (start);
8968     }
8969   else
8970     {
8971       CHECK_NUMBER_COERCE_MARKER (start);
8972       CHECK_NUMBER_COERCE_MARKER (end);
8973       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8974         args_out_of_range (start, end);
8975       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8976         return Qt;
8977       start_byte = CHAR_TO_BYTE (XINT (start));
8978       end_byte = CHAR_TO_BYTE (XINT (end));
8979       if (XINT (end) - XINT (start) == end_byte - start_byte)
8980         return Qt;
8981
8982       if (XINT (start) < GPT && XINT (end) > GPT)
8983         {
8984           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8985             move_gap_both (XINT (start), start_byte);
8986           else
8987             move_gap_both (XINT (end), end_byte);
8988         }
8989     }
8990
8991   coding_attrs_list = Qnil;
8992   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8993     if (NILP (exclude)
8994         || NILP (Fmemq (XCAR (tail), exclude)))
8995       {
8996         Lisp_Object attrs;
8997
8998         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8999         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9000           {
9001             ASET (attrs, coding_attr_trans_tbl,
9002                   get_translation_table (attrs, 1, NULL));
9003             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9004           }
9005       }
9006
9007   if (STRINGP (start))
9008     p = pbeg = SDATA (start);
9009   else
9010     p = pbeg = BYTE_POS_ADDR (start_byte);
9011   pend = p + (end_byte - start_byte);
9012
9013   while (p < pend && ASCII_CHAR_P (*p)) p++;
9014   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9015
9016   work_table = Fmake_char_table (Qnil, Qnil);
9017   while (p < pend)
9018     {
9019       if (ASCII_CHAR_P (*p))
9020         p++;
9021       else
9022         {
9023           c = STRING_CHAR_ADVANCE (p);
9024           if (!NILP (char_table_ref (work_table, c)))
9025             /* This character was already checked.  Ignore it.  */
9026             continue;
9027
9028           charset_map_loaded = 0;
9029           for (tail = coding_attrs_list; CONSP (tail);)
9030             {
9031               elt = XCAR (tail);
9032               if (NILP (elt))
9033                 tail = XCDR (tail);
9034               else if (char_encodable_p (c, elt))
9035                 tail = XCDR (tail);
9036               else if (CONSP (XCDR (tail)))
9037                 {
9038                   XSETCAR (tail, XCAR (XCDR (tail)));
9039                   XSETCDR (tail, XCDR (XCDR (tail)));
9040                 }
9041               else
9042                 {
9043                   XSETCAR (tail, Qnil);
9044                   tail = XCDR (tail);
9045                 }
9046             }
9047           if (charset_map_loaded)
9048             {
9049               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9050
9051               if (STRINGP (start))
9052                 pbeg = SDATA (start);
9053               else
9054                 pbeg = BYTE_POS_ADDR (start_byte);
9055               p = pbeg + p_offset;
9056               pend = pbeg + pend_offset;
9057             }
9058           char_table_set (work_table, c, Qt);
9059         }
9060     }
9061
9062   safe_codings = list2 (Qraw_text, Qno_conversion);
9063   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9064     if (! NILP (XCAR (tail)))
9065       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9066
9067   return safe_codings;
9068 }
9069
9070
9071 DEFUN ("unencodable-char-position", Funencodable_char_position,
9072        Sunencodable_char_position, 3, 5, 0,
9073        doc: /* Return position of first un-encodable character in a region.
9074 START and END specify the region and CODING-SYSTEM specifies the
9075 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9076
9077 If optional 4th argument COUNT is non-nil, it specifies at most how
9078 many un-encodable characters to search.  In this case, the value is a
9079 list of positions.
9080
9081 If optional 5th argument STRING is non-nil, it is a string to search
9082 for un-encodable characters.  In that case, START and END are indexes
9083 to the string and treated as in `substring'.  */)
9084   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9085    Lisp_Object count, Lisp_Object string)
9086 {
9087   EMACS_INT n;
9088   struct coding_system coding;
9089   Lisp_Object attrs, charset_list, translation_table;
9090   Lisp_Object positions;
9091   ptrdiff_t from, to;
9092   const unsigned char *p, *stop, *pend;
9093   bool ascii_compatible;
9094
9095   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9096   attrs = CODING_ID_ATTRS (coding.id);
9097   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9098     return Qnil;
9099   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9100   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9101   translation_table = get_translation_table (attrs, 1, NULL);
9102
9103   if (NILP (string))
9104     {
9105       validate_region (&start, &end);
9106       from = XINT (start);
9107       to = XINT (end);
9108       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9109           || (ascii_compatible
9110               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9111         return Qnil;
9112       p = CHAR_POS_ADDR (from);
9113       pend = CHAR_POS_ADDR (to);
9114       if (from < GPT && to >= GPT)
9115         stop = GPT_ADDR;
9116       else
9117         stop = pend;
9118     }
9119   else
9120     {
9121       CHECK_STRING (string);
9122       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9123       if (! STRING_MULTIBYTE (string))
9124         return Qnil;
9125       p = SDATA (string) + string_char_to_byte (string, from);
9126       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9127       if (ascii_compatible && (to - from) == (pend - p))
9128         return Qnil;
9129     }
9130
9131   if (NILP (count))
9132     n = 1;
9133   else
9134     {
9135       CHECK_NATNUM (count);
9136       n = XINT (count);
9137     }
9138
9139   positions = Qnil;
9140   charset_map_loaded = 0;
9141   while (1)
9142     {
9143       int c;
9144
9145       if (ascii_compatible)
9146         while (p < stop && ASCII_CHAR_P (*p))
9147           p++, from++;
9148       if (p >= stop)
9149         {
9150           if (p >= pend)
9151             break;
9152           stop = pend;
9153           p = GAP_END_ADDR;
9154         }
9155
9156       c = STRING_CHAR_ADVANCE (p);
9157       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9158           && ! char_charset (translate_char (translation_table, c),
9159                              charset_list, NULL))
9160         {
9161           positions = Fcons (make_number (from), positions);
9162           n--;
9163           if (n == 0)
9164             break;
9165         }
9166
9167       from++;
9168       if (charset_map_loaded && NILP (string))
9169         {
9170           p = CHAR_POS_ADDR (from);
9171           pend = CHAR_POS_ADDR (to);
9172           if (from < GPT && to >= GPT)
9173             stop = GPT_ADDR;
9174           else
9175             stop = pend;
9176           charset_map_loaded = 0;
9177         }
9178     }
9179
9180   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9181 }
9182
9183
9184 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9185        Scheck_coding_systems_region, 3, 3, 0,
9186        doc: /* Check if the region is encodable by coding systems.
9187
9188 START and END are buffer positions specifying the region.
9189 CODING-SYSTEM-LIST is a list of coding systems to check.
9190
9191 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9192 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9193 whole region, POS0, POS1, ... are buffer positions where non-encodable
9194 characters are found.
9195
9196 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9197 value is nil.
9198
9199 START may be a string.  In that case, check if the string is
9200 encodable, and the value contains indices to the string instead of
9201 buffer positions.  END is ignored.
9202
9203 If the current buffer (or START if it is a string) is unibyte, the value
9204 is nil.  */)
9205   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9206 {
9207   Lisp_Object list;
9208   ptrdiff_t start_byte, end_byte;
9209   ptrdiff_t pos;
9210   const unsigned char *p, *pbeg, *pend;
9211   int c;
9212   Lisp_Object tail, elt, attrs;
9213
9214   if (STRINGP (start))
9215     {
9216       if (!STRING_MULTIBYTE (start)
9217           || SCHARS (start) == SBYTES (start))
9218         return Qnil;
9219       start_byte = 0;
9220       end_byte = SBYTES (start);
9221       pos = 0;
9222     }
9223   else
9224     {
9225       CHECK_NUMBER_COERCE_MARKER (start);
9226       CHECK_NUMBER_COERCE_MARKER (end);
9227       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9228         args_out_of_range (start, end);
9229       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9230         return Qnil;
9231       start_byte = CHAR_TO_BYTE (XINT (start));
9232       end_byte = CHAR_TO_BYTE (XINT (end));
9233       if (XINT (end) - XINT (start) == end_byte - start_byte)
9234         return Qnil;
9235
9236       if (XINT (start) < GPT && XINT (end) > GPT)
9237         {
9238           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9239             move_gap_both (XINT (start), start_byte);
9240           else
9241             move_gap_both (XINT (end), end_byte);
9242         }
9243       pos = XINT (start);
9244     }
9245
9246   list = Qnil;
9247   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9248     {
9249       elt = XCAR (tail);
9250       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9251       ASET (attrs, coding_attr_trans_tbl,
9252             get_translation_table (attrs, 1, NULL));
9253       list = Fcons (list2 (elt, attrs), list);
9254     }
9255
9256   if (STRINGP (start))
9257     p = pbeg = SDATA (start);
9258   else
9259     p = pbeg = BYTE_POS_ADDR (start_byte);
9260   pend = p + (end_byte - start_byte);
9261
9262   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9263   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9264
9265   while (p < pend)
9266     {
9267       if (ASCII_CHAR_P (*p))
9268         p++;
9269       else
9270         {
9271           c = STRING_CHAR_ADVANCE (p);
9272
9273           charset_map_loaded = 0;
9274           for (tail = list; CONSP (tail); tail = XCDR (tail))
9275             {
9276               elt = XCDR (XCAR (tail));
9277               if (! char_encodable_p (c, XCAR (elt)))
9278                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9279             }
9280           if (charset_map_loaded)
9281             {
9282               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9283
9284               if (STRINGP (start))
9285                 pbeg = SDATA (start);
9286               else
9287                 pbeg = BYTE_POS_ADDR (start_byte);
9288               p = pbeg + p_offset;
9289               pend = pbeg + pend_offset;
9290             }
9291         }
9292       pos++;
9293     }
9294
9295   tail = list;
9296   list = Qnil;
9297   for (; CONSP (tail); tail = XCDR (tail))
9298     {
9299       elt = XCAR (tail);
9300       if (CONSP (XCDR (XCDR (elt))))
9301         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9302                       list);
9303     }
9304
9305   return list;
9306 }
9307
9308
9309 static Lisp_Object
9310 code_convert_region (Lisp_Object start, Lisp_Object end,
9311                      Lisp_Object coding_system, Lisp_Object dst_object,
9312                      bool encodep, bool norecord)
9313 {
9314   struct coding_system coding;
9315   ptrdiff_t from, from_byte, to, to_byte;
9316   Lisp_Object src_object;
9317
9318   if (NILP (coding_system))
9319     coding_system = Qno_conversion;
9320   else
9321     CHECK_CODING_SYSTEM (coding_system);
9322   src_object = Fcurrent_buffer ();
9323   if (NILP (dst_object))
9324     dst_object = src_object;
9325   else if (! EQ (dst_object, Qt))
9326     CHECK_BUFFER (dst_object);
9327
9328   validate_region (&start, &end);
9329   from = XFASTINT (start);
9330   from_byte = CHAR_TO_BYTE (from);
9331   to = XFASTINT (end);
9332   to_byte = CHAR_TO_BYTE (to);
9333
9334   setup_coding_system (coding_system, &coding);
9335   coding.mode |= CODING_MODE_LAST_BLOCK;
9336
9337   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9338     {
9339       struct buffer *buf = XBUFFER (dst_object);
9340       ptrdiff_t buf_pt = BUF_PT (buf);
9341
9342       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9343     }
9344
9345   if (encodep)
9346     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9347                           dst_object);
9348   else
9349     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9350                           dst_object);
9351   if (! norecord)
9352     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9353
9354   return (BUFFERP (dst_object)
9355           ? make_number (coding.produced_char)
9356           : coding.dst_object);
9357 }
9358
9359
9360 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9361        3, 4, "r\nzCoding system: ",
9362        doc: /* Decode the current region from the specified coding system.
9363 When called from a program, takes four arguments:
9364         START, END, CODING-SYSTEM, and DESTINATION.
9365 START and END are buffer positions.
9366
9367 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9368 If nil, the region between START and END is replaced by the decoded text.
9369 If buffer, the decoded text is inserted in that buffer after point (point
9370 does not move).
9371 In those cases, the length of the decoded text is returned.
9372 If DESTINATION is t, the decoded text is returned.
9373
9374 This function sets `last-coding-system-used' to the precise coding system
9375 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9376 not fully specified.)  */)
9377   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9378 {
9379   return code_convert_region (start, end, coding_system, destination, 0, 0);
9380 }
9381
9382 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9383        3, 4, "r\nzCoding system: ",
9384        doc: /* Encode the current region by specified coding system.
9385 When called from a program, takes four arguments:
9386         START, END, CODING-SYSTEM and DESTINATION.
9387 START and END are buffer positions.
9388
9389 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9390 If nil, the region between START and END is replace by the encoded text.
9391 If buffer, the encoded text is inserted in that buffer after point (point
9392 does not move).
9393 In those cases, the length of the encoded text is returned.
9394 If DESTINATION is t, the encoded text is returned.
9395
9396 This function sets `last-coding-system-used' to the precise coding system
9397 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9398 not fully specified.)  */)
9399   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9400 {
9401   return code_convert_region (start, end, coding_system, destination, 1, 0);
9402 }
9403
9404 Lisp_Object
9405 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9406                      Lisp_Object dst_object, bool encodep, bool nocopy,
9407                      bool norecord)
9408 {
9409   struct coding_system coding;
9410   ptrdiff_t chars, bytes;
9411
9412   CHECK_STRING (string);
9413   if (NILP (coding_system))
9414     {
9415       if (! norecord)
9416         Vlast_coding_system_used = Qno_conversion;
9417       if (NILP (dst_object))
9418         return (nocopy ? Fcopy_sequence (string) : string);
9419     }
9420
9421   if (NILP (coding_system))
9422     coding_system = Qno_conversion;
9423   else
9424     CHECK_CODING_SYSTEM (coding_system);
9425   if (NILP (dst_object))
9426     dst_object = Qt;
9427   else if (! EQ (dst_object, Qt))
9428     CHECK_BUFFER (dst_object);
9429
9430   setup_coding_system (coding_system, &coding);
9431   coding.mode |= CODING_MODE_LAST_BLOCK;
9432   chars = SCHARS (string);
9433   bytes = SBYTES (string);
9434
9435   if (BUFFERP (dst_object))
9436     {
9437       struct buffer *buf = XBUFFER (dst_object);
9438       ptrdiff_t buf_pt = BUF_PT (buf);
9439
9440       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9441     }
9442
9443   if (encodep)
9444     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9445   else
9446     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9447   if (! norecord)
9448     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9449
9450   return (BUFFERP (dst_object)
9451           ? make_number (coding.produced_char)
9452           : coding.dst_object);
9453 }
9454
9455
9456 /* Encode or decode STRING according to CODING_SYSTEM.
9457    Do not set Vlast_coding_system_used.
9458
9459    This function is called only from macros DECODE_FILE and
9460    ENCODE_FILE, thus we ignore character composition.  */
9461
9462 Lisp_Object
9463 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9464                               bool encodep)
9465 {
9466   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9467 }
9468
9469 /* Encode or decode a file name, to or from a unibyte string suitable
9470    for passing to C library functions.  */
9471 Lisp_Object
9472 decode_file_name (Lisp_Object fname)
9473 {
9474 #ifdef WINDOWSNT
9475   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9476      converts the file names either to UTF-16LE or to the system ANSI
9477      codepage internally, depending on the underlying OS; see w32.c.  */
9478   if (! NILP (Fcoding_system_p (Qutf_8)))
9479     return code_convert_string_norecord (fname, Qutf_8, 0);
9480   return fname;
9481 #else  /* !WINDOWSNT */
9482   if (! NILP (Vfile_name_coding_system))
9483     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9484   else if (! NILP (Vdefault_file_name_coding_system))
9485     return code_convert_string_norecord (fname,
9486                                          Vdefault_file_name_coding_system, 0);
9487   else
9488     return fname;
9489 #endif
9490 }
9491
9492 Lisp_Object
9493 encode_file_name (Lisp_Object fname)
9494 {
9495   /* This is especially important during bootstrap and dumping, when
9496      file-name encoding is not yet known, and therefore any non-ASCII
9497      file names are unibyte strings, and could only be thrashed if we
9498      try to encode them.  */
9499   if (!STRING_MULTIBYTE (fname))
9500     return fname;
9501 #ifdef WINDOWSNT
9502   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9503      converts the file names either to UTF-16LE or to the system ANSI
9504      codepage internally, depending on the underlying OS; see w32.c.  */
9505   if (! NILP (Fcoding_system_p (Qutf_8)))
9506     return code_convert_string_norecord (fname, Qutf_8, 1);
9507   return fname;
9508 #else  /* !WINDOWSNT */
9509   if (! NILP (Vfile_name_coding_system))
9510     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9511   else if (! NILP (Vdefault_file_name_coding_system))
9512     return code_convert_string_norecord (fname,
9513                                          Vdefault_file_name_coding_system, 1);
9514   else
9515     return fname;
9516 #endif
9517 }
9518
9519 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9520        2, 4, 0,
9521        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9522
9523 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9524 if the decoding operation is trivial.
9525
9526 Optional fourth arg BUFFER non-nil means that the decoded text is
9527 inserted in that buffer after point (point does not move).  In this
9528 case, the return value is the length of the decoded text.
9529
9530 This function sets `last-coding-system-used' to the precise coding system
9531 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9532 not fully specified.)  */)
9533   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9534 {
9535   return code_convert_string (string, coding_system, buffer,
9536                               0, ! NILP (nocopy), 0);
9537 }
9538
9539 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9540        2, 4, 0,
9541        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9542
9543 Optional third arg NOCOPY non-nil means it is OK to return STRING
9544 itself if the encoding operation is trivial.
9545
9546 Optional fourth arg BUFFER non-nil means that the encoded text is
9547 inserted in that buffer after point (point does not move).  In this
9548 case, the return value is the length of the encoded text.
9549
9550 This function sets `last-coding-system-used' to the precise coding system
9551 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9552 not fully specified.)  */)
9553   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9554 {
9555   return code_convert_string (string, coding_system, buffer,
9556                               1, ! NILP (nocopy), 0);
9557 }
9558
9559 \f
9560 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9561        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9562 Return the corresponding character.  */)
9563   (Lisp_Object code)
9564 {
9565   Lisp_Object spec, attrs, val;
9566   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9567   EMACS_INT ch;
9568   int c;
9569
9570   CHECK_NATNUM (code);
9571   ch = XFASTINT (code);
9572   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9573   attrs = AREF (spec, 0);
9574
9575   if (ASCII_CHAR_P (ch)
9576       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9577     return code;
9578
9579   val = CODING_ATTR_CHARSET_LIST (attrs);
9580   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9581   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9582   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9583
9584   if (ch <= 0x7F)
9585     {
9586       c = ch;
9587       charset = charset_roman;
9588     }
9589   else if (ch >= 0xA0 && ch < 0xDF)
9590     {
9591       c = ch - 0x80;
9592       charset = charset_kana;
9593     }
9594   else
9595     {
9596       EMACS_INT c1 = ch >> 8;
9597       int c2 = ch & 0xFF;
9598
9599       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9600           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9601         error ("Invalid code: %"pI"d", ch);
9602       c = ch;
9603       SJIS_TO_JIS (c);
9604       charset = charset_kanji;
9605     }
9606   c = DECODE_CHAR (charset, c);
9607   if (c < 0)
9608     error ("Invalid code: %"pI"d", ch);
9609   return make_number (c);
9610 }
9611
9612
9613 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9614        doc: /* Encode a Japanese character CH to shift_jis encoding.
9615 Return the corresponding code in SJIS.  */)
9616   (Lisp_Object ch)
9617 {
9618   Lisp_Object spec, attrs, charset_list;
9619   int c;
9620   struct charset *charset;
9621   unsigned code;
9622
9623   CHECK_CHARACTER (ch);
9624   c = XFASTINT (ch);
9625   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9626   attrs = AREF (spec, 0);
9627
9628   if (ASCII_CHAR_P (c)
9629       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9630     return ch;
9631
9632   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9633   charset = char_charset (c, charset_list, &code);
9634   if (code == CHARSET_INVALID_CODE (charset))
9635     error ("Can't encode by shift_jis encoding: %c", c);
9636   JIS_TO_SJIS (code);
9637
9638   return make_number (code);
9639 }
9640
9641 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9642        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9643 Return the corresponding character.  */)
9644   (Lisp_Object code)
9645 {
9646   Lisp_Object spec, attrs, val;
9647   struct charset *charset_roman, *charset_big5, *charset;
9648   EMACS_INT ch;
9649   int c;
9650
9651   CHECK_NATNUM (code);
9652   ch = XFASTINT (code);
9653   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9654   attrs = AREF (spec, 0);
9655
9656   if (ASCII_CHAR_P (ch)
9657       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9658     return code;
9659
9660   val = CODING_ATTR_CHARSET_LIST (attrs);
9661   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9662   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9663
9664   if (ch <= 0x7F)
9665     {
9666       c = ch;
9667       charset = charset_roman;
9668     }
9669   else
9670     {
9671       EMACS_INT b1 = ch >> 8;
9672       int b2 = ch & 0x7F;
9673       if (b1 < 0xA1 || b1 > 0xFE
9674           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9675         error ("Invalid code: %"pI"d", ch);
9676       c = ch;
9677       charset = charset_big5;
9678     }
9679   c = DECODE_CHAR (charset, c);
9680   if (c < 0)
9681     error ("Invalid code: %"pI"d", ch);
9682   return make_number (c);
9683 }
9684
9685 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9686        doc: /* Encode the Big5 character CH to BIG5 coding system.
9687 Return the corresponding character code in Big5.  */)
9688   (Lisp_Object ch)
9689 {
9690   Lisp_Object spec, attrs, charset_list;
9691   struct charset *charset;
9692   int c;
9693   unsigned code;
9694
9695   CHECK_CHARACTER (ch);
9696   c = XFASTINT (ch);
9697   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9698   attrs = AREF (spec, 0);
9699   if (ASCII_CHAR_P (c)
9700       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9701     return ch;
9702
9703   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9704   charset = char_charset (c, charset_list, &code);
9705   if (code == CHARSET_INVALID_CODE (charset))
9706     error ("Can't encode by Big5 encoding: %c", c);
9707
9708   return make_number (code);
9709 }
9710
9711 \f
9712 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9713        Sset_terminal_coding_system_internal, 1, 2, 0,
9714        doc: /* Internal use only.  */)
9715   (Lisp_Object coding_system, Lisp_Object terminal)
9716 {
9717   struct terminal *term = decode_live_terminal (terminal);
9718   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9719   CHECK_SYMBOL (coding_system);
9720   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9721   /* We had better not send unsafe characters to terminal.  */
9722   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9723   /* Character composition should be disabled.  */
9724   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9725   terminal_coding->src_multibyte = 1;
9726   terminal_coding->dst_multibyte = 0;
9727   tset_charset_list
9728     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9729             ? coding_charset_list (terminal_coding)
9730             : list1 (make_number (charset_ascii))));
9731   return Qnil;
9732 }
9733
9734 DEFUN ("set-safe-terminal-coding-system-internal",
9735        Fset_safe_terminal_coding_system_internal,
9736        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9737        doc: /* Internal use only.  */)
9738   (Lisp_Object coding_system)
9739 {
9740   CHECK_SYMBOL (coding_system);
9741   setup_coding_system (Fcheck_coding_system (coding_system),
9742                        &safe_terminal_coding);
9743   /* Character composition should be disabled.  */
9744   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9745   safe_terminal_coding.src_multibyte = 1;
9746   safe_terminal_coding.dst_multibyte = 0;
9747   return Qnil;
9748 }
9749
9750 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9751        Sterminal_coding_system, 0, 1, 0,
9752        doc: /* Return coding system specified for terminal output on the given terminal.
9753 TERMINAL may be a terminal object, a frame, or nil for the selected
9754 frame's terminal device.  */)
9755   (Lisp_Object terminal)
9756 {
9757   struct coding_system *terminal_coding
9758     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9759   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9760
9761   /* For backward compatibility, return nil if it is `undecided'.  */
9762   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9763 }
9764
9765 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9766        Sset_keyboard_coding_system_internal, 1, 2, 0,
9767        doc: /* Internal use only.  */)
9768   (Lisp_Object coding_system, Lisp_Object terminal)
9769 {
9770   struct terminal *t = decode_live_terminal (terminal);
9771   CHECK_SYMBOL (coding_system);
9772   if (NILP (coding_system))
9773     coding_system = Qno_conversion;
9774   else
9775     Fcheck_coding_system (coding_system);
9776   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9777   /* Character composition should be disabled.  */
9778   TERMINAL_KEYBOARD_CODING (t)->common_flags
9779     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9780   return Qnil;
9781 }
9782
9783 DEFUN ("keyboard-coding-system",
9784        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9785        doc: /* Return coding system specified for decoding keyboard input.  */)
9786   (Lisp_Object terminal)
9787 {
9788   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9789                          (decode_live_terminal (terminal))->id);
9790 }
9791
9792 \f
9793 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9794        Sfind_operation_coding_system,  1, MANY, 0,
9795        doc: /* Choose a coding system for an operation based on the target name.
9796 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9797 DECODING-SYSTEM is the coding system to use for decoding
9798 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9799 for encoding (in case OPERATION does encoding).
9800
9801 The first argument OPERATION specifies an I/O primitive:
9802   For file I/O, `insert-file-contents' or `write-region'.
9803   For process I/O, `call-process', `call-process-region', or `start-process'.
9804   For network I/O, `open-network-stream'.
9805
9806 The remaining arguments should be the same arguments that were passed
9807 to the primitive.  Depending on which primitive, one of those arguments
9808 is selected as the TARGET.  For example, if OPERATION does file I/O,
9809 whichever argument specifies the file name is TARGET.
9810
9811 TARGET has a meaning which depends on OPERATION:
9812   For file I/O, TARGET is a file name (except for the special case below).
9813   For process I/O, TARGET is a process name.
9814   For network I/O, TARGET is a service name or a port number.
9815
9816 This function looks up what is specified for TARGET in
9817 `file-coding-system-alist', `process-coding-system-alist',
9818 or `network-coding-system-alist' depending on OPERATION.
9819 They may specify a coding system, a cons of coding systems,
9820 or a function symbol to call.
9821 In the last case, we call the function with one argument,
9822 which is a list of all the arguments given to this function.
9823 If the function can't decide a coding system, it can return
9824 `undecided' so that the normal code-detection is performed.
9825
9826 If OPERATION is `insert-file-contents', the argument corresponding to
9827 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9828 file name to look up, and BUFFER is a buffer that contains the file's
9829 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9830 function to call for FILENAME, that function should examine the
9831 contents of BUFFER instead of reading the file.
9832
9833 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9834   (ptrdiff_t nargs, Lisp_Object *args)
9835 {
9836   Lisp_Object operation, target_idx, target, val;
9837   register Lisp_Object chain;
9838
9839   if (nargs < 2)
9840     error ("Too few arguments");
9841   operation = args[0];
9842   if (!SYMBOLP (operation)
9843       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9844     error ("Invalid first argument");
9845   if (nargs <= 1 + XFASTINT (target_idx))
9846     error ("Too few arguments for operation `%s'",
9847            SDATA (SYMBOL_NAME (operation)));
9848   target = args[XFASTINT (target_idx) + 1];
9849   if (!(STRINGP (target)
9850         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9851             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9852         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9853     error ("Invalid argument %"pI"d of operation `%s'",
9854            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9855   if (CONSP (target))
9856     target = XCAR (target);
9857
9858   chain = ((EQ (operation, Qinsert_file_contents)
9859             || EQ (operation, Qwrite_region))
9860            ? Vfile_coding_system_alist
9861            : (EQ (operation, Qopen_network_stream)
9862               ? Vnetwork_coding_system_alist
9863               : Vprocess_coding_system_alist));
9864   if (NILP (chain))
9865     return Qnil;
9866
9867   for (; CONSP (chain); chain = XCDR (chain))
9868     {
9869       Lisp_Object elt;
9870
9871       elt = XCAR (chain);
9872       if (CONSP (elt)
9873           && ((STRINGP (target)
9874                && STRINGP (XCAR (elt))
9875                && fast_string_match (XCAR (elt), target) >= 0)
9876               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9877         {
9878           val = XCDR (elt);
9879           /* Here, if VAL is both a valid coding system and a valid
9880              function symbol, we return VAL as a coding system.  */
9881           if (CONSP (val))
9882             return val;
9883           if (! SYMBOLP (val))
9884             return Qnil;
9885           if (! NILP (Fcoding_system_p (val)))
9886             return Fcons (val, val);
9887           if (! NILP (Ffboundp (val)))
9888             {
9889               /* We use call1 rather than safe_call1
9890                  so as to get bug reports about functions called here
9891                  which don't handle the current interface.  */
9892               val = call1 (val, Flist (nargs, args));
9893               if (CONSP (val))
9894                 return val;
9895               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9896                 return Fcons (val, val);
9897             }
9898           return Qnil;
9899         }
9900     }
9901   return Qnil;
9902 }
9903
9904 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9905        Sset_coding_system_priority, 0, MANY, 0,
9906        doc: /* Assign higher priority to the coding systems given as arguments.
9907 If multiple coding systems belong to the same category,
9908 all but the first one are ignored.
9909
9910 usage: (set-coding-system-priority &rest coding-systems)  */)
9911   (ptrdiff_t nargs, Lisp_Object *args)
9912 {
9913   ptrdiff_t i, j;
9914   bool changed[coding_category_max];
9915   enum coding_category priorities[coding_category_max];
9916
9917   memset (changed, 0, sizeof changed);
9918
9919   for (i = j = 0; i < nargs; i++)
9920     {
9921       enum coding_category category;
9922       Lisp_Object spec, attrs;
9923
9924       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9925       attrs = AREF (spec, 0);
9926       category = XINT (CODING_ATTR_CATEGORY (attrs));
9927       if (changed[category])
9928         /* Ignore this coding system because a coding system of the
9929            same category already had a higher priority.  */
9930         continue;
9931       changed[category] = 1;
9932       priorities[j++] = category;
9933       if (coding_categories[category].id >= 0
9934           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9935         setup_coding_system (args[i], &coding_categories[category]);
9936       Fset (AREF (Vcoding_category_table, category), args[i]);
9937     }
9938
9939   /* Now we have decided top J priorities.  Reflect the order of the
9940      original priorities to the remaining priorities.  */
9941
9942   for (i = j, j = 0; i < coding_category_max; i++, j++)
9943     {
9944       while (j < coding_category_max
9945              && changed[coding_priorities[j]])
9946         j++;
9947       if (j == coding_category_max)
9948         emacs_abort ();
9949       priorities[i] = coding_priorities[j];
9950     }
9951
9952   memcpy (coding_priorities, priorities, sizeof priorities);
9953
9954   /* Update `coding-category-list'.  */
9955   Vcoding_category_list = Qnil;
9956   for (i = coding_category_max; i-- > 0; )
9957     Vcoding_category_list
9958       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9959                Vcoding_category_list);
9960
9961   return Qnil;
9962 }
9963
9964 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9965        Scoding_system_priority_list, 0, 1, 0,
9966        doc: /* Return a list of coding systems ordered by their priorities.
9967 The list contains a subset of coding systems; i.e. coding systems
9968 assigned to each coding category (see `coding-category-list').
9969
9970 HIGHESTP non-nil means just return the highest priority one.  */)
9971   (Lisp_Object highestp)
9972 {
9973   int i;
9974   Lisp_Object val;
9975
9976   for (i = 0, val = Qnil; i < coding_category_max; i++)
9977     {
9978       enum coding_category category = coding_priorities[i];
9979       int id = coding_categories[category].id;
9980       Lisp_Object attrs;
9981
9982       if (id < 0)
9983         continue;
9984       attrs = CODING_ID_ATTRS (id);
9985       if (! NILP (highestp))
9986         return CODING_ATTR_BASE_NAME (attrs);
9987       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9988     }
9989   return Fnreverse (val);
9990 }
9991
9992 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9993
9994 static Lisp_Object
9995 make_subsidiaries (Lisp_Object base)
9996 {
9997   Lisp_Object subsidiaries;
9998   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9999   USE_SAFE_ALLOCA;
10000   char *buf = SAFE_ALLOCA (base_name_len + 6);
10001   int i;
10002
10003   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10004   subsidiaries = make_uninit_vector (3);
10005   for (i = 0; i < 3; i++)
10006     {
10007       strcpy (buf + base_name_len, suffixes[i]);
10008       ASET (subsidiaries, i, intern (buf));
10009     }
10010   SAFE_FREE ();
10011   return subsidiaries;
10012 }
10013
10014
10015 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10016        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10017        doc: /* For internal use only.
10018 usage: (define-coding-system-internal ...)  */)
10019   (ptrdiff_t nargs, Lisp_Object *args)
10020 {
10021   Lisp_Object name;
10022   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10023   Lisp_Object attrs;            /* Vector of attributes.  */
10024   Lisp_Object eol_type;
10025   Lisp_Object aliases;
10026   Lisp_Object coding_type, charset_list, safe_charsets;
10027   enum coding_category category;
10028   Lisp_Object tail, val;
10029   int max_charset_id = 0;
10030   int i;
10031
10032   if (nargs < coding_arg_max)
10033     goto short_args;
10034
10035   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10036
10037   name = args[coding_arg_name];
10038   CHECK_SYMBOL (name);
10039   ASET (attrs, coding_attr_base_name, name);
10040
10041   val = args[coding_arg_mnemonic];
10042   if (! STRINGP (val))
10043     CHECK_CHARACTER (val);
10044   ASET (attrs, coding_attr_mnemonic, val);
10045
10046   coding_type = args[coding_arg_coding_type];
10047   CHECK_SYMBOL (coding_type);
10048   ASET (attrs, coding_attr_type, coding_type);
10049
10050   charset_list = args[coding_arg_charset_list];
10051   if (SYMBOLP (charset_list))
10052     {
10053       if (EQ (charset_list, Qiso_2022))
10054         {
10055           if (! EQ (coding_type, Qiso_2022))
10056             error ("Invalid charset-list");
10057           charset_list = Viso_2022_charset_list;
10058         }
10059       else if (EQ (charset_list, Qemacs_mule))
10060         {
10061           if (! EQ (coding_type, Qemacs_mule))
10062             error ("Invalid charset-list");
10063           charset_list = Vemacs_mule_charset_list;
10064         }
10065       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10066         {
10067           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10068             error ("Invalid charset-list");
10069           if (max_charset_id < XFASTINT (XCAR (tail)))
10070             max_charset_id = XFASTINT (XCAR (tail));
10071         }
10072     }
10073   else
10074     {
10075       charset_list = Fcopy_sequence (charset_list);
10076       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10077         {
10078           struct charset *charset;
10079
10080           val = XCAR (tail);
10081           CHECK_CHARSET_GET_CHARSET (val, charset);
10082           if (EQ (coding_type, Qiso_2022)
10083               ? CHARSET_ISO_FINAL (charset) < 0
10084               : EQ (coding_type, Qemacs_mule)
10085               ? CHARSET_EMACS_MULE_ID (charset) < 0
10086               : 0)
10087             error ("Can't handle charset `%s'",
10088                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10089
10090           XSETCAR (tail, make_number (charset->id));
10091           if (max_charset_id < charset->id)
10092             max_charset_id = charset->id;
10093         }
10094     }
10095   ASET (attrs, coding_attr_charset_list, charset_list);
10096
10097   safe_charsets = make_uninit_string (max_charset_id + 1);
10098   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10099   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10100     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10101   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10102
10103   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10104
10105   val = args[coding_arg_decode_translation_table];
10106   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10107     CHECK_SYMBOL (val);
10108   ASET (attrs, coding_attr_decode_tbl, val);
10109
10110   val = args[coding_arg_encode_translation_table];
10111   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10112     CHECK_SYMBOL (val);
10113   ASET (attrs, coding_attr_encode_tbl, val);
10114
10115   val = args[coding_arg_post_read_conversion];
10116   CHECK_SYMBOL (val);
10117   ASET (attrs, coding_attr_post_read, val);
10118
10119   val = args[coding_arg_pre_write_conversion];
10120   CHECK_SYMBOL (val);
10121   ASET (attrs, coding_attr_pre_write, val);
10122
10123   val = args[coding_arg_default_char];
10124   if (NILP (val))
10125     ASET (attrs, coding_attr_default_char, make_number (' '));
10126   else
10127     {
10128       CHECK_CHARACTER (val);
10129       ASET (attrs, coding_attr_default_char, val);
10130     }
10131
10132   val = args[coding_arg_for_unibyte];
10133   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10134
10135   val = args[coding_arg_plist];
10136   CHECK_LIST (val);
10137   ASET (attrs, coding_attr_plist, val);
10138
10139   if (EQ (coding_type, Qcharset))
10140     {
10141       /* Generate a lisp vector of 256 elements.  Each element is nil,
10142          integer, or a list of charset IDs.
10143
10144          If Nth element is nil, the byte code N is invalid in this
10145          coding system.
10146
10147          If Nth element is a number NUM, N is the first byte of a
10148          charset whose ID is NUM.
10149
10150          If Nth element is a list of charset IDs, N is the first byte
10151          of one of them.  The list is sorted by dimensions of the
10152          charsets.  A charset of smaller dimension comes first. */
10153       val = Fmake_vector (make_number (256), Qnil);
10154
10155       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10156         {
10157           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10158           int dim = CHARSET_DIMENSION (charset);
10159           int idx = (dim - 1) * 4;
10160
10161           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10162             ASET (attrs, coding_attr_ascii_compat, Qt);
10163
10164           for (i = charset->code_space[idx];
10165                i <= charset->code_space[idx + 1]; i++)
10166             {
10167               Lisp_Object tmp, tmp2;
10168               int dim2;
10169
10170               tmp = AREF (val, i);
10171               if (NILP (tmp))
10172                 tmp = XCAR (tail);
10173               else if (NUMBERP (tmp))
10174                 {
10175                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10176                   if (dim < dim2)
10177                     tmp = list2 (XCAR (tail), tmp);
10178                   else
10179                     tmp = list2 (tmp, XCAR (tail));
10180                 }
10181               else
10182                 {
10183                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10184                     {
10185                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10186                       if (dim < dim2)
10187                         break;
10188                     }
10189                   if (NILP (tmp2))
10190                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10191                   else
10192                     {
10193                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10194                       XSETCAR (tmp2, XCAR (tail));
10195                     }
10196                 }
10197               ASET (val, i, tmp);
10198             }
10199         }
10200       ASET (attrs, coding_attr_charset_valids, val);
10201       category = coding_category_charset;
10202     }
10203   else if (EQ (coding_type, Qccl))
10204     {
10205       Lisp_Object valids;
10206
10207       if (nargs < coding_arg_ccl_max)
10208         goto short_args;
10209
10210       val = args[coding_arg_ccl_decoder];
10211       CHECK_CCL_PROGRAM (val);
10212       if (VECTORP (val))
10213         val = Fcopy_sequence (val);
10214       ASET (attrs, coding_attr_ccl_decoder, val);
10215
10216       val = args[coding_arg_ccl_encoder];
10217       CHECK_CCL_PROGRAM (val);
10218       if (VECTORP (val))
10219         val = Fcopy_sequence (val);
10220       ASET (attrs, coding_attr_ccl_encoder, val);
10221
10222       val = args[coding_arg_ccl_valids];
10223       valids = Fmake_string (make_number (256), make_number (0));
10224       for (tail = val; CONSP (tail); tail = XCDR (tail))
10225         {
10226           int from, to;
10227
10228           val = XCAR (tail);
10229           if (INTEGERP (val))
10230             {
10231               if (! (0 <= XINT (val) && XINT (val) <= 255))
10232                 args_out_of_range_3 (val, make_number (0), make_number (255));
10233               from = to = XINT (val);
10234             }
10235           else
10236             {
10237               CHECK_CONS (val);
10238               CHECK_NATNUM_CAR (val);
10239               CHECK_NUMBER_CDR (val);
10240               if (XINT (XCAR (val)) > 255)
10241                 args_out_of_range_3 (XCAR (val),
10242                                      make_number (0), make_number (255));
10243               from = XINT (XCAR (val));
10244               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10245                 args_out_of_range_3 (XCDR (val),
10246                                      XCAR (val), make_number (255));
10247               to = XINT (XCDR (val));
10248             }
10249           for (i = from; i <= to; i++)
10250             SSET (valids, i, 1);
10251         }
10252       ASET (attrs, coding_attr_ccl_valids, valids);
10253
10254       category = coding_category_ccl;
10255     }
10256   else if (EQ (coding_type, Qutf_16))
10257     {
10258       Lisp_Object bom, endian;
10259
10260       ASET (attrs, coding_attr_ascii_compat, Qnil);
10261
10262       if (nargs < coding_arg_utf16_max)
10263         goto short_args;
10264
10265       bom = args[coding_arg_utf16_bom];
10266       if (! NILP (bom) && ! EQ (bom, Qt))
10267         {
10268           CHECK_CONS (bom);
10269           val = XCAR (bom);
10270           CHECK_CODING_SYSTEM (val);
10271           val = XCDR (bom);
10272           CHECK_CODING_SYSTEM (val);
10273         }
10274       ASET (attrs, coding_attr_utf_bom, bom);
10275
10276       endian = args[coding_arg_utf16_endian];
10277       CHECK_SYMBOL (endian);
10278       if (NILP (endian))
10279         endian = Qbig;
10280       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10281         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10282       ASET (attrs, coding_attr_utf_16_endian, endian);
10283
10284       category = (CONSP (bom)
10285                   ? coding_category_utf_16_auto
10286                   : NILP (bom)
10287                   ? (EQ (endian, Qbig)
10288                      ? coding_category_utf_16_be_nosig
10289                      : coding_category_utf_16_le_nosig)
10290                   : (EQ (endian, Qbig)
10291                      ? coding_category_utf_16_be
10292                      : coding_category_utf_16_le));
10293     }
10294   else if (EQ (coding_type, Qiso_2022))
10295     {
10296       Lisp_Object initial, reg_usage, request, flags;
10297
10298       if (nargs < coding_arg_iso2022_max)
10299         goto short_args;
10300
10301       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10302       CHECK_VECTOR (initial);
10303       for (i = 0; i < 4; i++)
10304         {
10305           val = AREF (initial, i);
10306           if (! NILP (val))
10307             {
10308               struct charset *charset;
10309
10310               CHECK_CHARSET_GET_CHARSET (val, charset);
10311               ASET (initial, i, make_number (CHARSET_ID (charset)));
10312               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10313                 ASET (attrs, coding_attr_ascii_compat, Qt);
10314             }
10315           else
10316             ASET (initial, i, make_number (-1));
10317         }
10318
10319       reg_usage = args[coding_arg_iso2022_reg_usage];
10320       CHECK_CONS (reg_usage);
10321       CHECK_NUMBER_CAR (reg_usage);
10322       CHECK_NUMBER_CDR (reg_usage);
10323
10324       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10325       for (tail = request; CONSP (tail); tail = XCDR (tail))
10326         {
10327           int id;
10328           Lisp_Object tmp1;
10329
10330           val = XCAR (tail);
10331           CHECK_CONS (val);
10332           tmp1 = XCAR (val);
10333           CHECK_CHARSET_GET_ID (tmp1, id);
10334           CHECK_NATNUM_CDR (val);
10335           if (XINT (XCDR (val)) >= 4)
10336             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10337           XSETCAR (val, make_number (id));
10338         }
10339
10340       flags = args[coding_arg_iso2022_flags];
10341       CHECK_NATNUM (flags);
10342       i = XINT (flags) & INT_MAX;
10343       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10344         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10345       flags = make_number (i);
10346
10347       ASET (attrs, coding_attr_iso_initial, initial);
10348       ASET (attrs, coding_attr_iso_usage, reg_usage);
10349       ASET (attrs, coding_attr_iso_request, request);
10350       ASET (attrs, coding_attr_iso_flags, flags);
10351       setup_iso_safe_charsets (attrs);
10352
10353       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10354         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10355                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10356                     ? coding_category_iso_7_else
10357                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10358                     ? coding_category_iso_7
10359                     : coding_category_iso_7_tight);
10360       else
10361         {
10362           int id = XINT (AREF (initial, 1));
10363
10364           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10365                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10366                        || id < 0)
10367                       ? coding_category_iso_8_else
10368                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10369                       ? coding_category_iso_8_1
10370                       : coding_category_iso_8_2);
10371         }
10372       if (category != coding_category_iso_8_1
10373           && category != coding_category_iso_8_2)
10374         ASET (attrs, coding_attr_ascii_compat, Qnil);
10375     }
10376   else if (EQ (coding_type, Qemacs_mule))
10377     {
10378       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10379         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10380       ASET (attrs, coding_attr_ascii_compat, Qt);
10381       category = coding_category_emacs_mule;
10382     }
10383   else if (EQ (coding_type, Qshift_jis))
10384     {
10385
10386       struct charset *charset;
10387
10388       if (XINT (Flength (charset_list)) != 3
10389           && XINT (Flength (charset_list)) != 4)
10390         error ("There should be three or four charsets");
10391
10392       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10393       if (CHARSET_DIMENSION (charset) != 1)
10394         error ("Dimension of charset %s is not one",
10395                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10396       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10397         ASET (attrs, coding_attr_ascii_compat, Qt);
10398
10399       charset_list = XCDR (charset_list);
10400       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10401       if (CHARSET_DIMENSION (charset) != 1)
10402         error ("Dimension of charset %s is not one",
10403                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10404
10405       charset_list = XCDR (charset_list);
10406       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10407       if (CHARSET_DIMENSION (charset) != 2)
10408         error ("Dimension of charset %s is not two",
10409                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10410
10411       charset_list = XCDR (charset_list);
10412       if (! NILP (charset_list))
10413         {
10414           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10415           if (CHARSET_DIMENSION (charset) != 2)
10416             error ("Dimension of charset %s is not two",
10417                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10418         }
10419
10420       category = coding_category_sjis;
10421       Vsjis_coding_system = name;
10422     }
10423   else if (EQ (coding_type, Qbig5))
10424     {
10425       struct charset *charset;
10426
10427       if (XINT (Flength (charset_list)) != 2)
10428         error ("There should be just two charsets");
10429
10430       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10431       if (CHARSET_DIMENSION (charset) != 1)
10432         error ("Dimension of charset %s is not one",
10433                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10434       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10435         ASET (attrs, coding_attr_ascii_compat, Qt);
10436
10437       charset_list = XCDR (charset_list);
10438       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10439       if (CHARSET_DIMENSION (charset) != 2)
10440         error ("Dimension of charset %s is not two",
10441                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10442
10443       category = coding_category_big5;
10444       Vbig5_coding_system = name;
10445     }
10446   else if (EQ (coding_type, Qraw_text))
10447     {
10448       category = coding_category_raw_text;
10449       ASET (attrs, coding_attr_ascii_compat, Qt);
10450     }
10451   else if (EQ (coding_type, Qutf_8))
10452     {
10453       Lisp_Object bom;
10454
10455       if (nargs < coding_arg_utf8_max)
10456         goto short_args;
10457
10458       bom = args[coding_arg_utf8_bom];
10459       if (! NILP (bom) && ! EQ (bom, Qt))
10460         {
10461           CHECK_CONS (bom);
10462           val = XCAR (bom);
10463           CHECK_CODING_SYSTEM (val);
10464           val = XCDR (bom);
10465           CHECK_CODING_SYSTEM (val);
10466         }
10467       ASET (attrs, coding_attr_utf_bom, bom);
10468       if (NILP (bom))
10469         ASET (attrs, coding_attr_ascii_compat, Qt);
10470
10471       category = (CONSP (bom) ? coding_category_utf_8_auto
10472                   : NILP (bom) ? coding_category_utf_8_nosig
10473                   : coding_category_utf_8_sig);
10474     }
10475   else if (EQ (coding_type, Qundecided))
10476     {
10477       if (nargs < coding_arg_undecided_max)
10478         goto short_args;
10479       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10480             args[coding_arg_undecided_inhibit_null_byte_detection]);
10481       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10482             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10483       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10484             args[coding_arg_undecided_prefer_utf_8]);
10485       category = coding_category_undecided;
10486     }
10487   else
10488     error ("Invalid coding system type: %s",
10489            SDATA (SYMBOL_NAME (coding_type)));
10490
10491   ASET (attrs, coding_attr_category, make_number (category));
10492   ASET (attrs, coding_attr_plist,
10493         Fcons (QCcategory,
10494                Fcons (AREF (Vcoding_category_table, category),
10495                       CODING_ATTR_PLIST (attrs))));
10496   ASET (attrs, coding_attr_plist,
10497         Fcons (QCascii_compatible_p,
10498                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10499                       CODING_ATTR_PLIST (attrs))));
10500
10501   eol_type = args[coding_arg_eol_type];
10502   if (! NILP (eol_type)
10503       && ! EQ (eol_type, Qunix)
10504       && ! EQ (eol_type, Qdos)
10505       && ! EQ (eol_type, Qmac))
10506     error ("Invalid eol-type");
10507
10508   aliases = list1 (name);
10509
10510   if (NILP (eol_type))
10511     {
10512       eol_type = make_subsidiaries (name);
10513       for (i = 0; i < 3; i++)
10514         {
10515           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10516
10517           this_name = AREF (eol_type, i);
10518           this_aliases = list1 (this_name);
10519           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10520           this_spec = make_uninit_vector (3);
10521           ASET (this_spec, 0, attrs);
10522           ASET (this_spec, 1, this_aliases);
10523           ASET (this_spec, 2, this_eol_type);
10524           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10525           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10526           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10527           if (NILP (val))
10528             Vcoding_system_alist
10529               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10530                        Vcoding_system_alist);
10531         }
10532     }
10533
10534   spec_vec = make_uninit_vector (3);
10535   ASET (spec_vec, 0, attrs);
10536   ASET (spec_vec, 1, aliases);
10537   ASET (spec_vec, 2, eol_type);
10538
10539   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10540   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10541   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10542   if (NILP (val))
10543     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10544                                   Vcoding_system_alist);
10545
10546   {
10547     int id = coding_categories[category].id;
10548
10549     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10550       setup_coding_system (name, &coding_categories[category]);
10551   }
10552
10553   return Qnil;
10554
10555  short_args:
10556   return Fsignal (Qwrong_number_of_arguments,
10557                   Fcons (intern ("define-coding-system-internal"),
10558                          make_number (nargs)));
10559 }
10560
10561
10562 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10563        3, 3, 0,
10564        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10565   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10566 {
10567   Lisp_Object spec, attrs;
10568
10569   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10570   attrs = AREF (spec, 0);
10571   if (EQ (prop, QCmnemonic))
10572     {
10573       if (! STRINGP (val))
10574         CHECK_CHARACTER (val);
10575       ASET (attrs, coding_attr_mnemonic, val);
10576     }
10577   else if (EQ (prop, QCdefault_char))
10578     {
10579       if (NILP (val))
10580         val = make_number (' ');
10581       else
10582         CHECK_CHARACTER (val);
10583       ASET (attrs, coding_attr_default_char, val);
10584     }
10585   else if (EQ (prop, QCdecode_translation_table))
10586     {
10587       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10588         CHECK_SYMBOL (val);
10589       ASET (attrs, coding_attr_decode_tbl, val);
10590     }
10591   else if (EQ (prop, QCencode_translation_table))
10592     {
10593       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10594         CHECK_SYMBOL (val);
10595       ASET (attrs, coding_attr_encode_tbl, val);
10596     }
10597   else if (EQ (prop, QCpost_read_conversion))
10598     {
10599       CHECK_SYMBOL (val);
10600       ASET (attrs, coding_attr_post_read, val);
10601     }
10602   else if (EQ (prop, QCpre_write_conversion))
10603     {
10604       CHECK_SYMBOL (val);
10605       ASET (attrs, coding_attr_pre_write, val);
10606     }
10607   else if (EQ (prop, QCascii_compatible_p))
10608     {
10609       ASET (attrs, coding_attr_ascii_compat, val);
10610     }
10611
10612   ASET (attrs, coding_attr_plist,
10613         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10614   return val;
10615 }
10616
10617
10618 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10619        Sdefine_coding_system_alias, 2, 2, 0,
10620        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10621   (Lisp_Object alias, Lisp_Object coding_system)
10622 {
10623   Lisp_Object spec, aliases, eol_type, val;
10624
10625   CHECK_SYMBOL (alias);
10626   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10627   aliases = AREF (spec, 1);
10628   /* ALIASES should be a list of length more than zero, and the first
10629      element is a base coding system.  Append ALIAS at the tail of the
10630      list.  */
10631   while (!NILP (XCDR (aliases)))
10632     aliases = XCDR (aliases);
10633   XSETCDR (aliases, list1 (alias));
10634
10635   eol_type = AREF (spec, 2);
10636   if (VECTORP (eol_type))
10637     {
10638       Lisp_Object subsidiaries;
10639       int i;
10640
10641       subsidiaries = make_subsidiaries (alias);
10642       for (i = 0; i < 3; i++)
10643         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10644                                      AREF (eol_type, i));
10645     }
10646
10647   Fputhash (alias, spec, Vcoding_system_hash_table);
10648   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10649   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10650   if (NILP (val))
10651     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10652                                   Vcoding_system_alist);
10653
10654   return Qnil;
10655 }
10656
10657 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10658        1, 1, 0,
10659        doc: /* Return the base of CODING-SYSTEM.
10660 Any alias or subsidiary coding system is not a base coding system.  */)
10661   (Lisp_Object coding_system)
10662 {
10663   Lisp_Object spec, attrs;
10664
10665   if (NILP (coding_system))
10666     return (Qno_conversion);
10667   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10668   attrs = AREF (spec, 0);
10669   return CODING_ATTR_BASE_NAME (attrs);
10670 }
10671
10672 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10673        1, 1, 0,
10674        doc: /* Return the property list of CODING-SYSTEM.  */)
10675   (Lisp_Object coding_system)
10676 {
10677   Lisp_Object spec, attrs;
10678
10679   if (NILP (coding_system))
10680     coding_system = Qno_conversion;
10681   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10682   attrs = AREF (spec, 0);
10683   return CODING_ATTR_PLIST (attrs);
10684 }
10685
10686
10687 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10688        1, 1, 0,
10689        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10690   (Lisp_Object coding_system)
10691 {
10692   Lisp_Object spec;
10693
10694   if (NILP (coding_system))
10695     coding_system = Qno_conversion;
10696   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10697   return AREF (spec, 1);
10698 }
10699
10700 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10701        Scoding_system_eol_type, 1, 1, 0,
10702        doc: /* Return eol-type of CODING-SYSTEM.
10703 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10704
10705 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10706 and CR respectively.
10707
10708 A vector value indicates that a format of end-of-line should be
10709 detected automatically.  Nth element of the vector is the subsidiary
10710 coding system whose eol-type is N.  */)
10711   (Lisp_Object coding_system)
10712 {
10713   Lisp_Object spec, eol_type;
10714   int n;
10715
10716   if (NILP (coding_system))
10717     coding_system = Qno_conversion;
10718   if (! CODING_SYSTEM_P (coding_system))
10719     return Qnil;
10720   spec = CODING_SYSTEM_SPEC (coding_system);
10721   eol_type = AREF (spec, 2);
10722   if (VECTORP (eol_type))
10723     return Fcopy_sequence (eol_type);
10724   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10725   return make_number (n);
10726 }
10727
10728 #endif /* emacs */
10729
10730 \f
/*** 12. Postamble ***/
10732
10733 void
10734 init_coding_once (void)
10735 {
10736   int i;
10737
10738   for (i = 0; i < coding_category_max; i++)
10739     {
10740       coding_categories[i].id = -1;
10741       coding_priorities[i] = i;
10742     }
10743
10744   /* ISO2022 specific initialize routine.  */
10745   for (i = 0; i < 0x20; i++)
10746     iso_code_class[i] = ISO_control_0;
10747   for (i = 0x21; i < 0x7F; i++)
10748     iso_code_class[i] = ISO_graphic_plane_0;
10749   for (i = 0x80; i < 0xA0; i++)
10750     iso_code_class[i] = ISO_control_1;
10751   for (i = 0xA1; i < 0xFF; i++)
10752     iso_code_class[i] = ISO_graphic_plane_1;
10753   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10754   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10755   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10756   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10757   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10758   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10759   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10760   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10761   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10762
10763   for (i = 0; i < 256; i++)
10764     {
10765       emacs_mule_bytes[i] = 1;
10766     }
10767   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10768   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10769   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10770   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10771 }
10772
10773 #ifdef emacs
10774
10775 void
10776 syms_of_coding (void)
10777 {
10778   staticpro (&Vcoding_system_hash_table);
10779   {
10780     Lisp_Object args[2];
10781     args[0] = QCtest;
10782     args[1] = Qeq;
10783     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10784   }
10785
10786   staticpro (&Vsjis_coding_system);
10787   Vsjis_coding_system = Qnil;
10788
10789   staticpro (&Vbig5_coding_system);
10790   Vbig5_coding_system = Qnil;
10791
10792   staticpro (&Vcode_conversion_reused_workbuf);
10793   Vcode_conversion_reused_workbuf = Qnil;
10794
10795   staticpro (&Vcode_conversion_workbuf_name);
10796   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10797
10798   reused_workbuf_in_use = 0;
10799
10800   DEFSYM (Qcharset, "charset");
10801   DEFSYM (Qtarget_idx, "target-idx");
10802   DEFSYM (Qcoding_system_history, "coding-system-history");
10803   Fset (Qcoding_system_history, Qnil);
10804
10805   /* Target FILENAME is the first argument.  */
10806   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10807   /* Target FILENAME is the third argument.  */
10808   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10809
10810   DEFSYM (Qcall_process, "call-process");
10811   /* Target PROGRAM is the first argument.  */
10812   Fput (Qcall_process, Qtarget_idx, make_number (0));
10813
10814   DEFSYM (Qcall_process_region, "call-process-region");
10815   /* Target PROGRAM is the third argument.  */
10816   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10817
10818   DEFSYM (Qstart_process, "start-process");
10819   /* Target PROGRAM is the third argument.  */
10820   Fput (Qstart_process, Qtarget_idx, make_number (2));
10821
10822   DEFSYM (Qopen_network_stream, "open-network-stream");
10823   /* Target SERVICE is the fourth argument.  */
10824   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10825
10826   DEFSYM (Qcoding_system, "coding-system");
10827   DEFSYM (Qcoding_aliases, "coding-aliases");
10828
10829   DEFSYM (Qeol_type, "eol-type");
10830   DEFSYM (Qunix, "unix");
10831   DEFSYM (Qdos, "dos");
10832   DEFSYM (Qmac, "mac");
10833
10834   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10835   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10836   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10837   DEFSYM (Qdefault_char, "default-char");
10838   DEFSYM (Qundecided, "undecided");
10839   DEFSYM (Qno_conversion, "no-conversion");
10840   DEFSYM (Qraw_text, "raw-text");
10841
10842   DEFSYM (Qiso_2022, "iso-2022");
10843
10844   DEFSYM (Qutf_8, "utf-8");
10845   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10846
10847 #if defined (WINDOWSNT) || defined (CYGWIN)
10848   /* No, not utf-16-le: that one has a BOM.  */
10849   DEFSYM (Qutf_16le, "utf-16le");
10850 #endif
10851
10852   DEFSYM (Qutf_16, "utf-16");
10853   DEFSYM (Qbig, "big");
10854   DEFSYM (Qlittle, "little");
10855
10856   DEFSYM (Qshift_jis, "shift-jis");
10857   DEFSYM (Qbig5, "big5");
10858
10859   DEFSYM (Qcoding_system_p, "coding-system-p");
10860
10861   /* Error signaled when there's a problem with detecting a coding system.  */
10862   DEFSYM (Qcoding_system_error, "coding-system-error");
10863   Fput (Qcoding_system_error, Qerror_conditions,
10864         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10865   Fput (Qcoding_system_error, Qerror_message,
10866         build_pure_c_string ("Invalid coding system"));
10867
10868   DEFSYM (Qtranslation_table, "translation-table");
10869   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10870   DEFSYM (Qtranslation_table_id, "translation-table-id");
10871   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10872   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10873
10874   DEFSYM (Qvalid_codes, "valid-codes");
10875
10876   /* Coding system emacs-mule and raw-text are for converting only
10877      end-of-line format.  */
10878   DEFSYM (Qemacs_mule, "emacs-mule");
10879
10880   DEFSYM (QCcategory, ":category");
10881   DEFSYM (QCmnemonic, ":mnemonic");
10882   DEFSYM (QCdefault_char, ":default-char");
10883   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10884   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10885   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10886   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10887   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10888
10889   Vcoding_category_table
10890     = Fmake_vector (make_number (coding_category_max), Qnil);
10891   staticpro (&Vcoding_category_table);
10892   /* Followings are target of code detection.  */
10893   ASET (Vcoding_category_table, coding_category_iso_7,
10894         intern_c_string ("coding-category-iso-7"));
10895   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10896         intern_c_string ("coding-category-iso-7-tight"));
10897   ASET (Vcoding_category_table, coding_category_iso_8_1,
10898         intern_c_string ("coding-category-iso-8-1"));
10899   ASET (Vcoding_category_table, coding_category_iso_8_2,
10900         intern_c_string ("coding-category-iso-8-2"));
10901   ASET (Vcoding_category_table, coding_category_iso_7_else,
10902         intern_c_string ("coding-category-iso-7-else"));
10903   ASET (Vcoding_category_table, coding_category_iso_8_else,
10904         intern_c_string ("coding-category-iso-8-else"));
10905   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10906         intern_c_string ("coding-category-utf-8-auto"));
10907   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10908         intern_c_string ("coding-category-utf-8"));
10909   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10910         intern_c_string ("coding-category-utf-8-sig"));
10911   ASET (Vcoding_category_table, coding_category_utf_16_be,
10912         intern_c_string ("coding-category-utf-16-be"));
10913   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10914         intern_c_string ("coding-category-utf-16-auto"));
10915   ASET (Vcoding_category_table, coding_category_utf_16_le,
10916         intern_c_string ("coding-category-utf-16-le"));
10917   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10918         intern_c_string ("coding-category-utf-16-be-nosig"));
10919   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10920         intern_c_string ("coding-category-utf-16-le-nosig"));
10921   ASET (Vcoding_category_table, coding_category_charset,
10922         intern_c_string ("coding-category-charset"));
10923   ASET (Vcoding_category_table, coding_category_sjis,
10924         intern_c_string ("coding-category-sjis"));
10925   ASET (Vcoding_category_table, coding_category_big5,
10926         intern_c_string ("coding-category-big5"));
10927   ASET (Vcoding_category_table, coding_category_ccl,
10928         intern_c_string ("coding-category-ccl"));
10929   ASET (Vcoding_category_table, coding_category_emacs_mule,
10930         intern_c_string ("coding-category-emacs-mule"));
10931   /* Followings are NOT target of code detection.  */
10932   ASET (Vcoding_category_table, coding_category_raw_text,
10933         intern_c_string ("coding-category-raw-text"));
10934   ASET (Vcoding_category_table, coding_category_undecided,
10935         intern_c_string ("coding-category-undecided"));
10936
10937   DEFSYM (Qinsufficient_source, "insufficient-source");
10938   DEFSYM (Qinvalid_source, "invalid-source");
10939   DEFSYM (Qinterrupted, "interrupted");
10940
10941   /* If a symbol has this property, evaluate the value to define the
10942      symbol as a coding system.  */
10943   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10944
10945   defsubr (&Scoding_system_p);
10946   defsubr (&Sread_coding_system);
10947   defsubr (&Sread_non_nil_coding_system);
10948   defsubr (&Scheck_coding_system);
10949   defsubr (&Sdetect_coding_region);
10950   defsubr (&Sdetect_coding_string);
10951   defsubr (&Sfind_coding_systems_region_internal);
10952   defsubr (&Sunencodable_char_position);
10953   defsubr (&Scheck_coding_systems_region);
10954   defsubr (&Sdecode_coding_region);
10955   defsubr (&Sencode_coding_region);
10956   defsubr (&Sdecode_coding_string);
10957   defsubr (&Sencode_coding_string);
10958   defsubr (&Sdecode_sjis_char);
10959   defsubr (&Sencode_sjis_char);
10960   defsubr (&Sdecode_big5_char);
10961   defsubr (&Sencode_big5_char);
10962   defsubr (&Sset_terminal_coding_system_internal);
10963   defsubr (&Sset_safe_terminal_coding_system_internal);
10964   defsubr (&Sterminal_coding_system);
10965   defsubr (&Sset_keyboard_coding_system_internal);
10966   defsubr (&Skeyboard_coding_system);
10967   defsubr (&Sfind_operation_coding_system);
10968   defsubr (&Sset_coding_system_priority);
10969   defsubr (&Sdefine_coding_system_internal);
10970   defsubr (&Sdefine_coding_system_alias);
10971   defsubr (&Scoding_system_put);
10972   defsubr (&Scoding_system_base);
10973   defsubr (&Scoding_system_plist);
10974   defsubr (&Scoding_system_aliases);
10975   defsubr (&Scoding_system_eol_type);
10976   defsubr (&Scoding_system_priority_list);
10977
10978   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10979                doc: /* List of coding systems.
10980
10981 Do not alter the value of this variable manually.  This variable should be
10982 updated by the functions `define-coding-system' and
10983 `define-coding-system-alias'.  */);
10984   Vcoding_system_list = Qnil;
10985
10986   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10987                doc: /* Alist of coding system names.
10988 Each element is one element list of coding system name.
10989 This variable is given to `completing-read' as COLLECTION argument.
10990
10991 Do not alter the value of this variable manually.  This variable should be
10992 updated by the functions `make-coding-system' and
10993 `define-coding-system-alias'.  */);
10994   Vcoding_system_alist = Qnil;
10995
10996   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10997                doc: /* List of coding-categories (symbols) ordered by priority.
10998
10999 On detecting a coding system, Emacs tries code detection algorithms
11000 associated with each coding-category one by one in this order.  When
11001 one algorithm agrees with a byte sequence of source text, the coding
11002 system bound to the corresponding coding-category is selected.
11003
11004 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11005   {
11006     int i;
11007
11008     Vcoding_category_list = Qnil;
11009     for (i = coding_category_max - 1; i >= 0; i--)
11010       Vcoding_category_list
11011         = Fcons (AREF (Vcoding_category_table, i),
11012                  Vcoding_category_list);
11013   }
11014
11015   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11016                doc: /* Specify the coding system for read operations.
11017 It is useful to bind this variable with `let', but do not set it globally.
11018 If the value is a coding system, it is used for decoding on read operation.
11019 If not, an appropriate element is used from one of the coding system alists.
11020 There are three such tables: `file-coding-system-alist',
11021 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11022   Vcoding_system_for_read = Qnil;
11023
11024   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11025                doc: /* Specify the coding system for write operations.
11026 Programs bind this variable with `let', but you should not set it globally.
11027 If the value is a coding system, it is used for encoding of output,
11028 when writing it to a file and when sending it to a file or subprocess.
11029
11030 If this does not specify a coding system, an appropriate element
11031 is used from one of the coding system alists.
11032 There are three such tables: `file-coding-system-alist',
11033 `process-coding-system-alist', and `network-coding-system-alist'.
11034 For output to files, if the above procedure does not specify a coding system,
11035 the value of `buffer-file-coding-system' is used.  */);
11036   Vcoding_system_for_write = Qnil;
11037
11038   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11039                doc: /*
11040 Coding system used in the latest file or process I/O.  */);
11041   Vlast_coding_system_used = Qnil;
11042
11043   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11044                doc: /*
11045 Error status of the last code conversion.
11046
11047 When an error was detected in the last code conversion, this variable
11048 is set to one of the following symbols.
11049   `insufficient-source'
11050   `inconsistent-eol'
11051   `invalid-source'
11052   `interrupted'
11053   `insufficient-memory'
11054 When no error was detected, the value doesn't change.  So, to check
11055 the error status of a code conversion by this variable, you must
11056 explicitly set this variable to nil before performing code
11057 conversion.  */);
11058   Vlast_code_conversion_error = Qnil;
11059
11060   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11061                doc: /*
11062 *Non-nil means always inhibit code conversion of end-of-line format.
11063 See info node `Coding Systems' and info node `Text and Binary' concerning
11064 such conversion.  */);
11065   inhibit_eol_conversion = 0;
11066
11067   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11068                doc: /*
11069 Non-nil means process buffer inherits coding system of process output.
11070 Bind it to t if the process output is to be treated as if it were a file
11071 read from some filesystem.  */);
11072   inherit_process_coding_system = 0;
11073
11074   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11075                doc: /*
11076 Alist to decide a coding system to use for a file I/O operation.
11077 The format is ((PATTERN . VAL) ...),
11078 where PATTERN is a regular expression matching a file name,
11079 VAL is a coding system, a cons of coding systems, or a function symbol.
11080 If VAL is a coding system, it is used for both decoding and encoding
11081 the file contents.
11082 If VAL is a cons of coding systems, the car part is used for decoding,
11083 and the cdr part is used for encoding.
11084 If VAL is a function symbol, the function must return a coding system
11085 or a cons of coding systems which are used as above.  The function is
11086 called with an argument that is a list of the arguments with which
11087 `find-operation-coding-system' was called.  If the function can't decide
11088 a coding system, it can return `undecided' so that the normal
11089 code-detection is performed.
11090
11091 See also the function `find-operation-coding-system'
11092 and the variable `auto-coding-alist'.  */);
11093   Vfile_coding_system_alist = Qnil;
11094
11095   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11096                doc: /*
11097 Alist to decide a coding system to use for a process I/O operation.
11098 The format is ((PATTERN . VAL) ...),
11099 where PATTERN is a regular expression matching a program name,
11100 VAL is a coding system, a cons of coding systems, or a function symbol.
11101 If VAL is a coding system, it is used for both decoding what received
11102 from the program and encoding what sent to the program.
11103 If VAL is a cons of coding systems, the car part is used for decoding,
11104 and the cdr part is used for encoding.
11105 If VAL is a function symbol, the function must return a coding system
11106 or a cons of coding systems which are used as above.
11107
11108 See also the function `find-operation-coding-system'.  */);
11109   Vprocess_coding_system_alist = Qnil;
11110
11111   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11112                doc: /*
11113 Alist to decide a coding system to use for a network I/O operation.
11114 The format is ((PATTERN . VAL) ...),
11115 where PATTERN is a regular expression matching a network service name
11116 or is a port number to connect to,
11117 VAL is a coding system, a cons of coding systems, or a function symbol.
11118 If VAL is a coding system, it is used for both decoding what received
11119 from the network stream and encoding what sent to the network stream.
11120 If VAL is a cons of coding systems, the car part is used for decoding,
11121 and the cdr part is used for encoding.
11122 If VAL is a function symbol, the function must return a coding system
11123 or a cons of coding systems which are used as above.
11124
11125 See also the function `find-operation-coding-system'.  */);
11126   Vnetwork_coding_system_alist = Qnil;
11127
11128   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11129                doc: /* Coding system to use with system messages.
11130 Also used for decoding keyboard input on X Window system.  */);
11131   Vlocale_coding_system = Qnil;
11132
11133   /* The eol mnemonics are reset in startup.el system-dependently.  */
11134   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11135                doc: /*
11136 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11137   eol_mnemonic_unix = build_pure_c_string (":");
11138
11139   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11140                doc: /*
11141 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11142   eol_mnemonic_dos = build_pure_c_string ("\\");
11143
11144   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11145                doc: /*
11146 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11147   eol_mnemonic_mac = build_pure_c_string ("/");
11148
11149   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11150                doc: /*
11151 *String displayed in mode line when end-of-line format is not yet determined.  */);
11152   eol_mnemonic_undecided = build_pure_c_string (":");
11153
11154   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11155                doc: /*
11156 *Non-nil enables character translation while encoding and decoding.  */);
11157   Venable_character_translation = Qt;
11158
11159   DEFVAR_LISP ("standard-translation-table-for-decode",
11160                Vstandard_translation_table_for_decode,
11161                doc: /* Table for translating characters while decoding.  */);
11162   Vstandard_translation_table_for_decode = Qnil;
11163
11164   DEFVAR_LISP ("standard-translation-table-for-encode",
11165                Vstandard_translation_table_for_encode,
11166                doc: /* Table for translating characters while encoding.  */);
11167   Vstandard_translation_table_for_encode = Qnil;
11168
11169   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11170                doc: /* Alist of charsets vs revision numbers.
11171 While encoding, if a charset (car part of an element) is found,
11172 designate it with the escape sequence identifying revision (cdr part
11173 of the element).  */);
11174   Vcharset_revision_table = Qnil;
11175
11176   DEFVAR_LISP ("default-process-coding-system",
11177                Vdefault_process_coding_system,
11178                doc: /* Cons of coding systems used for process I/O by default.
11179 The car part is used for decoding a process output,
11180 the cdr part is used for encoding a text to be sent to a process.  */);
11181   Vdefault_process_coding_system = Qnil;
11182
11183   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11184                doc: /*
11185 Table of extra Latin codes in the range 128..159 (inclusive).
11186 This is a vector of length 256.
11187 If Nth element is non-nil, the existence of code N in a file
11188 \(or output of subprocess) doesn't prevent it to be detected as
11189 a coding system of ISO 2022 variant which has a flag
11190 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11191 or reading output of a subprocess.
11192 Only 128th through 159th elements have a meaning.  */);
11193   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11194
11195   DEFVAR_LISP ("select-safe-coding-system-function",
11196                Vselect_safe_coding_system_function,
11197                doc: /*
11198 Function to call to select safe coding system for encoding a text.
11199
11200 If set, this function is called to force a user to select a proper
11201 coding system which can encode the text in the case that a default
11202 coding system used in each operation can't encode the text.  The
11203 function should take care that the buffer is not modified while
11204 the coding system is being selected.
11205
11206 The default value is `select-safe-coding-system' (which see).  */);
11207   Vselect_safe_coding_system_function = Qnil;
11208
11209   DEFVAR_BOOL ("coding-system-require-warning",
11210                coding_system_require_warning,
11211                doc: /* Internal use only.
11212 If non-nil, on writing a file, `select-safe-coding-system-function' is
11213 called even if `coding-system-for-write' is non-nil.  The command
11214 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11215   coding_system_require_warning = 0;
11216
11217
11218   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11219                inhibit_iso_escape_detection,
11220                doc: /*
11221 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11222
11223 When Emacs reads text, it tries to detect how the text is encoded.
11224 This code detection is sensitive to escape sequences.  If Emacs sees
11225 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11226 of the ISO2022 encodings, and decodes text by the corresponding coding
11227 system (e.g. `iso-2022-7bit').
11228
11229 However, there may be a case that you want to read escape sequences in
11230 a file as is.  In such a case, you can set this variable to non-nil.
11231 Then the code detection will ignore any escape sequences, and no text is
11232 detected as encoded in some ISO-2022 encoding.  The result is that all
11233 escape sequences become visible in a buffer.
11234
11235 The default value is nil, and it is strongly recommended not to change
11236 it.  That is because many Emacs Lisp source files that contain
11237 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11238 in Emacs's distribution, and they won't be decoded correctly on
11239 reading if you suppress escape sequence detection.
11240
11241 The other way to read escape sequences in a file without decoding is
11242 to explicitly specify some coding system that doesn't use ISO-2022
11243 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11244   inhibit_iso_escape_detection = 0;
11245
11246   DEFVAR_BOOL ("inhibit-null-byte-detection",
11247                inhibit_null_byte_detection,
11248                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11249 By default, Emacs treats text containing null bytes as binary data and
11250 does not attempt to decode it.  The effect is as if you specified
11251 `no-conversion' for reading that text.
11252
11253 Set this to non-nil when a regular text happens to include null bytes.
11254 Examples are Index nodes of Info files and null-byte delimited output
11255 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11256 decode text as usual.  */);
11257   inhibit_null_byte_detection = 0;
11258
11259   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11260                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11261 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11262   disable_ascii_optimization = 0;
11263
11264   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11265                doc: /* Char table for translating self-inserting characters.
11266 This is applied to the result of input methods, not their input.
11267 See also `keyboard-translate-table'.
11268
11269 Use of this variable for character code unification was rendered
11270 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11271 internal character representation.  */);
11272     Vtranslation_table_for_input = Qnil;
11273
11274   {
11275     Lisp_Object args[coding_arg_undecided_max];
11276     Lisp_Object plist[16];
11277     int i;
11278
11279     for (i = 0; i < coding_arg_undecided_max; i++)
11280       args[i] = Qnil;
11281
11282     plist[0] = intern_c_string (":name");
11283     plist[1] = args[coding_arg_name] = Qno_conversion;
11284     plist[2] = intern_c_string (":mnemonic");
11285     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11286     plist[4] = intern_c_string (":coding-type");
11287     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11288     plist[6] = intern_c_string (":ascii-compatible-p");
11289     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11290     plist[8] = intern_c_string (":default-char");
11291     plist[9] = args[coding_arg_default_char] = make_number (0);
11292     plist[10] = intern_c_string (":for-unibyte");
11293     plist[11] = args[coding_arg_for_unibyte] = Qt;
11294     plist[12] = intern_c_string (":docstring");
11295     plist[13] = build_pure_c_string ("Do no conversion.\n\
11296 \n\
11297 When you visit a file with this coding, the file is read into a\n\
11298 unibyte buffer as is, thus each byte of a file is treated as a\n\
11299 character.");
11300     plist[14] = intern_c_string (":eol-type");
11301     plist[15] = args[coding_arg_eol_type] = Qunix;
11302     args[coding_arg_plist] = Flist (16, plist);
11303     Fdefine_coding_system_internal (coding_arg_max, args);
11304
11305     plist[1] = args[coding_arg_name] = Qundecided;
11306     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11307     plist[5] = args[coding_arg_coding_type] = Qundecided;
11308     /* This is already set.
11309        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11310     plist[8] = intern_c_string (":charset-list");
11311     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11312     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11313     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11314     plist[15] = args[coding_arg_eol_type] = Qnil;
11315     args[coding_arg_plist] = Flist (16, plist);
11316     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11317     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11318     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11319   }
11320
11321   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11322
11323   {
11324     int i;
11325
11326     for (i = 0; i < coding_category_max; i++)
11327       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11328   }
11329 #if defined (DOS_NT)
11330   system_eol_type = Qdos;
11331 #else
11332   system_eol_type = Qunix;
11333 #endif
11334   staticpro (&system_eol_type);
11335 }
11336
11337 char *
11338 emacs_strerror (int error_number)
11339 {
11340   char *str;
11341
11342   synchronize_system_messages_locale ();
11343   str = strerror (error_number);
11344
11345   if (! NILP (Vlocale_coding_system))
11346     {
11347       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11348                                                       Vlocale_coding_system,
11349                                                       0);
11350       str = SSDATA (dec);
11351     }
11352
11353   return str;
11354 }
11355
11356 #endif /* emacs */