src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2017 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
       /* Category bits to report as found if the whole source conforms.  */
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
           /* A merely conforming byte is not evidence; only a byte that
              is distinctive for XXX marks the category as found.  */
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
           /* Stop once CHARBUF is full; the test must be `>=', otherwise
              the loop would exit while there is still room.  */
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
       /* Characters to encode are CHARBUF .. CHARBUF_END (exclusive).  */
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Leave headroom for the most bytes one iteration can produce, so
          the loop condition alone guards against overrunning DST_END.  */
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
     /* NOTE(review): presumably the hash table that maps a coding system
        symbol to its attribute vector, as described in the general
        comments at the top of this file -- confirm against its users.  */
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 /* Format of end-of-line decided by system.  This is Qunix on
 305    Unix and Mac, Qdos on DOS/Windows.
 306    This has an effect only for external encoding (i.e. for output to
 307    file and process), not for in-buffer or Lisp string encoding.  */
 308 static Lisp_Object system_eol_type;
 309
 310 #ifdef emacs
 311
 312 /* Coding-systems are handed between Emacs Lisp programs and C internal
 313    routines by the following variable.  (NOTE(review): this comment
        used to announce three variables; only one is declared inside
        this #ifdef.)  */
 314 /* Coding system to be used to encode text for terminal display when
 315    terminal coding system is nil.  */
 316 struct coding_system safe_terminal_coding;
 317
 318 #endif /* emacs */
 319
 320 /* Two special coding systems.  */
 321 static Lisp_Object Vsjis_coding_system;
 322 static Lisp_Object Vbig5_coding_system;
 323
 324 /* ISO2022 section */
 325
     /* Return, as a C integer, element REG of the vector stored in the
        coding_attr_iso_initial slot of CODING's attribute vector.  */
 326 #define CODING_ISO_INITIAL(coding, reg)                 \
 327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 328                      coding_attr_iso_initial),          \
 329                reg)))
 330
 331
     /* Return CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID is
        greater than CODING->max_charset_id or the stored entry is 255
        (255 thus acts as the "no entry" marker).  CHARSET_ID is
        evaluated more than once.  */
 332 #define CODING_ISO_REQUEST(coding, charset_id)          \
 333   (((charset_id) <= (coding)->max_charset_id            \
 334     ? ((coding)->safe_charsets[charset_id] != 255       \
 335        ? (coding)->safe_charsets[charset_id]            \
 336        : -1)                                            \
 337     : -1))
 338
 339
     /* Accessors for the ISO2022 conversion state kept in
        CODING->spec.iso_2022.  Each expands to the named member (an
        lvalue), except CODING_ISO_INVOKED_CHARSET, which is computed, and
        CODING_ISO_CMP_STATUS, which yields the member's address.  */
 340 #define CODING_ISO_FLAGS(coding)        \
 341   ((coding)->spec.iso_2022.flags)
 342 #define CODING_ISO_DESIGNATION(coding, reg)     \
 343   ((coding)->spec.iso_2022.current_designation[reg])
 344 #define CODING_ISO_INVOCATION(coding, plane)    \
 345   ((coding)->spec.iso_2022.current_invocation[plane])
 346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 347   ((coding)->spec.iso_2022.single_shifting)
 348 #define CODING_ISO_BOL(coding)  \
 349   ((coding)->spec.iso_2022.bol)
     /* -1 if nothing is invoked to PLANE (its invocation entry is
        negative); otherwise the designation of the register currently
        invoked to PLANE.  */
 350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
 353 #define CODING_ISO_CMP_STATUS(coding)   \
 354   (&(coding)->spec.iso_2022.cmp_status)
 355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
 360 /* Control characters of ISO2022.  Each one has a class of its own in
        enum iso_code_class_type.  */
 361                         /* code */      /* function */
 362 #define ISO_CODE_SO     0x0E            /* shift-out */
 363 #define ISO_CODE_SI     0x0F            /* shift-in */
 364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 365 #define ISO_CODE_ESC    0x1B            /* escape */
 366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 369
 370 /* Every 1-byte code of ISO2022 is classified into one of the
 371    following classes.  */
 372 enum iso_code_class_type
 373   {
 374     ISO_control_0,              /* Control codes in the range
 375                                    0x00..0x1F and 0x7F, except for the
 376                                    following 4 codes.  */
 377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 381     ISO_control_1,              /* Control codes in the range
 382                                    0x80..0x9F, except for the
 383                                    following 3 codes.  */
 384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 391   };
 392
 393 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
 394     `iso-flags' attribute of an iso2022 coding system.  */
 395
 396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 397    instead of the correct short-form sequence (e.g. ESC $ A).  */
 398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 399
 400 /* If set, reset graphic planes and registers at end-of-line to the
 401    initial state.  */
 402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 403
 404 /* If set, reset graphic planes and registers before any control
 405    characters to the initial state.  */
 406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 407
 408 /* If set, encode by 7-bit environment.  */
 409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 410
 411 /* If set, use locking-shift function.  */
 412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 413
 414 /* If set, use single-shift function.  Overwrite
 415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 417
 418 /* If set, use designation escape sequence.  */
 419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 420
 421 /* If set, produce revision number sequence.  */
 422 #define CODING_ISO_FLAG_REVISION        0x0080
 423
 424 /* If set, produce ISO6429's direction specifying sequence.  */
 425 #define CODING_ISO_FLAG_DIRECTION       0x0100
 426
 427 /* If set, assume designation states are reset at beginning of line on
 428    output.  */
 429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 430
 431 /* If set, designation sequence should be placed at beginning of line
 432    on output.  */
 433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 434
 435 /* If set, do not encode unsafe characters on output.  */
 436 #define CODING_ISO_FLAG_SAFE            0x0800
 437
 438 /* If set, extra latin codes (128..159) are accepted as a valid code
 439    on input.  */
 440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 441
     /* NOTE(review): the remaining flags have no description here; take
        their meaning from the code that tests them.  Bit 0x4000 is held
        by the commented-out CODING_ISO_FLAG_EUC_TW_SHIFT, and bits
        0x40000 and 0x80000 are not assigned in this list.  */
 442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 443
 444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 445
 446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 447
 448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 449
 450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 451
 452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 453
 454 /* A character to be produced on output if encoding of the original
 455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
 458 /* UTF-8 section.  Accessor (lvalue) for CODING->spec.utf_8_bom.  */
 459 #define CODING_UTF_8_BOM(coding)        \
 460   ((coding)->spec.utf_8_bom)
 461
 462 /* UTF-16 section.  Accessors (lvalues) for the bom, endian and
        surrogate members of CODING->spec.utf_16.  */
 463 #define CODING_UTF_16_BOM(coding)       \
 464   ((coding)->spec.utf_16.bom)
 465
 466 #define CODING_UTF_16_ENDIAN(coding)    \
 467   ((coding)->spec.utf_16.endian)
 468
 469 #define CODING_UTF_16_SURROGATE(coding) \
 470   ((coding)->spec.utf_16.surrogate)
 471
 472
 473 /* CCL section.  These read slots of CODING's attribute vector:
        the ccl_decoder and ccl_encoder slots as Lisp objects, and the
        data pointer (SDATA) of the string in the ccl_valids slot.  */
 474 #define CODING_CCL_DECODER(coding)      \
 475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 476 #define CODING_CCL_ENCODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 478 #define CODING_CCL_VALIDS(coding)                                          \
 479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
 481 /* Index for each coding category in `coding_categories'.  The values
        also serve as bit positions for the CATEGORY_MASK_XXX macros, and
        coding_category_max is the size of the per-category tables, so
        the order and count of the enumerators matter.  */
 482
 483 enum coding_category
 484   {
 485     coding_category_iso_7,
 486     coding_category_iso_7_tight,
 487     coding_category_iso_8_1,
 488     coding_category_iso_8_2,
 489     coding_category_iso_7_else,
 490     coding_category_iso_8_else,
 491     coding_category_utf_8_auto,
 492     coding_category_utf_8_nosig,
 493     coding_category_utf_8_sig,
 494     coding_category_utf_16_auto,
 495     coding_category_utf_16_be,
 496     coding_category_utf_16_le,
 497     coding_category_utf_16_be_nosig,
 498     coding_category_utf_16_le_nosig,
 499     coding_category_charset,
 500     coding_category_sjis,
 501     coding_category_big5,
 502     coding_category_ccl,
 503     coding_category_emacs_mule,
 504     /* All above are targets of code detection.  */
 505     coding_category_raw_text,
 506     coding_category_undecided,
         /* Number of categories; not a category itself.  */
 507     coding_category_max
 508   };
 509
 510 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        the single bit whose position is the matching enum coding_category
        value.  */
 511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 531
 532 /* This value is returned if detect_coding_mask () finds nothing other
 533    than ASCII characters.  It is the union of the masks of all
        categories that are targets of code detection, so it does not
        include CATEGORY_MASK_RAW_TEXT.  */
 534 #define CATEGORY_MASK_ANY               \
 535   (CATEGORY_MASK_ISO_7                  \
 536    | CATEGORY_MASK_ISO_7_TIGHT          \
 537    | CATEGORY_MASK_ISO_8_1              \
 538    | CATEGORY_MASK_ISO_8_2              \
 539    | CATEGORY_MASK_ISO_7_ELSE           \
 540    | CATEGORY_MASK_ISO_8_ELSE           \
 541    | CATEGORY_MASK_UTF_8_AUTO           \
 542    | CATEGORY_MASK_UTF_8_NOSIG          \
 543    | CATEGORY_MASK_UTF_8_SIG            \
 544    | CATEGORY_MASK_UTF_16_AUTO          \
 545    | CATEGORY_MASK_UTF_16_BE            \
 546    | CATEGORY_MASK_UTF_16_LE            \
 547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 549    | CATEGORY_MASK_CHARSET              \
 550    | CATEGORY_MASK_SJIS                 \
 551    | CATEGORY_MASK_BIG5                 \
 552    | CATEGORY_MASK_CCL                  \
 553    | CATEGORY_MASK_EMACS_MULE)
 554
 555
     /* Convenience unions of the single-category masks above.  */
 556 #define CATEGORY_MASK_ISO_7BIT \
 557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 558
 559 #define CATEGORY_MASK_ISO_8BIT \
 560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 561
 562 #define CATEGORY_MASK_ISO_ELSE \
 563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 564
 565 #define CATEGORY_MASK_ISO_ESCAPE        \
 566   (CATEGORY_MASK_ISO_7                  \
 567    | CATEGORY_MASK_ISO_7_TIGHT          \
 568    | CATEGORY_MASK_ISO_7_ELSE           \
 569    | CATEGORY_MASK_ISO_8_ELSE)
 570
     /* All six ISO categories.  */
 571 #define CATEGORY_MASK_ISO       \
 572   (  CATEGORY_MASK_ISO_7BIT     \
 573      | CATEGORY_MASK_ISO_8BIT   \
 574      | CATEGORY_MASK_ISO_ELSE)
 575
     /* All five UTF-16 categories.  */
 576 #define CATEGORY_MASK_UTF_16            \
 577   (CATEGORY_MASK_UTF_16_AUTO            \
 578    | CATEGORY_MASK_UTF_16_BE            \
 579    | CATEGORY_MASK_UTF_16_LE            \
 580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 582
     /* All three UTF-8 categories.  */
 583 #define CATEGORY_MASK_UTF_8     \
 584   (CATEGORY_MASK_UTF_8_AUTO     \
 585    | CATEGORY_MASK_UTF_8_NOSIG  \
 586    | CATEGORY_MASK_UTF_8_SIG)
 587
 588 /* Table of coding categories (Lisp symbols).  This variable is for
 589    internal use only.  */
 590 static Lisp_Object Vcoding_category_table;
 591
 592 /* Table of coding-categories ordered by priority.  It has one slot
        per enum coding_category value.  */
 593 static enum coding_category coding_priorities[coding_category_max];
 594
 595 /* Nth element is a coding context for the coding system bound to the
 596    Nth coding category.  */
 597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.
        That is: return -1 if FLAG is nil, 1 if FLAG is t, and 0 for any
        other value.  The result is meant to be decoded by inhibit_flag.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.
        ENCODED_FLAG is expected to be a value produced by
        encode_inhibit_flag.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
       /* VAR contributes 0 or 1, so the sum is positive exactly when
          ENCODED_FLAG is 1, or ENCODED_FLAG is 0 and VAR is true.  */
 613   return 0 < encoded_flag + var;
 614 }
 615
     /* Set ATTRS to the attribute vector of CODING (looked up by
        CODING->id) and CHARSET_LIST to the charset list stored in those
        attributes.  ATTRS and CHARSET_LIST must be assignable lvalues;
        ATTRS is evaluated twice.  */
 616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 617   do {                                                  \
 618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 620   } while (0)
 621
     /* Apply CHECK_NATNUM to the car of the cons X, then store the
        checked value back into the car.  NOTE(review): presumably
        CHECK_NATNUM signals an error for anything but a non-negative
        integer -- confirm in lisp.h.  */
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
     /* Apply CHECK_NATNUM to the cdr of the cons X, then store the
        checked value back into the cdr.  NOTE(review): presumably
        CHECK_NATNUM signals an error for anything but a non-negative
        integer -- confirm in lisp.h.  */
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown, i.e. if
        CODING->dst_object is a Lisp string or a buffer.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
 645
 646
 647 /* Safely get one byte from the source text pointed by SRC which ends
 648    at SRC_END, and set C to that byte.  If there are not enough bytes
 649    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 650    and a multibyte character is found at SRC, set C to the
 651    negative value of the character code.  The caller should declare
 652    and set these variables appropriately in advance:
 653         src, src_end, src_base, multibytep, consumed_chars, coding

        Details:
        - When the source is exhausted and SRC_BASE < SRC (some bytes
          were already taken since SRC_BASE was set),
          CODING_RESULT_INSUFFICIENT_SRC is recorded before the jump.
        - If MULTIBYTEP and the byte has its high bit set: a lead byte of
          0xC0 or 0xC1 is merged with the following byte into
          ((C & 1) << 6) | next byte; any other lead byte is decoded with
          string_char, C becomes the negated character code, and
          CODING_RESULT_INVALID_SRC is recorded.
        - CONSUMED_CHARS is incremented once per successful use.
        NOTE(review): the byte following a 0xC0/0xC1 lead byte is read
        without comparing SRC against SRC_END.  */
 654
 655 #define ONE_MORE_BYTE(c)                                \
 656   do {                                                  \
 657     if (src == src_end)                                 \
 658       {                                                 \
 659         if (src_base < src)                             \
 660           record_conversion_result                      \
 661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 662         goto no_more_source;                            \
 663       }                                                 \
 664     c = *src++;                                         \
 665     if (multibytep && (c & 0x80))                       \
 666       {                                                 \
 667         if ((c & 0xFE) == 0xC0)                         \
 668           c = ((c & 1) << 6) | *src++;                  \
 669         else                                            \
 670           {                                             \
 671             src--;                                      \
 672             c = - string_char (src, &src, NULL);        \
 673             record_conversion_result                    \
 674               (coding, CODING_RESULT_INVALID_SRC);      \
 675           }                                             \
 676       }                                                 \
 677     consumed_chars++;                                   \
 678   } while (0)
 679
 680 /* Safely get two bytes from the source text pointed by SRC which ends
 681    at SRC_END, and set C1 and C2 to those bytes while skipping the
 682    heading multibyte characters.  If there are not enough bytes in the
 683    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 684    a multibyte character is found for C2, set C2 to -1.
 685    The caller should declare and set these
 686    variables appropriately in advance:
 687         src, src_end, multibytep
 688    It is intended that this macro is used in detect_coding_utf_16.

        In the MULTIBYTEP case a lead byte of 0xC0 or 0xC1 is merged with
        the following byte into ((C & 1) << 6) | next byte.  For C1, any
        other multibyte character is skipped over (using
        BYTES_BY_CHAR_HEAD) and the next byte is tried.  Unlike
        ONE_MORE_BYTE, this macro records no conversion result and does
        not count consumed characters.  */
 689
 690 #define TWO_MORE_BYTES(c1, c2)                          \
 691   do {                                                  \
 692     do {                                                \
 693       if (src == src_end)                               \
 694         goto no_more_source;                            \
 695       c1 = *src++;                                      \
 696       if (multibytep && (c1 & 0x80))                    \
 697         {                                               \
 698           if ((c1 & 0xFE) == 0xC0)                      \
 699             c1 = ((c1 & 1) << 6) | *src++;              \
 700           else                                          \
 701             {                                           \
 702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 703               c1 = -1;                                  \
 704             }                                           \
 705         }                                               \
 706     } while (c1 < 0);                                   \
 707     if (src == src_end)                                 \
 708       goto no_more_source;                              \
 709     c2 = *src++;                                        \
 710     if (multibytep && (c2 & 0x80))                      \
 711       {                                                 \
 712         if ((c2 & 0xFE) == 0xC0)                        \
 713           c2 = ((c2 & 1) << 6) | *src++;                \
 714         else                                            \
 715           c2 = -1;                                      \
 716       }                                                 \
 717   } while (0)
 718
 719
 720 /* Store a byte C in the place pointed by DST and increment DST to the
 721    next free point, and increment PRODUCED_CHARS.  The caller should
 722    assure that C is 0..127, and declare and set the variables `dst'
 723    and `produced_chars' appropriately in advance.  No check is made
        that there is room at DST.
 724 */
 725
 726
 727 #define EMIT_ONE_ASCII_BYTE(c)  \
 728   do {                          \
 729     produced_chars++;           \
 730     *dst++ = (c);               \
 731   } while (0)
 732
 733
 734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.
        PRODUCED_CHARS is increased by 2.  */
 735
 736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 737   do {                                  \
 738     produced_chars += 2;                \
 739     *dst++ = (c1), *dst++ = (c2);       \
 740   } while (0)
 741
 742
 743 /* Store a byte C in the place pointed by DST and increment DST to the
 744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 745    store in an appropriate multibyte form.  The caller should
 746    declare and set the variables `dst', `multibytep' and
 747    `produced_chars' appropriately in advance.

        In the MULTIBYTEP case a value of 0x80 or more is first mapped
        with BYTE8_TO_CHAR and the resulting character is written with
        CHAR_STRING_ADVANCE, so more than one byte may be stored while
        PRODUCED_CHARS still grows by one.  */
 748
 749 #define EMIT_ONE_BYTE(c)                \
 750   do {                                  \
 751     produced_chars++;                   \
 752     if (multibytep)                     \
 753       {                                 \
 754         unsigned ch = (c);              \
 755         if (ch >= 0x80)                 \
 756           ch = BYTE8_TO_CHAR (ch);      \
 757         CHAR_STRING_ADVANCE (ch, dst);  \
 758       }                                 \
 759     else                                \
 760       *dst++ = (c);                     \
 761   } while (0)
 762
 763
 764 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  PRODUCED_CHARS
        is increased by 2, and each byte is converted separately in the
        MULTIBYTEP case.  */
 765
 766 #define EMIT_TWO_BYTES(c1, c2)          \
 767   do {                                  \
 768     produced_chars += 2;                \
 769     if (multibytep)                     \
 770       {                                 \
 771         unsigned ch;                    \
 772                                         \
 773         ch = (c1);                      \
 774         if (ch >= 0x80)                 \
 775           ch = BYTE8_TO_CHAR (ch);      \
 776         CHAR_STRING_ADVANCE (ch, dst);  \
 777         ch = (c2);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781       }                                 \
 782     else                                \
 783       {                                 \
 784         *dst++ = (c1);                  \
 785         *dst++ = (c2);                  \
 786       }                                 \
 787   } while (0)
 788
 789
 790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 791   do {                                  \
 792     EMIT_ONE_BYTE (c1);                 \
 793     EMIT_TWO_BYTES (c2, c3);            \
 794   } while (0)
 795
 796
 797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 798   do {                                          \
 799     EMIT_TWO_BYTES (c1, c2);                    \
 800     EMIT_TWO_BYTES (c3, c4);                    \
 801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* If there are at least BYTES length of room at dst, allocate memory
 896    for coding->destination and update dst and dst_end.  We don't have
 897    to take care of coding->source which will be relocated.  It is
 898    handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
/* Store the multibyte form of the character C at P, and advance P to
   the end of that multibyte form.  This is now a plain alias for
   CHAR_STRING_ADVANCE: it used to be the variant that never called
   MAYBE_UNIFY_CHAR, but nowadays CHAR_STRING_ADVANCE doesn't call
   MAYBE_UNIFY_CHAR either.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
/* Return the character code of the character whose multibyte form is
   at P, and advance P to the end of that multibyte form.  This is now
   a plain alias for STRING_CHAR_ADVANCE: it used to be the variant
   that never called MAYBE_UNIFY_CHAR, but nowadays
   STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR either.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
/* Enlarge the gap of the destination buffer of CODING by BYTES bytes.
   GAP_HEAD_USED is the number of bytes at the head of the gap that
   must be stepped over (the caller visible here passes the distance
   from the gap start to its write pointer); it is used only when the
   source and the destination are the same object.  */

static void
coding_alloc_by_making_gap (struct coding_system *coding,
                            ptrdiff_t gap_head_used, ptrdiff_t bytes)
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
         consumed data at the tail.  To preserve those data, we at
         first make the gap size to zero, then increase the gap
         size.  */
      ptrdiff_t add = GAP_SIZE;

      /* Temporarily move the gap start past the produced data and
         account the whole old gap as text (GAP_SIZE zero, end
         positions raised by ADD) while make_gap runs.  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the temporary bookkeeping in reverse; the gap now has
         its old size plus whatever make_gap added.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    /* Different objects: nothing of ours lives in the gap.  */
    make_gap_1 (XBUFFER (coding->dst_object), bytes);
}
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* An annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  Of the header
   writers defined next to this macro, ADD_COMPOSITION_DATA stores the
   most elements: the three common ones plus NBYTES and METHOD.  */
#define MAX_ANNOTATION_LENGTH 5
1094
/* Write the three-element header common to every annotation at the
   int array position BUF, advancing BUF past it: -LEN, then MASK,
   then NCHARS.  Also set coding->annotated, so a variable `coding'
   must be in scope at the point of use.  BUF must be a modifiable
   lvalue; it is evaluated three times.

   The expansion deliberately does not end with a semicolon: the
   caller supplies it, which keeps the macro usable as the body of an
   unbraced `if' that is followed by `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1102
/* Write a composition annotation header at BUF, advancing BUF past
   it: the common header (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  BUF must be a modifiable
   lvalue.  Arguments are parenthesized the same way as in
   ADD_ANNOTATION_DATA so that BUF is incremented consistently
   whatever lvalue expression is passed.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1109
1110
/* Write a charset annotation at BUF, advancing BUF past it: the
   common header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  BUF must be a modifiable lvalue.
   Arguments are parenthesized the same way as in ADD_ANNOTATION_DATA
   so that BUF is incremented consistently whatever lvalue expression
   is passed.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Bitmasks for coding->eol_seen.  Each non-zero value is a distinct
   bit, so several may be or'ed together when a text mixes
   end-of-line styles.  */

#define EOL_SEEN_NONE   0       /* No end-of-line seen.  */
#define EOL_SEEN_LF     1       /* A '\n' was seen.  */
#define EOL_SEEN_CR     2       /* A '\r' not followed by '\n' was seen.  */
#define EOL_SEEN_CRLF   4       /* A '\r' followed by '\n' was seen.  */
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
/* Classify a byte value C by its high bits.  Note that
   UTF_8_1_OCTET_P is a plain comparison, so it is also true for any
   negative C.  */

/* C is below 0x80: a one-octet (ASCII) sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* High bits 10: a trailing octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* High bits 110: the leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* High bits 1110: the leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* High bits 11110: the leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* High bits 111110: the leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three bytes of the UTF-8 byte order mark, in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
/* Unlike the other detect_coding_XXX, this function counts the number
   of characters and checks the EOL format.

   Scan the source of CODING and record in DETECT_INFO whether it can
   be UTF-8.  Return 0, with CATEGORY_MASK_UTF_8 added to
   DETECT_INFO->rejected, when a malformed sequence is met or when
   the last block ends in the middle of a sequence.  Otherwise return
   1 after updating DETECT_INFO->found / ->rejected for the BOM and
   no-BOM variants and storing the scanned extent in
   CODING->detected_utf8_bytes and CODING->detected_utf8_chars.

   ONE_MORE_BYTE is defined outside this excerpt; presumably it
   fetches the next source byte (using `multibytep' and
   `consumed_chars') and jumps to `no_more_source' when the source is
   exhausted -- confirm against its definition.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  /* Start the character count at coding->head_ascii; the same number
     of bytes is skipped just below.  */
  ptrdiff_t nchars = coding->head_ascii;
  /* NOTE(review): eol_seen is accumulated in the loop below but is
     neither read nor stored back anywhere in this function -- confirm
     whether that is intended.  */
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += nchars;

  /* NOTE(review): `src + 3 < src_end' requires at least four bytes,
     so a source made of exactly the three BOM bytes does not take
     this branch -- confirm whether that is intended.  */
  if (src == coding->source     /* BOM should be at the head.  */
      && src + 3 < src_end      /* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      /* The BOM counts as one character.  */
      nchars++;
    }

  /* Each iteration consumes one complete sequence and counts it as
     one character.  Leaving the loop with `break' means a malformed
     sequence was found.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        {
          nchars++;
          if (c == '\r')
            {
              if (src < src_end && *src == '\n')
                {
                  /* CR LF: consume the LF too and count it.  */
                  eol_seen |= EOL_SEEN_CRLF;
                  src++;
                  nchars++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
          else if (c == '\n')
            eol_seen |= EOL_SEEN_LF;
          continue;
        }
      /* C is not a one-octet character: each further byte must be a
         trailing octet, and the leading octet decides how many are
         expected.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      /* Not a valid leading octet of any supported length.  */
      break;
    }
  /* Reached only through `break': the source is not UTF-8.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* src_base < src means the source ran out after part of a sequence
     had been read; in the last block that makes the text invalid.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
        /* The found characters are less than source bytes, which
           means that we found a valid non-ASCII characters.  */
        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  /* Record how far complete sequences extend: bytes up to the start
     of the last (possibly incomplete) one, and the character count.  */
  coding->detected_utf8_bytes = src_base - coding->source;
  coding->detected_utf8_chars = nchars;
  return 1;
}
1261
1262
/* Decode the UTF-8 source of CODING, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf
   from index CODING->charbuf_used.  Stop when the source is used up
   or charbuf is full.  On return CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used reflect what was
   fully processed.  Bytes that do not form an acceptable sequence
   are decoded one at a time, as themselves when ASCII_CHAR_P holds
   and via BYTE8_TO_CHAR otherwise.

   ONE_MORE_BYTE is defined outside this excerpt; presumably it
   fetches the next source byte (using `multibytep' and
   `consumed_chars') and jumps to `no_more_source' when the source is
   exhausted -- confirm against its definition.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when eol_dos, or -1 if none is
     pending.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      /* Look for the three BOM bytes at the current position.  On a
         match SRC is left just past them; on any mismatch SRC is put
         back to SRC_BASE so the bytes are decoded as ordinary text.
         NOTE(review): SRC is rewound on mismatch but consumed_chars
         is not, although the invalid_code path below restores both --
         confirm whether the count can be off here.  */
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    /* Redundant with the unconditional assignment
                       after this block.  */
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, do not look for a BOM again.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Checkpoint: everything before this point is fully decoded.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* charbuf is full.  If a byte was read ahead after a CR,
             report the source position as one byte earlier so that
             byte is not counted as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* Copy bytes without the high bit straight into charbuf,
             up to four per iteration, stopping at the first byte
             that has the high bit set.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* Take the byte read ahead after a CR first, if there is one.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value from ONE_MORE_BYTE is stored with its
             sign flipped (presumably a character ONE_MORE_BYTE
             already resolved -- confirm against its definition).  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS-style EOL, read the byte after a CR ahead; it
             becomes C1 of the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          /* Multi-octet sequence: every further byte must be a
             trailing octet, and C1 decides the sequence length.  */
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and decode
         just its first byte on its own; the following bytes are
         examined afresh by the next iterations.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
    }

 no_more_source:
  /* Report progress up to the last checkpoint only, so that a
     sequence cut short by the end of the source is not counted as
     consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1450
1451
1452 static bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
1515 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1516    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1517
/* True if the bits of VAL selected by the mask 0xFC00 equal 0xD800;
   for a 16-bit VAL that is the range 0xD800..0xDBFF.  Bits above the
   low 16 are ignored by the mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the bits of VAL selected by the mask 0xFC00 equal 0xDC00;
   for a 16-bit VAL that is the range 0xDC00..0xDFFF.  Bits above the
   low 16 are ignored by the mask.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1523
1524
/* Examine the source of CODING and record in DETECT_INFO which
   UTF-16 categories it can or cannot belong to.

   If the first two bytes are FF FE or FE FF, mark the matching
   endianness and the AUTO category as found, reject the others, and
   return 1.  Without such a signature, reject the with-signature
   categories, tally the distinct byte values seen at even and odd
   positions to decide about the no-signature categories, and return
   0 -- unless the source runs out first, see below.  Also return 0,
   with every UTF-16 category rejected, when the last block has an
   odd CODING->src_chars or the second byte read is negative.

   TWO_MORE_BYTES is defined outside this excerpt; presumably it
   fetches the next two source bytes (using `multibytep') and jumps
   to `no_more_source' when the source is exhausted, which makes the
   function return 1 -- confirm against its definition.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A complete text with an odd src_chars count is rejected
     outright.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Bytes FF FE: little-endian signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Bytes FE FF: big-endian signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.*/
      /* E and O flag the byte values already seen at even and odd
         positions; E_NUM and O_NUM count the distinct values, with
         the first pair already included.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No signature, so the categories that require one are out.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Keep reading pairs until every UTF-16 category is rejected.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 or more distinct even-position bytes: reject
                 big-endian without signature.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* 128 or more distinct odd-position bytes: reject
                 little-endian without signature.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

  /* The two signature branches above fall through to here.  */
 no_more_source:
  return 1;
}
1607
1608 static void
1609 decode_coding_utf_16 (struct coding_system *coding)
1610 {
1611   const unsigned char *src = coding->source + coding->consumed;
1612   const unsigned char *src_end = coding->source + coding->src_bytes;
1613   const unsigned char *src_base;
1614   int *charbuf = coding->charbuf + coding->charbuf_used;
1615   /* We may produces at most 3 chars in one loop.  */
1616   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1617   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1618   bool multibytep = coding->src_multibyte;
1619   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1620   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1621   int surrogate = CODING_UTF_16_SURROGATE (coding);
1622   bool eol_dos
1623     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1624   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1625
1626   if (bom == utf_with_bom)
1627     {
1628       int c, c1, c2;
1629
1630       src_base = src;
1631       ONE_MORE_BYTE (c1);
1632       ONE_MORE_BYTE (c2);
1633       c = (c1 << 8) | c2;
1634
1635       if (endian == utf_16_big_endian
1636           ? c != 0xFEFF : c != 0xFFFE)
1637         {
1638           /* The first two bytes are not BOM.  Treat them as bytes
1639              for a normal character.  */
1640           src = src_base;
1641         }
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644   else if (bom == utf_detect_bom)
1645     {
1646       /* We have already tried to detect BOM and failed in
1647          detect_coding.  */
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650
1651   while (1)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;
1656       consumed_chars_base = consumed_chars;
1657
1658       if (charbuf >= charbuf_end)
1659         {
1660           if (byte_after_cr1 >= 0)
1661             src_base -= 2;
1662           break;
1663         }
1664
1665       if (byte_after_cr1 >= 0)
1666         c1 = byte_after_cr1, byte_after_cr1 = -1;
1667       else
1668         ONE_MORE_BYTE (c1);
1669       if (c1 < 0)
1670         {
1671           *charbuf++ = -c1;
1672           continue;
1673         }
1674       if (byte_after_cr2 >= 0)
1675         c2 = byte_after_cr2, byte_after_cr2 = -1;
1676       else
1677         ONE_MORE_BYTE (c2);
1678       if (c2 < 0)
1679         {
1680           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1681           *charbuf++ = -c2;
1682           continue;
1683         }
1684       c = (endian == utf_16_big_endian
1685            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1686
1687       if (surrogate)
1688         {
1689           if (! UTF_16_LOW_SURROGATE_P (c))
1690             {
1691               if (endian == utf_16_big_endian)
1692                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1693               else
1694                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1695               *charbuf++ = c1;
1696               *charbuf++ = c2;
1697               if (UTF_16_HIGH_SURROGATE_P (c))
1698                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1699               else
1700                 *charbuf++ = c;
1701             }
1702           else
1703             {
1704               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1705               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1706               *charbuf++ = 0x10000 + c;
1707             }
1708         }
1709       else
1710         {
1711           if (UTF_16_HIGH_SURROGATE_P (c))
1712             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1713           else
1714             {
1715               if (eol_dos && c == '\r')
1716                 {
1717                   ONE_MORE_BYTE (byte_after_cr1);
1718                   ONE_MORE_BYTE (byte_after_cr2);
1719                 }
1720               *charbuf++ = c;
1721             }
1722         }
1723     }
1724
1725  no_more_source:
1726   coding->consumed_char += consumed_chars_base;
1727   coding->consumed = src_base - coding->source;
1728   coding->charbuf_used = charbuf - coding->charbuf;
1729 }
1730
1731 static bool
1732 encode_coding_utf_16 (struct coding_system *coding)
1733 {
1734   bool multibytep = coding->dst_multibyte;
1735   int *charbuf = coding->charbuf;
1736   int *charbuf_end = charbuf + coding->charbuf_used;
1737   unsigned char *dst = coding->destination + coding->produced;
1738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1739   int safe_room = 8;
1740   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1741   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1742   ptrdiff_t produced_chars = 0;
1743   int c;
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xBF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in data sent to a
1825    process).  But, it is possible to have a text externally in this
1826    format (i.e. by encoding by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of composition method (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded as two bytes of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form but in this special format:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
/* Table indexed by a byte value C.  The code below reads
   emacs_mule_bytes[C] for a leading byte C: the detector treats
   emacs_mule_bytes[C] - 1 as the number of bytes that must follow C,
   and the character parser accepts only the values 1 through 4
   (aborting otherwise).  So each entry is the total byte length of
   the sequence that starts with C.
   NOTE(review): the table is filled in outside this part of the
   file.  */
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Return true if a text is encoded in 'emacs-mule'.  */
1867
1868 static bool
1869 detect_coding_emacs_mule (struct coding_system *coding,
1870                           struct coding_detection_info *detect_info)
1871 {
1872   const unsigned char *src = coding->source, *src_base;
1873   const unsigned char *src_end = coding->source + coding->src_bytes;
1874   bool multibytep = coding->src_multibyte;
1875   ptrdiff_t consumed_chars = 0;
1876   int c;
1877   int found = 0;
1878
1879   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1880   /* A coding system of this category is always ASCII compatible.  */
1881   src += coding->head_ascii;
1882
1883   while (1)
1884     {
1885       src_base = src;
1886       ONE_MORE_BYTE (c);
1887       if (c < 0)
1888         continue;
1889       if (c == 0x80)
1890         {
1891           /* Perhaps the start of composite character.  We simply skip
1892              it because analyzing it is too heavy for detecting.  But,
1893              at least, we check that the composite character
1894              constitutes of more than 4 bytes.  */
1895           const unsigned char *src_start;
1896
1897         repeat:
1898           src_start = src;
1899           do
1900             {
1901               ONE_MORE_BYTE (c);
1902             }
1903           while (c >= 0xA0);
1904
1905           if (src - src_start <= 4)
1906             break;
1907           found = CATEGORY_MASK_EMACS_MULE;
1908           if (c == 0x80)
1909             goto repeat;
1910         }
1911
1912       if (c < 0x80)
1913         {
1914           if (c < 0x20
1915               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1916             break;
1917         }
1918       else
1919         {
1920           int more_bytes = emacs_mule_bytes[c] - 1;
1921
1922           while (more_bytes > 0)
1923             {
1924               ONE_MORE_BYTE (c);
1925               if (c < 0xA0)
1926                 {
1927                   src--;        /* Unread the last byte.  */
1928                   break;
1929                 }
1930               more_bytes--;
1931             }
1932           if (more_bytes != 0)
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935         }
1936     }
1937   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938   return 0;
1939
1940  no_more_source:
1941   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1942     {
1943       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1944       return 0;
1945     }
1946   detect_info->found |= found;
1947   return 1;
1948 }
1949
1950
1951 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1952    character.  If CMP_STATUS indicates that we must expect MSEQ or
1953    RULE described above, decode it and return the negative value of
1954    the decoded character or rule.  If an invalid byte is found, return
1955    -1.  If SRC is too short, return -2.  */
1956
1957 static int
1958 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1959                  int *nbytes, int *nchars, int *id,
1960                  struct composition_status *cmp_status)
1961 {
1962   const unsigned char *src_end = coding->source + coding->src_bytes;
1963   const unsigned char *src_base = src;
1964   bool multibytep = coding->src_multibyte;
1965   int charset_ID;
1966   unsigned code;
1967   int c;
1968   ptrdiff_t consumed_chars = 0;
1969   bool mseq_found = 0;
1970
1971   ONE_MORE_BYTE (c);
1972   if (c < 0)
1973     {
1974       c = -c;
1975       charset_ID = emacs_mule_charset[0];
1976     }
1977   else
1978     {
1979       if (c >= 0xA0)
1980         {
1981           if (cmp_status->state != COMPOSING_NO
1982               && cmp_status->old_form)
1983             {
1984               if (cmp_status->state == COMPOSING_CHAR)
1985                 {
1986                   if (c == 0xA0)
1987                     {
1988                       ONE_MORE_BYTE (c);
1989                       c -= 0x80;
1990                       if (c < 0)
1991                         goto invalid_code;
1992                     }
1993                   else
1994                     c -= 0x20;
1995                   mseq_found = 1;
1996                 }
1997               else
1998                 {
1999                   *nbytes = src - src_base;
2000                   *nchars = consumed_chars;
2001                   return -c;
2002                 }
2003             }
2004           else
2005             goto invalid_code;
2006         }
2007
2008       switch (emacs_mule_bytes[c])
2009         {
2010         case 2:
2011           if ((charset_ID = emacs_mule_charset[c]) < 0)
2012             goto invalid_code;
2013           ONE_MORE_BYTE (c);
2014           if (c < 0xA0)
2015             goto invalid_code;
2016           code = c & 0x7F;
2017           break;
2018
2019         case 3:
2020           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2021               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2022             {
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2025                 goto invalid_code;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code = c & 0x7F;
2030             }
2031           else
2032             {
2033               if ((charset_ID = emacs_mule_charset[c]) < 0)
2034                 goto invalid_code;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code = (c & 0x7F) << 8;
2039               ONE_MORE_BYTE (c);
2040               if (c < 0xA0)
2041                 goto invalid_code;
2042               code |= c & 0x7F;
2043             }
2044           break;
2045
2046         case 4:
2047           ONE_MORE_BYTE (c);
2048           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2049             goto invalid_code;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code = (c & 0x7F) << 8;
2054           ONE_MORE_BYTE (c);
2055           if (c < 0xA0)
2056             goto invalid_code;
2057           code |= c & 0x7F;
2058           break;
2059
2060         case 1:
2061           code = c;
2062           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2063           break;
2064
2065         default:
2066           emacs_abort ();
2067         }
2068       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2069                           CHARSET_FROM_ID (charset_ID), code, c);
2070       if (c < 0)
2071         goto invalid_code;
2072     }
2073   *nbytes = src - src_base;
2074   *nchars = consumed_chars;
2075   if (id)
2076     *id = charset_ID;
2077   return (mseq_found ? -c : c);
2078
2079  no_more_source:
2080   return -2;
2081
2082  invalid_code:
2083   return -1;
2084 }
2085
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2088
2089 /* Handle these composition sequences ('|': the end of header elements,
2090    BYTES and CHARS >= 0xA0):
2091
2092    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2093    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2094    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2095
2096    and these old forms:
2097
2098    (4) relative composition: 0x80 | MSEQ ... MSEQ
2099    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2100
2101    When the starter 0x80 and the following header elements are found,
2102    this annotation header is produced.
2103
2104         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2105
2106    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108
2109    Then, upon reading the following elements, these codes are produced
2110    until the composition end is found:
2111
2112    (1) CHAR ... CHAR
2113    (2) ALT ... ALT CHAR ... CHAR
2114    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2115    (4) CHAR ... CHAR
2116    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2117
2118    When the composition end is found, LENGTH and NCHARS in the
2119    annotation header are updated as below:
2120
2121    (1) LENGTH: unchanged, NCHARS: unchanged
2122    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2125    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2126
2127    If an error is found while composing, the annotation header is
2128    changed to the original composition header (plus filler -1s) as
2129    below:
2130
2131    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2132    (5)          [ 0x80 0xFF -1 -1 -1 ]
2133
2134    and the sequence [ -2 DECODED-RULE ] is changed to the original
2135    byte sequence as below:
2136         o the original byte sequence is B: [ B -1 ]
2137         o the original byte sequence is B1 B2: [ B1 B2 ]
2138
2139    Most of the routines are implemented by macros because many
2140    variables and labels in the caller decode_coding_emacs_mule must be
2141    accessible, and they are usually called just once (thus doesn't
2142    increase the size of compiled object).  */
2143
/* Emacs 20 style: C is the single byte that represents a composition
   rule.  Decode it and set RULE to the result.  C is modified (0xA0
   is subtracted from it).  Jump to `invalid_code' unless C is in the
   range 0xA0..0xF0.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int gref, nref;                                             \
                                                                \
    c -= 0xA0;                                                  \
    if (! (0 <= c && c < 81))                                   \
      goto invalid_code;                                        \
    /* C packs two values of 0..8 as GREF * 9 + NREF.  */       \
    gref = c / 9;                                               \
    nref = c % 9;                                               \
    /* In either position, the value 4 is replaced by 10.  */   \
    if (gref == 4)                                              \
      gref = 10;                                                \
    if (nref == 4)                                              \
      nref = 10;                                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);                \
  } while (0)
2160
2161
/* Emacs 21 style: a composition rule takes two bytes, each offset by
   0x20.  C holds the first one; the second is read from SRC into C.
   Set RULE to the decoded rule.  Jump to `invalid_code' if either
   byte, with the offset removed, is outside 0..80.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)          \
  do {                                                          \
    int gref, nref;                                             \
                                                                \
    gref = c - 0x20;                                            \
    if (! (0 <= gref && gref < 81))                             \
      goto invalid_code;                                        \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    nref = c - 0x20;                                            \
    if (! (0 <= nref && nref < 81))                             \
      goto invalid_code;                                        \
                                                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);                \
  } while (0)
2179
2180
/* Start of Emacs 21 style format.  C holds the METHOD byte (0xF2 plus
   the composition method); the next two bytes at SRC are BYTES and
   CHARS, each offset by 0xA0, where BYTES is the byte length of this
   composition information and CHARS is the number of characters
   composed by this composition.

   Jump to `invalid_code' if BYTES - 0xA0 is less than 3 (or is not 4
   for a relative composition), or if CHARS - 0xA0 is not in the range
   1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise initialize *CMP_STATUS
   and add the composition annotation header to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    /* BYTES */                                                         \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
                                                                        \
    /* CHARS */                                                         \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (0 < nchars && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
                                                                        \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2212
2213
/* Start of Emacs 20 style format for relative composition.  Mark
   *CMP_STATUS as an old-form relative composition that expects a
   character next, with no characters or components counted yet, and
   add an annotation header with NCHARS and NBYTES of 0 to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2225
2226
/* Start of Emacs 20 style format for rule-base composition.  Mark
   *CMP_STATUS as an old-form composition with rules that expects a
   character next, with no characters or components counted yet, and
   add an annotation header with NCHARS and NBYTES of 0 to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2238
2239
/* Read the byte at SRC into C and start the composition it announces:
   an Emacs 21 style one if C - 0xF2 is a composition method, an Emacs
   20 style relative one if C is in 0xA0..0xBF (C is then the first
   component, so SRC is moved back to read it again), or an Emacs 20
   style rule-base one if C is 0xFF.  Any other byte jumps to
   `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *saved_src = src;                       \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                        \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)          \
      {                                                         \
        DECODE_EMACS_MULE_21_COMPOSITION ();                    \
      }                                                         \
    else if (0xA0 <= c && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* Re-read C as a composition component.  */            \
        src = saved_src;                                        \
      }                                                         \
    else if (c == 0xFF)                                         \
      {                                                         \
        DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();           \
      }                                                         \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
2263
/* Finish the current composition normally.  The annotation header
   starts cmp_status->length elements before CHARBUF.  For an old-form
   composition, store the number of characters in header element 2.
   Otherwise, for any method above COMPOSITION_RELATIVE, set header
   element 0 to header element 2 minus cmp_status->length.  Then mark
   that no composition is in progress.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int idx = - cmp_status->length;                             \
    int *header = charbuf + idx;                                \
                                                                \
    if (cmp_status->old_form)                                   \
      {                                                         \
        header[2] = cmp_status->nchars;                         \
      }                                                         \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      {                                                         \
        header[0] = header[2] - cmp_status->length;             \
      }                                                         \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2274
2275
2276 static int
2277 emacs_mule_finish_composition (int *charbuf,
2278                                struct composition_status *cmp_status)
2279 {
2280   int idx = - cmp_status->length;
2281   int new_chars;
2282
2283   if (cmp_status->old_form && cmp_status->nchars > 0)
2284     {
2285       charbuf[idx + 2] = cmp_status->nchars;
2286       new_chars = 0;
2287       if (cmp_status->method == COMPOSITION_WITH_RULE
2288           && cmp_status->state == COMPOSING_CHAR)
2289         {
2290           /* The last rule was invalid.  */
2291           int rule = charbuf[-1] + 0xA0;
2292
2293           charbuf[-2] = BYTE8_TO_CHAR (rule);
2294           charbuf[-1] = -1;
2295           new_chars = 1;
2296         }
2297     }
2298   else
2299     {
2300       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2301
2302       if (cmp_status->method == COMPOSITION_WITH_RULE)
2303         {
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2305           charbuf[idx++] = -3;
2306           charbuf[idx++] = 0;
2307           new_chars = 1;
2308         }
2309       else
2310         {
2311           int nchars = charbuf[idx + 1] + 0xA0;
2312           int nbytes = charbuf[idx + 2] + 0xA0;
2313
2314           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2317           charbuf[idx++] = -1;
2318           new_chars = 4;
2319         }
2320     }
2321   cmp_status->state = COMPOSING_NO;
2322   return new_chars;
2323 }
2324
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and add the value it returns to
   CHAR_OFFSET.  Otherwise do nothing.  */
#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state == COMPOSING_NO)                                \
      break;                                                              \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status);   \
  } while (0)
2330
2331
2332 static void
2333 decode_coding_emacs_mule (struct coding_system *coding)
2334 {
2335   const unsigned char *src = coding->source + coding->consumed;
2336   const unsigned char *src_end = coding->source + coding->src_bytes;
2337   const unsigned char *src_base;
2338   int *charbuf = coding->charbuf + coding->charbuf_used;
2339   /* We may produce two annotations (charset and composition) in one
2340      loop and one more charset annotation at the end.  */
2341   int *charbuf_end
2342     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2343       /* We can produce up to 2 characters in a loop.  */
2344       - 1;
2345   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2346   bool multibytep = coding->src_multibyte;
2347   ptrdiff_t char_offset = coding->produced_char;
2348   ptrdiff_t last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   bool eol_dos
2351     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   if (cmp_status->state != COMPOSING_NO)
2356     {
2357       int i;
2358
2359       if (charbuf_end - charbuf < cmp_status->length)
2360         emacs_abort ();
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c;
2369       int id UNINIT;
2370
2371       src_base = src;
2372       consumed_chars_base = consumed_chars;
2373
2374       if (charbuf >= charbuf_end)
2375         {
2376           if (byte_after_cr >= 0)
2377             src_base--;
2378           break;
2379         }
2380
2381       if (byte_after_cr >= 0)
2382         c = byte_after_cr, byte_after_cr = -1;
2383       else
2384         ONE_MORE_BYTE (c);
2385
2386       if (c < 0 || c == 0x80)
2387         {
2388           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2389           if (c < 0)
2390             {
2391               *charbuf++ = -c;
2392               char_offset++;
2393             }
2394           else
2395             DECODE_EMACS_MULE_COMPOSITION_START ();
2396           continue;
2397         }
2398
2399       if (c < 0x80)
2400         {
2401           if (eol_dos && c == '\r')
2402             ONE_MORE_BYTE (byte_after_cr);
2403           id = charset_ascii;
2404           if (cmp_status->state != COMPOSING_NO)
2405             {
2406               if (cmp_status->old_form)
2407                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2409                 cmp_status->ncomps--;
2410             }
2411         }
2412       else
2413         {
2414           int nchars UNINIT, nbytes UNINIT;
2415           /* emacs_mule_char can load a charset map from a file, which
2416              allocates a large structure and might cause buffer text
2417              to be relocated as result.  Thus, we need to remember the
2418              original pointer to buffer text, and fix up all related
2419              pointers after the call.  */
2420           const unsigned char *orig = coding->source;
2421           ptrdiff_t offset;
2422
2423           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2424                                cmp_status);
2425           offset = coding->source - orig;
2426           if (offset)
2427             {
2428               src += offset;
2429               src_base += offset;
2430               src_end += offset;
2431             }
2432           if (c < 0)
2433             {
2434               if (c == -1)
2435                 goto invalid_code;
2436               if (c == -2)
2437                 break;
2438             }
2439           src = src_base + nbytes;
2440           consumed_chars = consumed_chars_base + nchars;
2441           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2442             cmp_status->ncomps -= nchars;
2443         }
2444
2445       /* Now if C >= 0, we found a normally encoded character, if C <
2446          0, we found an old-style composition component character or
2447          rule.  */
2448
2449       if (cmp_status->state == COMPOSING_NO)
2450         {
2451           if (last_id != id)
2452             {
2453               if (last_id != charset_ascii)
2454                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2455                                   last_id);
2456               last_id = id;
2457               last_offset = char_offset;
2458             }
2459           *charbuf++ = c;
2460           char_offset++;
2461         }
2462       else if (cmp_status->state == COMPOSING_CHAR)
2463         {
2464           if (cmp_status->old_form)
2465             {
2466               if (c >= 0)
2467                 {
2468                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2469                   *charbuf++ = c;
2470                   char_offset++;
2471                 }
2472               else
2473                 {
2474                   *charbuf++ = -c;
2475                   cmp_status->nchars++;
2476                   cmp_status->length++;
2477                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2478                     EMACS_MULE_COMPOSITION_END ();
2479                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2480                     cmp_status->state = COMPOSING_RULE;
2481                 }
2482             }
2483           else
2484             {
2485               *charbuf++ = c;
2486               cmp_status->length++;
2487               cmp_status->nchars--;
2488               if (cmp_status->nchars == 0)
2489                 EMACS_MULE_COMPOSITION_END ();
2490             }
2491         }
2492       else if (cmp_status->state == COMPOSING_RULE)
2493         {
2494           int rule;
2495
2496           if (c >= 0)
2497             {
2498               EMACS_MULE_COMPOSITION_END ();
2499               *charbuf++ = c;
2500               char_offset++;
2501             }
2502           else
2503             {
2504               c = -c;
2505               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2506               if (rule < 0)
2507                 goto invalid_code;
2508               *charbuf++ = -2;
2509               *charbuf++ = rule;
2510               cmp_status->length += 2;
2511               cmp_status->state = COMPOSING_CHAR;
2512             }
2513         }
2514       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2515         {
2516           *charbuf++ = c;
2517           cmp_status->length++;
2518           if (cmp_status->ncomps == 0)
2519             cmp_status->state = COMPOSING_CHAR;
2520           else if (cmp_status->ncomps > 0)
2521             {
2522               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2523                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2524             }
2525           else
2526             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2527         }
2528       else                      /* COMPOSING_COMPONENT_RULE */
2529         {
2530           int rule;
2531
2532           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2533           if (rule < 0)
2534             goto invalid_code;
2535           *charbuf++ = -2;
2536           *charbuf++ = rule;
2537           cmp_status->length += 2;
2538           cmp_status->ncomps--;
2539           if (cmp_status->ncomps > 0)
2540             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2541           else
2542             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543         }
2544       continue;
2545
2546     invalid_code:
2547       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2548       src = src_base;
2549       consumed_chars = consumed_chars_base;
2550       ONE_MORE_BYTE (c);
2551       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2552       char_offset++;
2553     }
2554
2555  no_more_source:
2556   if (cmp_status->state != COMPOSING_NO)
2557     {
2558       if (coding->mode & CODING_MODE_LAST_BLOCK)
2559         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560       else
2561         {
2562           int i;
2563
2564           charbuf -= cmp_status->length;
2565           for (i = 0; i < cmp_status->length; i++)
2566             cmp_status->carryover[i] = charbuf[i];
2567         }
2568     }
2569   if (last_id != charset_ascii)
2570     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2571   coding->consumed_char += consumed_chars_base;
2572   coding->consumed = src_base - coding->source;
2573   coding->charbuf_used = charbuf - coding->charbuf;
2574 }
2575
2576
/* Store in CODES[0] and CODES[1] the emacs-mule leading code(s) for
   the charset whose emacs-mule ID is ID.

   An ID below 0xA0 is itself the single leading code, and CODES[1] is
   set to 0 to say "no second leading code".  Larger IDs are emitted
   as a prefix byte chosen by range (0x9A for 0xA0..0xDF, 0x9B for
   0xE0..0xEF, 0x9C for 0xF0..0xF4, 0x9D otherwise) followed by ID.

   The macro expands to a single statement that needs the caller's
   terminating semicolon, so it is safe in an unbraced if/else.  Both
   arguments are evaluated more than once; do not pass expressions
   with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2590
2591
     /* Encode the elements of CODING->charbuf as emacs-mule byte
        sequences, writing them to CODING->destination starting at offset
        CODING->produced.  A negative element introduces an annotation
        instead of a character.

        On completion record CODING_RESULT_SUCCESS, add PRODUCED_CHARS to
        CODING->produced_char, and set CODING->produced to the number of
        bytes now in the destination.  Always return 0 (false).

        NOTE(review): `multibytep', `dst', `dst_end' and `produced_chars'
        are not all referenced directly below; presumably the
        ASSURE_DESTINATION / EMIT_* / CODING_CHAR_CHARSET* macros refer to
        them by name, so they must keep these names -- confirm against
        the macro definitions.  */
2592 static bool
2593 encode_coding_emacs_mule (struct coding_system *coding)
2594 {
2595   bool multibytep = coding->dst_multibyte;
2596   int *charbuf = coding->charbuf;
2597   int *charbuf_end = charbuf + coding->charbuf_used;
2598   unsigned char *dst = coding->destination + coding->produced;
2599   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before every element; presumably
          the number of free destination bytes to guarantee -- TODO
          confirm.  */
2600   int safe_room = 8;
2601   ptrdiff_t produced_chars = 0;
2602   Lisp_Object attrs, charset_list;
2603   int c;
       /* Charset ID requested by the most recent charset annotation, or
          -1 when there is none (or it is not in CHARSET_LIST).  */
2604   int preferred_charset_id = -1;
2605
2606   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against Vemacs_mule_charset_list; overwrite the
          charset-list attribute if it holds anything else.  */
2607   if (! EQ (charset_list, Vemacs_mule_charset_list))
2608     {
2609       charset_list = Vemacs_mule_charset_list;
2610       ASET (attrs, coding_attr_charset_list, charset_list);
2611     }
2612
2613   while (charbuf < charbuf_end)
2614     {
2615       ASSURE_DESTINATION (safe_room);
2616       c = *charbuf++;
2617
2618       if (c < 0)
2619         {
2620           /* Handle an annotation.  */
2621           switch (*charbuf)
2622             {
2623             case CODING_ANNOTATE_COMPOSITION_MASK:
2624               /* Not yet implemented.  */
2625               break;
2626             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF has already been advanced past
                      the negative length element, so index 3 is counted
                      from the element after it -- confirm that this is
                      the slot holding the charset ID.  */
2627               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not a member of
                      CHARSET_LIST.  */
2628               if (preferred_charset_id >= 0
2629                   && NILP (Fmemq (make_number (preferred_charset_id),
2630                                   charset_list)))
2631                 preferred_charset_id = -1;
2632               break;
2633             default:
2634               emacs_abort ();
2635             }
               /* One element was consumed above; skip the remaining
                  -C - 1 elements of the annotation.  */
2636           charbuf += -c - 1;
2637           continue;
2638         }
2639
2640       if (ASCII_CHAR_P (c))
2641         EMIT_ONE_ASCII_BYTE (c);
2642       else if (CHAR_BYTE8_P (c))
2643         {
               /* Convert with CHAR_TO_BYTE8 and emit the result as a
                  single byte.  */
2644           c = CHAR_TO_BYTE8 (c);
2645           EMIT_ONE_BYTE (c);
2646         }
2647       else
2648         {
2649           struct charset *charset;
2650           unsigned code;
2651           int dimension;
2652           int emacs_mule_id;
2653           unsigned char leading_codes[2];
2654
               /* Try the preferred charset first; fall back to searching
                  CHARSET_LIST when it cannot encode C.  */
2655           if (preferred_charset_id >= 0)
2656             {
2657               bool result;
2658
2659               charset = CHARSET_FROM_ID (preferred_charset_id);
2660               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2661               if (result)
2662                 code = ENCODE_CHAR (charset, c);
2663               else
2664                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2665                                      &code, charset);
2666             }
2667           else
2668             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669                                  &code, charset);
2670           if (! charset)
2671             {
                   /* No charset was found for C: substitute
                      CODING->default_char.  NOTE(review): CHARSET is
                      dereferenced below without being re-checked after
                      this second lookup -- confirm it cannot be null.  */
2672               c = coding->default_char;
2673               if (ASCII_CHAR_P (c))
2674                 {
2675                   EMIT_ONE_ASCII_BYTE (c);
2676                   continue;
2677                 }
2678               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2679                                    &code, charset);
2680             }
               /* Emit the leading code(s) for the charset, then the code
                  point with the high bit of each byte set: one byte when
                  the charset dimension is 1, otherwise two bytes (high
                  byte first).  */
2681           dimension = CHARSET_DIMENSION (charset);
2682           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2683           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2684           EMIT_ONE_BYTE (leading_codes[0]);
2685           if (leading_codes[1])
2686             EMIT_ONE_BYTE (leading_codes[1]);
2687           if (dimension == 1)
2688             EMIT_ONE_BYTE (code | 0x80);
2689           else
2690             {
2691               code |= 0x8080;
2692               EMIT_ONE_BYTE (code >> 8);
2693               EMIT_ONE_BYTE (code & 0xFF);
2694             }
2695         }
2696     }
2697   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2698   coding->produced_char += produced_chars;
2699   coding->produced = dst - coding->destination;
2700   return 0;
2701 }
2702
2703 \f
2704 /*** 7. ISO2022 handlers ***/
2705
2706 /* The following note describes the coding system ISO2022 briefly.
2707    Since the intention of this note is to help understand the
2708    functions in this file, some parts are NOT ACCURATE or are OVERLY
2709    SIMPLIFIED.  For thorough understanding, please refer to the
2710    original document of ISO2022.  This is equivalent to the standard
2711    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2712
2713    ISO2022 provides many mechanisms to encode several character sets
2714    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2715    is encoded using bytes less than 128.  This may make the encoded
2716    text a little bit longer, but the text passes more easily through
2717    several types of gateway, some of which strip off the MSB (Most
2718    Significant Bit).
2719
2720    There are two kinds of character sets: control character sets and
2721    graphic character sets.  The former contain control characters such
2722    as `newline' and `escape' to provide control functions (control
2723    functions are also provided by escape sequences).  The latter
2724    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2725    two control character sets and many graphic character sets.
2726
2727    Graphic character sets are classified into one of the following
2728    four classes, according to the number of bytes (DIMENSION) and
2729    number of characters in one dimension (CHARS) of the set:
2730    - DIMENSION1_CHARS94
2731    - DIMENSION1_CHARS96
2732    - DIMENSION2_CHARS94
2733    - DIMENSION2_CHARS96
2734
2735    In addition, each character set is assigned an identification tag,
2736    unique for each set, called the "final character" (denoted as <F>
2737    hereafter).  The <F> of each character set is decided by ECMA(*)
2738    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2739    (0x30..0x3F are for private use only).
2740
2741    Note (*): ECMA = European Computer Manufacturers Association
2742
2743    Here are examples of graphic character sets [NAME(<F>)]:
2744         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2745         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2746         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2747         o DIMENSION2_CHARS96 -- none for the moment
2748
2749    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2750         C0 [0x00..0x1F] -- control character plane 0
2751         GL [0x20..0x7F] -- graphic character plane 0
2752         C1 [0x80..0x9F] -- control character plane 1
2753         GR [0xA0..0xFF] -- graphic character plane 1
2754
2755    A control character set is directly designated and invoked to C0 or
2756    C1 by an escape sequence.  The most common case is that:
2757    - ISO646's  control character set is designated/invoked to C0, and
2758    - ISO6429's control character set is designated/invoked to C1,
2759    and usually these designations/invocations are omitted in encoded
2760    text.  In a 7-bit environment, only C0 can be used, and a control
2761    character for C1 is encoded by an appropriate escape sequence to
2762    fit into the environment.  All control characters for C1 are
2763    defined to have corresponding escape sequences.
2764
2765    A graphic character set is at first designated to one of four
2766    graphic registers (G0 through G3), then these graphic registers are
2767    invoked to GL or GR.  These designations and invocations can be
2768    done independently.  The most common case is that G0 is invoked to
2769    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2770    these invocations and designations are omitted in encoded text.
2771    In a 7-bit environment, only GL can be used.
2772
2773    When a graphic character set of CHARS94 is invoked to GL, codes
2774    0x20 and 0x7F of the GL area work as control characters SPACE and
2775    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2776    be used.
2777
2778    There are two ways of invocation: locking-shift and single-shift.
2779    With locking-shift, the invocation lasts until the next different
2780    invocation, whereas with single-shift, the invocation affects the
2781    following character only and doesn't affect the locking-shift
2782    state.  Invocations are done by the following control characters or
2783    escape sequences:
2784
2785    ----------------------------------------------------------------------
2786    abbrev  function                  cntrl escape seq   description
2787    ----------------------------------------------------------------------
2788    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2789    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2790    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2791    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2792    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2793    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2794    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2795    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2796    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2797    ----------------------------------------------------------------------
2798    (*) These are not used by any known coding system.
2799
2800    Control characters for these functions are defined by macros
2801    ISO_CODE_XXX in `coding.h'.
2802
2803    Designations are done by the following escape sequences:
2804    ----------------------------------------------------------------------
2805    escape sequence      description
2806    ----------------------------------------------------------------------
2807    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2808    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2809    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2810    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2811    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2812    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2813    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2814    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2815    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2816    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2817    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2818    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2819    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2820    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2821    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2822    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2823    ----------------------------------------------------------------------
2824
2825    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2826    of dimension 1, chars 94, and final character <F>, etc...
2827
2828    Note (*): Although these designations are not allowed in ISO2022,
2829    Emacs accepts them on decoding, and produces them on encoding
2830    CHARS96 character sets in a coding system which is characterized as
2831    7-bit environment, non-locking-shift, and non-single-shift.
2832
2833    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2834    '(' must be omitted.  We refer to this as "short-form" hereafter.
2835
2836    Now you may notice that there are a lot of ways of encoding the
2837    same multilingual text in ISO2022.  Actually, there exist many
2838    coding systems such as Compound Text (used in X11's inter-client
2839    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2840    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2841    localized platforms), and all of these are variants of ISO2022.
2842
2843    In addition to the above, Emacs handles two more kinds of escape
2844    sequences: ISO6429's direction specification and Emacs' private
2845    sequence for specifying character composition.
2846
2847    ISO6429's direction specification takes the following form:
2848         o CSI ']'      -- end of the current direction
2849         o CSI '0' ']'  -- end of the current direction
2850         o CSI '1' ']'  -- start of left-to-right text
2851         o CSI '2' ']'  -- start of right-to-left text
2852    The control character CSI (0x9B: control sequence introducer) is
2853    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2854
2855    Character composition specification takes the following form:
2856         o ESC '0' -- start relative composition
2857         o ESC '1' -- end composition
2858         o ESC '2' -- start rule-base composition (*)
2859         o ESC '3' -- start relative composition with alternate chars  (**)
2860         o ESC '4' -- start rule-base composition with alternate chars  (**)
2861   Since these are not standard escape sequences of any ISO standard,
2862   the use of them with these meanings is restricted to Emacs only.
2863
2864   (*) This form is used only in Emacs 20.7 and older versions,
2865   but newer versions can safely decode it.
2866   (**) This form is used only in Emacs 21.1 and newer versions,
2867   and older versions can't decode it.
2868
2869   Here's a list of example usages of these composition escape
2870   sequences (categorized by `enum composition_method').
2871
2872   COMPOSITION_RELATIVE:
2873         ESC 0 CHAR [ CHAR ] ESC 1
2874   COMPOSITION_WITH_RULE:
2875         ESC 2 CHAR [ RULE CHAR ] ESC 1
2876   COMPOSITION_WITH_ALTCHARS:
2877         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2878   COMPOSITION_WITH_RULE_ALTCHARS:
2879         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2880
     /* Table of 256 `enum iso_code_class_type' entries.
        NOTE(review): presumably indexed by byte value and filled in by
        initialization code elsewhere in this file -- confirm.  */
2881 static enum iso_code_class_type iso_code_class[256];
2882
/* True if charset ID may be used with CODING: ID must lie in the
   range 0..CODING->max_charset_id and its entry in
   CODING->safe_charsets must not be 255, the value that marks a
   charset as not safe.

   A negative ID (e.g. a "no such charset" value from a lookup table)
   yields false instead of reading before the start of the table.
   ID and CODING are evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)              \
  (0 <= (id)                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[(id)] != 255)
2886
2887 static void
2888 setup_iso_safe_charsets (Lisp_Object attrs)
2889 {
2890   Lisp_Object charset_list, safe_charsets;
2891   Lisp_Object request;
2892   Lisp_Object reg_usage;
2893   Lisp_Object tail;
2894   EMACS_INT reg94, reg96;
2895   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2896   int max_charset_id;
2897
2898   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2899   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2900       && ! EQ (charset_list, Viso_2022_charset_list))
2901     {
2902       charset_list = Viso_2022_charset_list;
2903       ASET (attrs, coding_attr_charset_list, charset_list);
2904       ASET (attrs, coding_attr_safe_charsets, Qnil);
2905     }
2906
2907   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2908     return;
2909
2910   max_charset_id = 0;
2911   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2912     {
2913       int id = XINT (XCAR (tail));
2914       if (max_charset_id < id)
2915         max_charset_id = id;
2916     }
2917
2918   safe_charsets = make_uninit_string (max_charset_id + 1);
2919   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2920   request = AREF (attrs, coding_attr_iso_request);
2921   reg_usage = AREF (attrs, coding_attr_iso_usage);
2922   reg94 = XINT (XCAR (reg_usage));
2923   reg96 = XINT (XCDR (reg_usage));
2924
2925   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2926     {
2927       Lisp_Object id;
2928       Lisp_Object reg;
2929       struct charset *charset;
2930
2931       id = XCAR (tail);
2932       charset = CHARSET_FROM_ID (XINT (id));
2933       reg = Fcdr (Fassq (id, request));
2934       if (! NILP (reg))
2935         SSET (safe_charsets, XINT (id), XINT (reg));
2936       else if (charset->iso_chars_96)
2937         {
2938           if (reg96 < 4)
2939             SSET (safe_charsets, XINT (id), reg96);
2940         }
2941       else
2942         {
2943           if (reg94 < 4)
2944             SSET (safe_charsets, XINT (id), reg94);
2945         }
2946     }
2947   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2948 }
2949
2950
2951 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2952    Return true if a text is encoded in one of ISO-2022 based coding
2953    systems.

        On entry CATEGORY_MASK_ISO is added to DETECT_INFO->checked.  The
        scan starts CODING->head_ascii bytes into CODING->source and
        maintains two bit masks: FOUND (categories for which positive
        evidence was seen) and REJECTED (categories that were ruled out).

        When the source is exhausted (label `no_more_source', presumably
        reached from inside ONE_MORE_BYTE), REJECTED is merged into
        DETECT_INFO->rejected, FOUND minus REJECTED is merged into
        DETECT_INFO->found, and 1 is returned.  If REJECTED becomes equal
        to CATEGORY_MASK_ISO before that, the loop ends, CATEGORY_MASK_ISO
        is added to DETECT_INFO->rejected, and 0 is returned.  */
2954
2955 static bool
2956 detect_coding_iso_2022 (struct coding_system *coding,
2957                         struct coding_detection_info *detect_info)
2958 {
2959   const unsigned char *src = coding->source, *src_base = src;
2960   const unsigned char *src_end = coding->source + coding->src_bytes;
2961   bool multibytep = coding->src_multibyte;
       /* Set after a single-shift is seen (ESC 'N' / ESC 'O', or SS2/SS3
          when an 8-bit category has CODING_ISO_FLAG_SINGLE_SHIFT);
          cleared by the next ASCII byte, escape, locking shift or CSI.
          While set, the odd/even length test on runs of bytes >= 0xA0 is
          skipped.  */
2962   bool single_shifting = 0;
2963   int id;
2964   int c, c1;
2965   ptrdiff_t consumed_chars = 0;
2966   int i;
2967   int rejected = 0;
2968   int found = 0;
       /* -1 while no composition is open; otherwise the number of
          characters counted since the last composition-start escape.  */
2969   int composition_count = -1;
2970
2971   detect_info->checked |= CATEGORY_MASK_ISO;
2972
       /* For every ISO category whose coding-system id is non-negative,
          cache the safe-charsets string (pointer and last valid index).
          The string is recomputed first only when the category has
          CODING_ISO_FLAG_FULL_SUPPORT and its charset list is not
          Viso_2022_charset_list.  */
2973   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2974     {
2975       struct coding_system *this = &(coding_categories[i]);
2976       Lisp_Object attrs, val;
2977
2978       if (this->id < 0)
2979         continue;
2980       attrs = CODING_ID_ATTRS (this->id);
2981       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2982           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2983         setup_iso_safe_charsets (attrs);
2984       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2985       this->max_charset_id = SCHARS (val) - 1;
2986       this->safe_charsets = SDATA (val);
2987     }
2988
2989   /* A coding system of this category is always ASCII compatible.  */
2990   src += coding->head_ascii;
2991
2992   while (rejected != CATEGORY_MASK_ISO)
2993     {
2994       src_base = src;
2995       ONE_MORE_BYTE (c);
2996       switch (c)
2997         {
2998         case ISO_CODE_ESC:
               /* Escape sequences are not examined at all when
                  inhibit_iso_escape_detection is set.  */
2999           if (inhibit_iso_escape_detection)
3000             break;
3001           single_shifting = 0;
3002           ONE_MORE_BYTE (c);
3003           if (c == 'N' || c == 'O')
3004             {
3005               /* ESC <Fe> for SS2 or SS3.  */
3006               single_shifting = 1;
3007               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3008             }
3009           else if (c == '1')
3010             {
3011               /* End of composition.  */
                   /* Ignored unless a composition is open and has no more
                      than MAX_COMPOSITION_COMPONENTS characters.  */
3012               if (composition_count < 0
3013                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3014                 /* Invalid */
3015                 break;
3016               composition_count = -1;
3017               found |= CATEGORY_MASK_ISO;
3018             }
3019           else if (c >= '0' && c <= '4')
3020             {
3021               /* ESC <Fp> for start/end composition.  */
                   /* '1' was handled above, so this branch sees only '0',
                      '2', '3' and '4'; start counting characters.  */
3022               composition_count = 0;
3023             }
3024           else
3025             {
3026               if (c >= '(' && c <= '/')
3027                 {
3028                   /* Designation sequence for a charset of dimension 1.  */
                       /* The middle index is 0 for the intermediates
                          '('..'+' and 1 for ','..'/'; presumably this
                          selects the CHARS94 vs. CHARS96 half of the
                          table -- TODO confirm.  */
3029                   ONE_MORE_BYTE (c1);
3030                   if (c1 < ' ' || c1 >= 0x80
3031                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3032                     {
3033                       /* Invalid designation sequence.  Just ignore.  */
3034                       if (c1 >= 0x80)
3035                         rejected |= (CATEGORY_MASK_ISO_7BIT
3036                                      | CATEGORY_MASK_ISO_7_ELSE);
3037                       break;
3038                     }
3039                 }
3040               else if (c == '$')
3041                 {
3042                   /* Designation sequence for a charset of dimension 2.  */
3043                   ONE_MORE_BYTE (c);
3044                   if (c >= '@' && c <= 'B')
3045                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the two lookups guarded by
                            `< 0' tests, ID is not checked for being
                            negative on this path -- confirm the table
                            always has entries for '@', 'A' and 'B'.  */
3046                     id = iso_charset_table[1][0][c];
3047                   else if (c >= '(' && c <= '/')
3048                     {
3049                       ONE_MORE_BYTE (c1);
3050                       if (c1 < ' ' || c1 >= 0x80
3051                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3052                         {
3053                           /* Invalid designation sequence.  Just ignore.  */
3054                           if (c1 >= 0x80)
3055                             rejected |= (CATEGORY_MASK_ISO_7BIT
3056                                          | CATEGORY_MASK_ISO_7_ELSE);
3057                           break;
3058                         }
3059                     }
3060                   else
3061                     {
3062                       /* Invalid designation sequence.  Just ignore it.  */
3063                       if (c >= 0x80)
3064                         rejected |= (CATEGORY_MASK_ISO_7BIT
3065                                      | CATEGORY_MASK_ISO_7_ELSE);
3066                       break;
3067                     }
3068                 }
3069               else
3070                 {
3071                   /* Invalid escape sequence.  Just ignore it.  */
3072                   if (c >= 0x80)
3073                     rejected |= (CATEGORY_MASK_ISO_7BIT
3074                                  | CATEGORY_MASK_ISO_7_ELSE);
3075                   break;
3076                 }
3077
3078               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found if the charset is safe for it and rejected
                      otherwise.  */
3079               rejected |= CATEGORY_MASK_ISO_8BIT;
3080               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3081                                   id))
3082                 found |= CATEGORY_MASK_ISO_7;
3083               else
3084                 rejected |= CATEGORY_MASK_ISO_7;
3085               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3086                                   id))
3087                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3088               else
3089                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3090               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3091                                   id))
3092                 found |= CATEGORY_MASK_ISO_7_ELSE;
3093               else
3094                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3095               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3096                                   id))
3097                 found |= CATEGORY_MASK_ISO_8_ELSE;
3098               else
3099                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3100             }
3101           break;
3102
3103         case ISO_CODE_SO:
3104         case ISO_CODE_SI:
3105           /* Locking shift out/in.  */
3106           if (inhibit_iso_escape_detection)
3107             break;
3108           single_shifting = 0;
3109           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3110           break;
3111
3112         case ISO_CODE_CSI:
3113           /* Control sequence introducer.  */
               /* Unlike the other control codes, this case does not test
                  inhibit_iso_escape_detection.  The byte is then checked
                  against the Latin extra code table.  */
3114           single_shifting = 0;
3115           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3116           found |= CATEGORY_MASK_ISO_8_ELSE;
3117           goto check_extra_latin;
3118
3119         case ISO_CODE_SS2:
3120         case ISO_CODE_SS3:
3121           /* Single shift.   */
3122           if (inhibit_iso_escape_detection)
3123             break;
3124           single_shifting = 0;
3125           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
               /* The byte counts as a single shift only for an 8-bit
                  category that has CODING_ISO_FLAG_SINGLE_SHIFT; if
                  neither has it, treat the byte as a possible Latin
                  extra code instead.  */
3126           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3127               & CODING_ISO_FLAG_SINGLE_SHIFT)
3128             {
3129               found |= CATEGORY_MASK_ISO_8_1;
3130               single_shifting = 1;
3131             }
3132           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3133               & CODING_ISO_FLAG_SINGLE_SHIFT)
3134             {
3135               found |= CATEGORY_MASK_ISO_8_2;
3136               single_shifting = 1;
3137             }
3138           if (single_shifting)
3139             break;
3140           goto check_extra_latin;
3141
3142         default:
               /* NOTE(review): a negative C presumably means
                  ONE_MORE_BYTE delivered something other than a plain
                  byte from a multibyte source; it is skipped -- confirm
                  against the macro.  */
3143           if (c < 0)
3144             continue;
3145           if (c < 0x80)
3146             {
                   /* A 7-bit byte: count it as one character of an open
                      composition and end any single-shift state.  */
3147               if (composition_count >= 0)
3148                 composition_count++;
3149               single_shifting = 0;
3150               break;
3151             }
3152           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3153           if (c >= 0xA0)
3154             {
3155               found |= CATEGORY_MASK_ISO_8_1;
3156               /* Check the length of succeeding codes of the range
3157                  0xA0..0xFF.  If the byte length is even, we include
3158                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3159                  only when we are not single shifting.  */
3160               if (! single_shifting
3161                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3162                 {
3163                   ptrdiff_t len = 1;
                       /* Consume the rest of the run; the first byte
                          below 0xA0 is pushed back by restoring SRC.  */
3164                   while (src < src_end)
3165                     {
3166                       src_base = src;
3167                       ONE_MORE_BYTE (c);
3168                       if (c < 0xA0)
3169                         {
3170                           src = src_base;
3171                           break;
3172                         }
3173                       len++;
3174                     }
3175
                       /* An odd-length run rules out ISO_8_2 only when
                          more source follows it; a run that reaches the
                          end of the source is accepted either way.  */
3176                   if (len & 1 && src < src_end)
3177                     {
3178                       rejected |= CATEGORY_MASK_ISO_8_2;
3179                       if (composition_count >= 0)
3180                         composition_count += len;
3181                     }
3182                   else
3183                     {
3184                       found |= CATEGORY_MASK_ISO_8_2;
3185                       if (composition_count >= 0)
3186                         composition_count += len / 2;
3187                     }
3188                 }
3189               break;
3190             }
               /* Reached by fall-through for bytes 0x80..0x9F with no
                  case of their own, and by goto from the CSI and SS2/SS3
                  cases.  Such a byte is acceptable only if
                  Vlatin_extra_code_table is a vector with a non-nil
                  entry for it; otherwise every ISO category is
                  rejected.  */
3191         check_extra_latin:
3192           if (! VECTORP (Vlatin_extra_code_table)
3193               || NILP (AREF (Vlatin_extra_code_table, c)))
3194             {
3195               rejected = CATEGORY_MASK_ISO;
3196               break;
3197             }
3198           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199               & CODING_ISO_FLAG_LATIN_EXTRA)
3200             found |= CATEGORY_MASK_ISO_8_1;
3201           else
3202             rejected |= CATEGORY_MASK_ISO_8_1;
3203           rejected |= CATEGORY_MASK_ISO_8_2;
3204           break;
3205         }
3206     }
       /* Every ISO category was rejected before the source ran out.  */
3207   detect_info->rejected |= CATEGORY_MASK_ISO;
3208   return 0;
3209
3210  no_more_source:
       /* Report what was learned; a category that was both found and
          rejected counts as rejected only.  */
3211   detect_info->rejected |= rejected;
3212   detect_info->found |= (found & ~rejected);
3213   return 1;
3214 }
3215
3216
/* Designate to graphic register REG of CODING the charset that
   ISO_CHARSET_TABLE yields for DIM, CHARS_96 and the final byte
   FINAL.  CHARS_96 must be an lvalue: it is set to -1 when the
   escape sequence should be kept.  Uses `coding' from the caller's
   scope.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final) \
  do { \
    int desig_id, prev_id; \
    \
    if (final >= '0' && final < 128 \
        && (desig_id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0 \
        && SAFE_CHARSET_P (coding, desig_id)) \
      { \
        prev_id = CODING_ISO_DESIGNATION (coding, reg); \
        if (desig_id == charset_jisx0201_roman) \
          { \
            /* Map JISX0201-Roman to ASCII when the flag asks.  */ \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
              desig_id = charset_ascii; \
          } \
        else if (desig_id == charset_jisx0208_1978) \
          { \
            /* Map JISX0208.1978 to JISX0208 likewise.  */ \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              desig_id = charset_jisx0208; \
          } \
        CODING_ISO_DESIGNATION (coding, reg) = desig_id; \
        /* An ASCII designation to REG right after an invalid one \
           must itself be kept.  */ \
        if (prev_id == -2 && desig_id == charset_ascii) \
          chars_96 = -1; \
      } \
    else \
      { \
        /* Unknown or unsafe charset: record the invalid designation \
           (-2) in REG and keep the escape sequence.  */ \
        CODING_ISO_DESIGNATION (coding, reg) = -2; \
        chars_96 = -1; \
      } \
  } while (0)
3249
3250
3251 /* Handle these composition sequences (ALT: alternate char):
3252
3253    (1) relative composition: ESC 0 CHAR ... ESC 1
3254    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3257
3258    When the start sequence (ESC 0/2/3/4) is found, this annotation
3259    header is produced.
3260
3261         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3262
3263    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264    produced until the end sequence (ESC 1) is found:
3265
3266    (1) CHAR ... CHAR
3267    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3270
3271    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3272    annotation header are updated as below:
3273
3274    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3276    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3277    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3278
3279    If an error is found while composing, the annotation header is
3280    changed to:
3281
3282         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3283
3284    and the sequence [ -2 DECODED-RULE ] is changed to the original
3285    byte sequence as below:
3286         o the original byte sequence is B: [ B -1 ]
3287         o the original byte sequence is B1 B2: [ B1 B2 ]
3288    and the sequence [ -1 -1 ] is changed to the original byte
3289    sequence:
3290         [ ESC '0' ]
3291 */
3292
/* Decode a composition rule whose first byte is C1 (taken from the
   caller's scope), reading one more source byte when the rule is in
   the new two-byte format, and store the encoded rule in RULE, which
   must be an lvalue.  Jump to invalid_code if the rule is invalid.  */

#define DECODE_COMPOSITION_RULE(rule) \
  do { \
    rule = c1 - 32; \
    if (rule < 0) \
      goto invalid_code; \
    if (rule >= 81) /* new format (after ver.21) */ \
      { \
        int second; \
        \
        ONE_MORE_BYTE (second); \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second - 32)) \
          goto invalid_code; \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32); \
        rule += 0x100; /* Distinguish it from the old format.  */ \
      } \
    else /* old format (before ver.21) */ \
      { \
        int gref = (rule) / 9; \
        int nref = (rule) % 9; \
        \
        /* The old format's reference point 4 is encoded as 10.  */ \
        if (gref == 4) \
          gref = 10; \
        if (nref == 4) \
          nref = 10; \
        rule = COMPOSITION_ENCODE_RULE (gref, nref); \
      } \
  } while (0)
3321
/* Convert the decoded composition rule RULE back into byte form and
   store that form in charbuf[idx] and charbuf[idx + 1]:
     o old format (RULE < 0x100): one byte followed by -1, and
       new_chars is incremented by 1;
     o new format: two bytes, and new_chars is incremented by 2.
   RULE is evaluated exactly once, before charbuf is written, so it
   may be any int expression, including one that reads
   charbuf[idx + 1].  Uses charbuf, idx and new_chars from the
   caller's scope.  */
#define ENCODE_COMPOSITION_RULE(rule) \
  do { \
    const int ecr_rule_ = (rule); \
    int gref = (ecr_rule_ % 0x100) / 12; \
    int nref = (ecr_rule_ % 0x100) % 12; \
    \
    if (ecr_rule_ < 0x100) /* old format */ \
      { \
        /* Reference point 10 is written as 4 in the old format.  */ \
        if (gref == 10) gref = 4; \
        if (nref == 10) nref = 4; \
        charbuf[idx] = 32 + gref * 9 + nref; \
        charbuf[idx + 1] = -1; \
        new_chars++; \
      } \
    else /* new format */ \
      { \
        charbuf[idx] = 32 + 81 + gref; \
        charbuf[idx + 1] = 32 + nref; \
        new_chars += 2; \
      } \
  } while (0)
3341
3342 /* Finish the current composition as invalid.  */
3343
3344 static int
3345 finish_composition (int *charbuf, struct composition_status *cmp_status)
3346 {
3347   int idx = - cmp_status->length;
3348   int new_chars;
3349
3350   /* Recover the original ESC sequence */
3351   charbuf[idx++] = ISO_CODE_ESC;
3352   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3353                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3354                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3355                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3356                     : '4');
3357   charbuf[idx++] = -2;
3358   charbuf[idx++] = 0;
3359   charbuf[idx++] = -1;
3360   new_chars = cmp_status->nchars;
3361   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3362     for (; idx < 0; idx++)
3363       {
3364         int elt = charbuf[idx];
3365
3366         if (elt == -2)
3367           {
3368             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3369             idx++;
3370           }
3371         else if (elt == -1)
3372           {
3373             charbuf[idx++] = ISO_CODE_ESC;
3374             charbuf[idx] = '0';
3375             new_chars += 2;
3376           }
3377       }
3378   cmp_status->state = COMPOSING_NO;
3379   return new_chars;
3380 }
3381
/* If a composition is in progress, finish it as invalid and advance
   char_offset by the count finish_composition returns.  Uses
   cmp_status, charbuf and char_offset from the caller's scope.  */
#define MAYBE_FINISH_COMPOSITION() \
  do { \
    if (cmp_status->state == COMPOSING_NO) \
      break; \
    char_offset += finish_composition (charbuf, cmp_status); \
  } while (0)
3388
/* Handle a composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4,
   where C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 seen in state COMPOSING_COMPONENT_CHAR of an altchar
   composition, or in state COMPOSING_COMPONENT_RULE of an alt&rule
   composition, only closes the component list: [ -1 -1 ] is stored
   and the state becomes COMPOSING_CHAR.

   Otherwise any composition in progress is finished as invalid and a
   new one is started: the annotation header is emitted with
   ADD_COMPOSITION_DATA (charbuf, 0, 0, METHOD), and
   cmp_status->length restarts at MAX_ANNOTATION_LENGTH.

   Uses cmp_status, charbuf, char_offset and coding from the caller's
   scope.  */

#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (c1 == '0' \
        && ((cmp_status->method == COMPOSITION_WITH_ALTCHARS \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR) \
            || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
                && cmp_status->state == COMPOSING_COMPONENT_RULE))) \
      { \
        /* End of the component list.  */ \
        charbuf[0] = -1; \
        charbuf[1] = -1; \
        charbuf += 2; \
        cmp_status->length += 2; \
        cmp_status->state = COMPOSING_CHAR; \
      } \
    else \
      { \
        /* Must run before the method is overwritten below, because \
           finish_composition reads cmp_status->method.  */ \
        MAYBE_FINISH_COMPOSITION (); \
        if (c1 == '0') \
          cmp_status->method = COMPOSITION_RELATIVE; \
        else if (c1 == '2') \
          cmp_status->method = COMPOSITION_WITH_RULE; \
        else if (c1 == '3') \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS; \
        else \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS; \
        if (c1 <= '2') \
          cmp_status->state = COMPOSING_CHAR; \
        else \
          cmp_status->state = COMPOSING_COMPONENT_CHAR; \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
        cmp_status->length = MAX_ANNOTATION_LENGTH; \
        cmp_status->nchars = 0; \
        cmp_status->ncomps = 0; \
        coding->annotated = 1; \
      } \
  } while (0)
3429
3430
/* Handle the composition end sequence ESC 1.

   The composition is rejected when no character has been stored
   (nchars is 0), or when "state is COMPOSING_CHAR" and "method is
   COMPOSITION_WITH_RULE" are both true or both false; in that case it
   is finished as invalid and control jumps to invalid_code.

   Otherwise the annotation header, which starts cmp_status->length
   elements before charbuf, is completed: its first element is lowered
   by ncomps + 2 for an altchar composition or by ncomps * 3 for an
   alt&rule composition, and its third element receives nchars.
   char_offset advances by nchars and the state becomes COMPOSING_NO.

   Uses cmp_status, charbuf and char_offset from the caller's
   scope.  */

#define DECODE_COMPOSITION_END() \
  do { \
    if (cmp_status->nchars != 0 \
        && ((cmp_status->state == COMPOSING_CHAR) \
            != (cmp_status->method == COMPOSITION_WITH_RULE))) \
      { \
        int cmp_head_ = - cmp_status->length; \
        \
        if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
          charbuf[cmp_head_] -= cmp_status->ncomps + 2; \
        else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
          charbuf[cmp_head_] -= cmp_status->ncomps * 3; \
        charbuf[cmp_head_ + 2] = cmp_status->nchars; \
        char_offset += cmp_status->nchars; \
        cmp_status->state = COMPOSING_NO; \
      } \
    else \
      { \
        MAYBE_FINISH_COMPOSITION (); \
        goto invalid_code; \
      } \
  } while (0)
3450
/* Append the pair [ -2 RULE ] to charbuf, add 2 to
   cmp_status->length, and step cmp_status->state back by one.  Uses
   charbuf and cmp_status from the caller's scope.  */

#define STORE_COMPOSITION_RULE(rule) \
  do { \
    charbuf[0] = -2; \
    charbuf[1] = (rule); \
    charbuf += 2; \
    cmp_status->length += 2; \
    cmp_status->state--; \
  } while (0)
3460
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: length grows by one, and nchars is counted in
   state COMPOSING_CHAR while ncomps is counted in any other state.
   The state is then advanced by one when the method is
   COMPOSITION_WITH_RULE, or when it is COMPOSITION_WITH_RULE_ALTCHARS
   and the state is COMPOSING_COMPONENT_CHAR (presumably moving to the
   matching rule state; STORE_COMPOSITION_RULE steps it back).  Uses
   charbuf and cmp_status from the caller's scope.  */

#define STORE_COMPOSITION_CHAR(c) \
  do { \
    *charbuf++ = (c); \
    cmp_status->length += 1; \
    if (cmp_status->state != COMPOSING_CHAR) \
      cmp_status->ncomps++; \
    else \
      cmp_status->nchars++; \
    if ((cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
         && cmp_status->state == COMPOSING_COMPONENT_CHAR) \
        || cmp_status->method == COMPOSITION_WITH_RULE) \
      cmp_status->state++; \
  } while (0)
3477
3478
3479 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3480
3481 static void
3482 decode_coding_iso_2022 (struct coding_system *coding)
3483 {
3484   const unsigned char *src = coding->source + coding->consumed;
3485   const unsigned char *src_end = coding->source + coding->src_bytes;
3486   const unsigned char *src_base;
3487   int *charbuf = coding->charbuf + coding->charbuf_used;
3488   /* We may produce two annotations (charset and composition) in one
3489      loop and one more charset annotation at the end.  */
3490   int *charbuf_end
3491     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3492   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3493   bool multibytep = coding->src_multibyte;
3494   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3495   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3496   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3497   int charset_id_2, charset_id_3;
3498   struct charset *charset;
3499   int c;
3500   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3501   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3502   ptrdiff_t char_offset = coding->produced_char;
3503   ptrdiff_t last_offset = char_offset;
3504   int last_id = charset_ascii;
3505   bool eol_dos
3506     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3507   int byte_after_cr = -1;
3508   int i;
3509
3510   setup_iso_safe_charsets (attrs);
3511   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3512
3513   if (cmp_status->state != COMPOSING_NO)
3514     {
3515       if (charbuf_end - charbuf < cmp_status->length)
3516         emacs_abort ();
3517       for (i = 0; i < cmp_status->length; i++)
3518         *charbuf++ = cmp_status->carryover[i];
3519       coding->annotated = 1;
3520     }
3521
3522   while (1)
3523     {
3524       int c1, c2, c3;
3525
3526       src_base = src;
3527       consumed_chars_base = consumed_chars;
3528
3529       if (charbuf >= charbuf_end)
3530         {
3531           if (byte_after_cr >= 0)
3532             src_base--;
3533           break;
3534         }
3535
3536       if (byte_after_cr >= 0)
3537         c1 = byte_after_cr, byte_after_cr = -1;
3538       else
3539         ONE_MORE_BYTE (c1);
3540       if (c1 < 0)
3541         goto invalid_code;
3542
3543       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3544         {
3545           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546           char_offset++;
3547           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3548           continue;
3549         }
3550
3551       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3552         {
3553           if (c1 == ISO_CODE_ESC)
3554             {
3555               if (src + 1 >= src_end)
3556                 goto no_more_source;
3557               *charbuf++ = ISO_CODE_ESC;
3558               char_offset++;
3559               if (src[0] == '%' && src[1] == '@')
3560                 {
3561                   src += 2;
3562                   consumed_chars += 2;
3563                   char_offset += 2;
3564                   /* We are sure charbuf can contain two more chars. */
3565                   *charbuf++ = '%';
3566                   *charbuf++ = '@';
3567                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3568                 }
3569             }
3570           else
3571             {
3572               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3573               char_offset++;
3574             }
3575           continue;
3576         }
3577
3578       if ((cmp_status->state == COMPOSING_RULE
3579            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3580           && c1 != ISO_CODE_ESC)
3581         {
3582           int rule;
3583
3584           DECODE_COMPOSITION_RULE (rule);
3585           STORE_COMPOSITION_RULE (rule);
3586           continue;
3587         }
3588
3589       /* We produce at most one character.  */
3590       switch (iso_code_class [c1])
3591         {
3592         case ISO_0x20_or_0x7F:
3593           if (charset_id_0 < 0
3594               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3595             /* This is SPACE or DEL.  */
3596             charset = CHARSET_FROM_ID (charset_ascii);
3597           else
3598             charset = CHARSET_FROM_ID (charset_id_0);
3599           break;
3600
3601         case ISO_graphic_plane_0:
3602           if (charset_id_0 < 0)
3603             charset = CHARSET_FROM_ID (charset_ascii);
3604           else
3605             charset = CHARSET_FROM_ID (charset_id_0);
3606           break;
3607
3608         case ISO_0xA0_or_0xFF:
3609           if (charset_id_1 < 0
3610               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3611               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3612             goto invalid_code;
3613           /* This is a graphic character, we fall down ... */
3614
3615         case ISO_graphic_plane_1:
3616           if (charset_id_1 < 0)
3617             goto invalid_code;
3618           charset = CHARSET_FROM_ID (charset_id_1);
3619           break;
3620
3621         case ISO_control_0:
3622           if (eol_dos && c1 == '\r')
3623             ONE_MORE_BYTE (byte_after_cr);
3624           MAYBE_FINISH_COMPOSITION ();
3625           charset = CHARSET_FROM_ID (charset_ascii);
3626           break;
3627
3628         case ISO_control_1:
3629           goto invalid_code;
3630
3631         case ISO_shift_out:
3632           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3633               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3634             goto invalid_code;
3635           CODING_ISO_INVOCATION (coding, 0) = 1;
3636           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3637           continue;
3638
3639         case ISO_shift_in:
3640           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3641             goto invalid_code;
3642           CODING_ISO_INVOCATION (coding, 0) = 0;
3643           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644           continue;
3645
3646         case ISO_single_shift_2_7:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3648             goto invalid_code;
3649         case ISO_single_shift_2:
3650           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3651             goto invalid_code;
3652           /* SS2 is handled as an escape sequence of ESC 'N' */
3653           c1 = 'N';
3654           goto label_escape_sequence;
3655
3656         case ISO_single_shift_3:
3657           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3658             goto invalid_code;
3659           /* SS3 is handled as an escape sequence of ESC 'O' */
3660           c1 = 'O';
3661           goto label_escape_sequence;
3662
3663         case ISO_control_sequence_introducer:
3664           /* CSI is handled as an escape sequence of ESC '[' ...  */
3665           c1 = '[';
3666           goto label_escape_sequence;
3667
3668         case ISO_escape:
3669           ONE_MORE_BYTE (c1);
3670         label_escape_sequence:
3671           /* Escape sequences handled here are invocation,
3672              designation, direction specification, and character
3673              composition specification.  */
3674           switch (c1)
3675             {
3676             case '&':           /* revision of following character set */
3677               ONE_MORE_BYTE (c1);
3678               if (!(c1 >= '@' && c1 <= '~'))
3679                 goto invalid_code;
3680               ONE_MORE_BYTE (c1);
3681               if (c1 != ISO_CODE_ESC)
3682                 goto invalid_code;
3683               ONE_MORE_BYTE (c1);
3684               goto label_escape_sequence;
3685
3686             case '$':           /* designation of 2-byte character set */
3687               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3688                 goto invalid_code;
3689               {
3690                 int reg, chars96;
3691
3692                 ONE_MORE_BYTE (c1);
3693                 if (c1 >= '@' && c1 <= 'B')
3694                   {     /* designation of JISX0208.1978, GB2312.1980,
3695                            or JISX0208.1980 */
3696                     reg = 0, chars96 = 0;
3697                   }
3698                 else if (c1 >= 0x28 && c1 <= 0x2B)
3699                   { /* designation of DIMENSION2_CHARS94 character set */
3700                     reg = c1 - 0x28, chars96 = 0;
3701                     ONE_MORE_BYTE (c1);
3702                   }
3703                 else if (c1 >= 0x2C && c1 <= 0x2F)
3704                   { /* designation of DIMENSION2_CHARS96 character set */
3705                     reg = c1 - 0x2C, chars96 = 1;
3706                     ONE_MORE_BYTE (c1);
3707                   }
3708                 else
3709                   goto invalid_code;
3710                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3711                 /* We must update these variables now.  */
3712                 if (reg == 0)
3713                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3714                 else if (reg == 1)
3715                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3716                 if (chars96 < 0)
3717                   goto invalid_code;
3718               }
3719               continue;
3720
3721             case 'n':           /* invocation of locking-shift-2 */
3722               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3723                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3724                 goto invalid_code;
3725               CODING_ISO_INVOCATION (coding, 0) = 2;
3726               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3727               continue;
3728
3729             case 'o':           /* invocation of locking-shift-3 */
3730               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3731                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3732                 goto invalid_code;
3733               CODING_ISO_INVOCATION (coding, 0) = 3;
3734               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3735               continue;
3736
3737             case 'N':           /* invocation of single-shift-2 */
3738               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3739                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3740                 goto invalid_code;
3741               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3742               if (charset_id_2 < 0)
3743                 charset = CHARSET_FROM_ID (charset_ascii);
3744               else
3745                 charset = CHARSET_FROM_ID (charset_id_2);
3746               ONE_MORE_BYTE (c1);
3747               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3748                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3749                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3750                           ? c1 >= 0x80 : c1 < 0x80)))
3751                 goto invalid_code;
3752               break;
3753
3754             case 'O':           /* invocation of single-shift-3 */
3755               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3756                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3757                 goto invalid_code;
3758               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3759               if (charset_id_3 < 0)
3760                 charset = CHARSET_FROM_ID (charset_ascii);
3761               else
3762                 charset = CHARSET_FROM_ID (charset_id_3);
3763               ONE_MORE_BYTE (c1);
3764               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3765                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3766                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3767                           ? c1 >= 0x80 : c1 < 0x80)))
3768                 goto invalid_code;
3769               break;
3770
3771             case '0': case '2': case '3': case '4': /* start composition */
3772               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3773                 goto invalid_code;
3774               if (last_id != charset_ascii)
3775                 {
3776                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3777                   last_id = charset_ascii;
3778                   last_offset = char_offset;
3779                 }
3780               DECODE_COMPOSITION_START (c1);
3781               continue;
3782
3783             case '1':           /* end composition */
3784               if (cmp_status->state == COMPOSING_NO)
3785                 goto invalid_code;
3786               DECODE_COMPOSITION_END ();
3787               continue;
3788
3789             case '[':           /* specification of direction */
3790               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3791                 goto invalid_code;
3792               /* For the moment, nested direction is not supported.
3793                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3794                  left-to-right, and nonzero means right-to-left.  */
3795               ONE_MORE_BYTE (c1);
3796               switch (c1)
3797                 {
3798                 case ']':       /* end of the current direction */
3799                   coding->mode &= ~CODING_MODE_DIRECTION;
3800
3801                 case '0':       /* end of the current direction */
3802                 case '1':       /* start of left-to-right direction */
3803                   ONE_MORE_BYTE (c1);
3804                   if (c1 == ']')
3805                     coding->mode &= ~CODING_MODE_DIRECTION;
3806                   else
3807                     goto invalid_code;
3808                   break;
3809
3810                 case '2':       /* start of right-to-left direction */
3811                   ONE_MORE_BYTE (c1);
3812                   if (c1 == ']')
3813                     coding->mode |= CODING_MODE_DIRECTION;
3814                   else
3815                     goto invalid_code;
3816                   break;
3817
3818                 default:
3819                   goto invalid_code;
3820                 }
3821               continue;
3822
3823             case '%':
3824               ONE_MORE_BYTE (c1);
3825               if (c1 == '/')
3826                 {
3827                   /* CTEXT extended segment:
3828                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3829                      We keep these bytes as is for the moment.
3830                      They may be decoded by post-read-conversion.  */
3831                   int dim, M, L;
3832                   int size;
3833
3834                   ONE_MORE_BYTE (dim);
3835                   if (dim < '0' || dim > '4')
3836                     goto invalid_code;
3837                   ONE_MORE_BYTE (M);
3838                   if (M < 128)
3839                     goto invalid_code;
3840                   ONE_MORE_BYTE (L);
3841                   if (L < 128)
3842                     goto invalid_code;
3843                   size = ((M - 128) * 128) + (L - 128);
3844                   if (charbuf + 6 > charbuf_end)
3845                     goto break_loop;
3846                   *charbuf++ = ISO_CODE_ESC;
3847                   *charbuf++ = '%';
3848                   *charbuf++ = '/';
3849                   *charbuf++ = dim;
3850                   *charbuf++ = BYTE8_TO_CHAR (M);
3851                   *charbuf++ = BYTE8_TO_CHAR (L);
3852                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3853                 }
3854               else if (c1 == 'G')
3855                 {
3856                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3857                      ESC % G --UTF-8-BYTES-- ESC % @
3858                      We keep these bytes as is for the moment.
3859                      They may be decoded by post-read-conversion.  */
3860                   if (charbuf + 3 > charbuf_end)
3861                     goto break_loop;
3862                   *charbuf++ = ISO_CODE_ESC;
3863                   *charbuf++ = '%';
3864                   *charbuf++ = 'G';
3865                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3866                 }
3867               else
3868                 goto invalid_code;
3869               continue;
3870               break;
3871
3872             default:
3873               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3874                 goto invalid_code;
3875               {
3876                 int reg, chars96;
3877
3878                 if (c1 >= 0x28 && c1 <= 0x2B)
3879                   { /* designation of DIMENSION1_CHARS94 character set */
3880                     reg = c1 - 0x28, chars96 = 0;
3881                     ONE_MORE_BYTE (c1);
3882                   }
3883                 else if (c1 >= 0x2C && c1 <= 0x2F)
3884                   { /* designation of DIMENSION1_CHARS96 character set */
3885                     reg = c1 - 0x2C, chars96 = 1;
3886                     ONE_MORE_BYTE (c1);
3887                   }
3888                 else
3889                   goto invalid_code;
3890                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3891                 /* We must update these variables now.  */
3892                 if (reg == 0)
3893                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3894                 else if (reg == 1)
3895                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3896                 if (chars96 < 0)
3897                   goto invalid_code;
3898               }
3899               continue;
3900             }
3901           break;
3902
3903         default:
3904           emacs_abort ();
3905         }
3906
3907       if (cmp_status->state == COMPOSING_NO
3908           && charset->id != charset_ascii
3909           && last_id != charset->id)
3910         {
3911           if (last_id != charset_ascii)
3912             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3913           last_id = charset->id;
3914           last_offset = char_offset;
3915         }
3916
3917       /* Now we know CHARSET and 1st position code C1 of a character.
3918          Produce a decoded character while getting 2nd and 3rd
3919          position codes C2, C3 if necessary.  */
3920       if (CHARSET_DIMENSION (charset) > 1)
3921         {
3922           ONE_MORE_BYTE (c2);
3923           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3924               || ((c1 & 0x80) != (c2 & 0x80)))
3925             /* C2 is not in a valid range.  */
3926             goto invalid_code;
3927           if (CHARSET_DIMENSION (charset) == 2)
3928             c1 = (c1 << 8) | c2;
3929           else
3930             {
3931               ONE_MORE_BYTE (c3);
3932               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3933                   || ((c1 & 0x80) != (c3 & 0x80)))
3934                 /* C3 is not in a valid range.  */
3935                 goto invalid_code;
3936               c1 = (c1 << 16) | (c2 << 8) | c2;
3937             }
3938         }
3939       c1 &= 0x7F7F7F;
3940       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3941       if (c < 0)
3942         {
3943           MAYBE_FINISH_COMPOSITION ();
3944           for (; src_base < src; src_base++, char_offset++)
3945             {
3946               if (ASCII_CHAR_P (*src_base))
3947                 *charbuf++ = *src_base;
3948               else
3949                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3950             }
3951         }
3952       else if (cmp_status->state == COMPOSING_NO)
3953         {
3954           *charbuf++ = c;
3955           char_offset++;
3956         }
3957       else if ((cmp_status->state == COMPOSING_CHAR
3958                 ? cmp_status->nchars
3959                 : cmp_status->ncomps)
3960                >= MAX_COMPOSITION_COMPONENTS)
3961         {
3962           /* Too long composition.  */
3963           MAYBE_FINISH_COMPOSITION ();
3964           *charbuf++ = c;
3965           char_offset++;
3966         }
3967       else
3968         STORE_COMPOSITION_CHAR (c);
3969       continue;
3970
3971     invalid_code:
3972       MAYBE_FINISH_COMPOSITION ();
3973       src = src_base;
3974       consumed_chars = consumed_chars_base;
3975       ONE_MORE_BYTE (c);
3976       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3977       char_offset++;
3978       /* Reset the invocation and designation status to the safest
3979          one; i.e. designate ASCII to the graphic register 0, and
3980          invoke that register to the graphic plane 0.  This typically
3981          helps the case that a designation sequence for ASCII "ESC (
3982          B" is somehow broken (e.g. broken by a newline).  */
3983       CODING_ISO_INVOCATION (coding, 0) = 0;
3984       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3985       charset_id_0 = charset_ascii;
3986       continue;
3987
3988     break_loop:
3989       break;
3990     }
3991
3992  no_more_source:
3993   if (cmp_status->state != COMPOSING_NO)
3994     {
3995       if (coding->mode & CODING_MODE_LAST_BLOCK)
3996         MAYBE_FINISH_COMPOSITION ();
3997       else
3998         {
3999           charbuf -= cmp_status->length;
4000           for (i = 0; i < cmp_status->length; i++)
4001             cmp_status->carryover[i] = charbuf[i];
4002         }
4003     }
4004   else if (last_id != charset_ascii)
4005     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4006   coding->consumed_char += consumed_chars_base;
4007   coding->consumed = src_base - coding->source;
4008   coding->charbuf_used = charbuf - coding->charbuf;
4009 }
4010
4011
4012 /* ISO2022 encoding stuff.  */
4013
4014 /*
4015    When encoding, it is not enough to say just "ISO2022"; we have to
4016    specify more details.  In Emacs, each coding system of an ISO2022
4017    variant has the following specifications:
4018         1. Initial designation to G0 thru G3.
4019         2. Allows short-form designation?
4020         3. ASCII should be designated to G0 before control characters?
4021         4. ASCII should be designated to G0 at end of line?
4022         5. 7-bit environment or 8-bit environment?
4023         6. Use locking-shift?
4024         7. Use Single-shift?
4025    And the following two are only for Japanese:
4026         8. Use ASCII in place of JIS0201-1976-Roman?
4027         9. Use JISX0208-1983 in place of JISX0208-1978?
4028    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4029    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4030    details.
4031 */
4032
4033 /* Produce codes (an escape sequence) for designating CHARSET to
4034    graphic register REG, then record the id of CHARSET as the
4035    designation of REG in CODING.  REG indexes the four-character
4036    intermediate-char tables below, so it must be in the range 0..3.

        The emitted sequence is, in order:
          - when CODING has CODING_ISO_FLAG_REVISION and the revision of
            CHARSET is >= 0: ESC '&' and the byte '@' + revision;
          - ESC;
          - for a DIMENSION1 charset: one intermediate char selected by
            REG, taken from ( ) * + for a 94-charset and from , - . /
            for a 96-charset;
          - otherwise: '$' followed by the intermediate char selected
            the same way;
          - <final-char> of CHARSET.
        Short form: for a multi-dimension 94-charset the intermediate
        char is omitted when REG is 0, <final-char> is '@', 'A', or 'B',
        and CODING does not have CODING_ISO_FLAG_LONG_FORM.

        CHARSET and REG are expanded more than once, so pass expressions
        without side effects.  The EMIT_* macros used here are defined
        elsewhere in this file; presumably they store through a variable
        DST of the expanding function -- confirm at their definition.  */
4037
4038 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4039   do {                                                                  \
4040     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4041     const char *intermediate_char_94 = "()*+";                          \
4042     const char *intermediate_char_96 = ",-./";                          \
4043     int revision = -1;                                                  \
4044                                                                         \
4045     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4046       revision = CHARSET_ISO_REVISION (charset);                        \
4047     /* Optional revision prefix.  */                                    \
4048     if (revision >= 0)                                                  \
4049       {                                                                 \
4050         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4051         EMIT_ONE_BYTE ('@' + revision);                                 \
4052       }                                                                 \
4053     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4054     if (CHARSET_DIMENSION (charset) == 1)                               \
4055       {                                                                 \
4056         int b;                                                          \
4057         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4058           b = intermediate_char_94[reg];                                \
4059         else                                                            \
4060           b = intermediate_char_96[reg];                                \
4061         EMIT_ONE_ASCII_BYTE (b);                                        \
4062       }                                                                 \
4063     else                                                                \
4064       {                                                                 \
4065         EMIT_ONE_ASCII_BYTE ('$');                                      \
4066         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4067           {                                                             \
4068             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4069                 || reg != 0                                             \
4070                 || final_char < '@' || final_char > 'B')                \
4071               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4072           }                                                             \
4073         else                                                            \
4074           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4075       }                                                                 \
4076     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4077     /* Remember what REG now holds.  */                                 \
4078     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4079   } while (0)
4080
4081
4082 /* The following two macros produce codes (a control character or an
4083    escape sequence) for the ISO2022 single-shift functions
4084    (single-shift-2 and single-shift-3).  Neither takes arguments: both
        use a variable named coding from the expanding function.  Each
        emits a two-byte escape sequence when CODING has
        CODING_ISO_FLAG_SEVEN_BITS and a one-byte control code otherwise,
        then sets CODING_ISO_SINGLE_SHIFTING (coding) to 1.
        ENCODE_ISO_CHARACTER_DIMENSION1 and ENCODE_ISO_CHARACTER_DIMENSION2
        test that flag and clear it after emitting the shifted character.  */
4085
     /* Single-shift-2: ESC 'N' with CODING_ISO_FLAG_SEVEN_BITS, else
        the single byte ISO_CODE_SS2.  */
4086 #define ENCODE_SINGLE_SHIFT_2                                           \
4087   do {                                                                  \
4088     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4089       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4090     else                                                                \
4091       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4092     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4093   } while (0)
4094
4095 /* Single-shift-3: emit ESC 'O' when CODING has
        CODING_ISO_FLAG_SEVEN_BITS, else the single byte ISO_CODE_SS3,
        then set CODING_ISO_SINGLE_SHIFTING (coding) to 1.  Takes no
        arguments; uses a variable named coding from the expanding
        function.  */
4096 #define ENCODE_SINGLE_SHIFT_3                                           \
4097   do {                                                                  \
4098     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4099       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4100     else                                                                \
4101       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4102     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4103   } while (0)
4104
4105
4106 /* The following four macros produce codes (a control character or an
4107    escape sequence) for the ISO2022 locking-shift functions (shift-in,
4108    shift-out, locking-shift-2, and locking-shift-3).  Each one also
        stores in CODING_ISO_INVOCATION (coding, 0) the number of the
        graphic register it invokes.  None takes arguments: they use a
        variable named coding from the expanding function.  */
4109
     /* Shift-in: emit ISO_CODE_SI and record graphic register 0 as the
        one invoked.  */
4110 #define ENCODE_SHIFT_IN                                 \
4111   do {                                                  \
4112     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4113     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4114   } while (0)
4115
4116 /* Shift-out: emit ISO_CODE_SO and record graphic register 1 in
        CODING_ISO_INVOCATION (coding, 0).  Takes no arguments; uses a
        variable named coding from the expanding function.  */
4117 #define ENCODE_SHIFT_OUT                                \
4118   do {                                                  \
4119     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4120     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4121   } while (0)
4122
4123 /* Locking-shift-2: emit ESC 'n' and record graphic register 2 in
        CODING_ISO_INVOCATION (coding, 0).  Takes no arguments; uses a
        variable named coding from the expanding function.  */
4124 #define ENCODE_LOCKING_SHIFT_2                          \
4125   do {                                                  \
4126     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4127     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4128   } while (0)
4129
4130 /* Locking-shift-3 (LS3): emit ESC 'o' and record graphic register 3
        in CODING_ISO_INVOCATION (coding, 0).  The final byte must be 'o':
        ESC 'n' is locking-shift-2 (see ENCODE_LOCKING_SHIFT_2), so
        emitting 'n' here would make a reader invoke graphic register 2
        while the encoder believes register 3 is invoked.  Takes no
        arguments; uses a variable named coding from the expanding
        function.  */
4131 #define ENCODE_LOCKING_SHIFT_3                          \
4132   do {                                                  \
4133     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4134     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4135   } while (0)
4136
4137
4138 /* Produce codes for a DIMENSION1 character whose character set is
4139    CHARSET and whose position-code is C1.  Designation and invocation
4140    sequences are also produced in advance if necessary.

        CHARSET must be a modifiable variable: when CODING has
        CODING_ISO_FLAG_USE_ROMAN and the id of CHARSET is charset_ascii,
        CHARSET is reassigned to the charset whose id is
        charset_jisx0201_roman.  The macro also uses the variables
        coding, dst and produced_chars of the expanding function.

        The body loops until one byte has been emitted:
          - right after a single shift, C1 is emitted (as C1 & 0x7F with
            CODING_ISO_FLAG_SEVEN_BITS, else as C1 | 0x80) and the
            single-shifting flag is cleared;
          - if CHARSET is the charset invoked to graphic plane 0,
            C1 & 0x7F is emitted;
          - if CHARSET is the charset invoked to graphic plane 1,
            C1 | 0x80 is emitted;
          - otherwise encode_invocation_designation is called and the
            tests above are repeated.
        C1 is expanded without parentheses, so pass a simple operand.  */
4141
4142 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4143   do {                                                                  \
4144     int id = CHARSET_ID (charset);                                      \
4145                                                                         \
4146     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4147         && id == charset_ascii)                                         \
4148       {                                                                 \
4149         id = charset_jisx0201_roman;                                    \
4150         charset = CHARSET_FROM_ID (id);                                 \
4151       }                                                                 \
4152                                                                         \
4153     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4154       {                                                                 \
4155         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4156           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4157         else                                                            \
4158           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4159         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4160         break;                                                          \
4161       }                                                                 \
4162     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4163       {                                                                 \
4164         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4165         break;                                                          \
4166       }                                                                 \
4167     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4168       {                                                                 \
4169         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4170         break;                                                          \
4171       }                                                                 \
4172     else                                                                \
4173       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4174          must invoke it, or, at first, designate it to some graphic     \
4175          register.  Then repeat the loop to actually produce the        \
4176          character.  */                                                 \
4177       dst = encode_invocation_designation (charset, coding, dst,        \
4178                                            &produced_chars);            \
4179   } while (1)
4180
4181
4182 /* Produce codes for a DIMENSION2 character whose character set is
4183    CHARSET and whose position-codes are C1 and C2.  Designation and
4184    invocation codes are also produced in advance if necessary.

        CHARSET must be a modifiable variable: when CODING has
        CODING_ISO_FLAG_USE_OLDJIS and the id of CHARSET is
        charset_jisx0208, CHARSET is reassigned to the charset whose id
        is charset_jisx0208_1978.  The macro also uses the variables
        coding, dst and produced_chars of the expanding function.

        The body loops until the two bytes have been emitted:
          - right after a single shift, both codes are emitted (masked
            with 0x7F with CODING_ISO_FLAG_SEVEN_BITS, else with 0x80
            set) and the single-shifting flag is cleared;
          - if CHARSET is the charset invoked to graphic plane 0, both
            codes are emitted masked with 0x7F;
          - if CHARSET is the charset invoked to graphic plane 1, both
            codes are emitted with 0x80 set;
          - otherwise encode_invocation_designation is called and the
            tests above are repeated.  */
4185
4186 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4187   do {                                                                  \
4188     int id = CHARSET_ID (charset);                                      \
4189                                                                         \
4190     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4191         && id == charset_jisx0208)                                      \
4192       {                                                                 \
4193         id = charset_jisx0208_1978;                                     \
4194         charset = CHARSET_FROM_ID (id);                                 \
4195       }                                                                 \
4196                                                                         \
4197     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4198       {                                                                 \
4199         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4200           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4201         else                                                            \
4202           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4203         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4204         break;                                                          \
4205       }                                                                 \
4206     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4207       {                                                                 \
4208         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4209         break;                                                          \
4210       }                                                                 \
4211     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4212       {                                                                 \
4213         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4214         break;                                                          \
4215       }                                                                 \
4216     else                                                                \
4217       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4218          must invoke it, or, at first, designate it to some graphic     \
4219          register.  Then repeat the loop to actually produce the        \
4220          character.  */                                                 \
4221       dst = encode_invocation_designation (charset, coding, dst,        \
4222                                            &produced_chars);            \
4223   } while (1)
4224
4225 /* Encode character C of charset CHARSET.  CODING_ENCODE_CHAR stores
        the code of C in the local CODE; a charset of dimension 1 is then
        emitted through ENCODE_ISO_CHARACTER_DIMENSION1, and any other
        charset through ENCODE_ISO_CHARACTER_DIMENSION2 with CODE >> 8 and
        CODE & 0xFF as the two position-codes.  CHARSET must be a
        modifiable variable because those macros may reassign it.  Uses
        the variables coding, dst and dst_end of the expanding function.
        NOTE(review): a charset of dimension 3 or more would also take the
        two-code path -- confirm that none can reach this macro.  */
4226 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4227   do {                                                                     \
4228     unsigned code;                                                         \
4229     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4230                                                                            \
4231     if (CHARSET_DIMENSION (charset) == 1)                                  \
4232       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4233     else                                                                   \
4234       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4235   } while (0)
4236
4237
4238 /* Produce at DST the designation and invocation codes needed before a
4239    character of CHARSET can be encoded, and update the ISO2022 state in
4240    *CODING.  *P_NCHARS counts produced chars.  Return the new DST.  */
4241
4242 static unsigned char *
4243 encode_invocation_designation (struct charset *charset,
4244                                struct coding_system *coding,
4245                                unsigned char *dst, ptrdiff_t *p_nchars)
4246 {
4247   /* MULTIBYTEP and PRODUCED_CHARS are not named again below except
4248      for the final store to *P_NCHARS; presumably the EMIT_* macros
4249      expanded by the ENCODE_* macros use them.  */
4250   bool multibytep = coding->dst_multibyte;
4251   ptrdiff_t produced_chars = *p_nchars;
4252   int wanted = CHARSET_ID (charset);
4253   int reg = 0;                  /* graphic register number */
4254   bool invoked;
4255
4256   /* Look for a graphic register CHARSET is already designated to.  */
4257   while (reg < 4 && CODING_ISO_DESIGNATION (coding, reg) != wanted)
4258     reg++;
4259
4260   if (reg == 4)
4261     {
4262       /* There is none.  Designate CHARSET to the register it
4263          requests, or to graphic register 0 when it requests none.  */
4264       int requested = CODING_ISO_REQUEST (coding, wanted);
4265
4266       reg = requested >= 0 ? requested : 0;
4267       ENCODE_DESIGNATION (charset, reg, coding);
4268     }
4269
4270   /* Is REG already invoked to graphic plane 0 or 1?  */
4271   invoked = (CODING_ISO_INVOCATION (coding, 0) == reg
4272              || CODING_ISO_INVOCATION (coding, 1) == reg);
4273
4274   if (! invoked)
4275     {
4276       /* No, so invoke it to graphic plane 0.  Registers 2 and 3 use
4277          a single shift when CODING has CODING_ISO_FLAG_SINGLE_SHIFT
4278          and a locking shift otherwise.  Any other value of REG
4279          produces nothing.  */
4280       bool single_shift
4281         = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) != 0;
4282
4283       if (reg == 0)
4284         ENCODE_SHIFT_IN;
4285       else if (reg == 1)
4286         ENCODE_SHIFT_OUT;
4287       else if (reg == 2)
4288         {
4289           if (single_shift)
4290             ENCODE_SINGLE_SHIFT_2;
4291           else
4292             ENCODE_LOCKING_SHIFT_2;
4293         }
4294       else if (reg == 3)
4295         {
4296           if (single_shift)
4297             ENCODE_SINGLE_SHIFT_3;
4298           else
4299             ENCODE_LOCKING_SHIFT_3;
4300         }
4301     }
4302
4303   /* Hand the updated character count back to the caller.  */
4304   *p_nchars = produced_chars;
4305   return dst;
4306 }
4307
4308
4309 /* Produce codes for designation and invocation to reset the graphic
4310    planes and registers to initial state.  */
4311 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4312   do {                                                                  \
4313     int reg;                                                            \
4314     struct charset *charset;                                            \
4315                                                                         \
4316     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4317       ENCODE_SHIFT_IN;                                                  \
4318     for (reg = 0; reg < 4; reg++)                                       \
4319       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4320           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4321               != CODING_ISO_INITIAL (coding, reg)))                     \
4322         {                                                               \
4323           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4324           ENCODE_DESIGNATION (charset, reg, coding);                    \
4325         }                                                               \
4326   } while (0)
4327
4328
4329 /* Look ahead in the line that starts at CHARBUF (and ends at the first
4330    newline, or at CHARBUF_END) and write at DST the designation
4331    sequences requested by its charsets.  Return the number of bytes
4332    written.  DST must not point directly into a buffer text area,
4333    because a char_charset call may relocate that text.
4334    If the current block ends before the end of the line, some of the
4335    necessary designations may be missed.  */
4336
4337 static ptrdiff_t
4338 encode_designation_at_bol (struct coding_system *coding,
4339                            int *charbuf, int *charbuf_end,
4340                            unsigned char *dst)
4341 {
4342   unsigned char *start = dst;
4343   /* wanted[REG] is the id of the charset to be designated to graphic
4344      register REG, or -1 if nothing in the line requests REG.  */
4345   int wanted[4] = { -1, -1, -1, -1 };
4346   int nfound = 0, reg;
4347   /* The next two are not named below; presumably the EMIT_* macros
4348      expanded by ENCODE_DESIGNATION use them.  */
4349   ptrdiff_t produced_chars = 0;
4350   bool multibytep = coding->dst_multibyte;
4351   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
4352   Lisp_Object charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4353
4354   if (EQ (charset_list, Qiso_2022))
4355     charset_list = Viso_2022_charset_list;
4356
4357   /* Scan the line.  The first charset that requests a register gets
4358      it; stop early once all four registers are taken.  */
4359   for (; charbuf < charbuf_end && nfound < 4; charbuf++)
4360     {
4361       int c = *charbuf, id;
4362       struct charset *charset;
4363
4364       if (c == '\n')
4365         break;
4366       charset = char_charset (c, charset_list, NULL);
4367       id = CHARSET_ID (charset);
4368       reg = CODING_ISO_REQUEST (coding, id);
4369       if (reg < 0 || wanted[reg] >= 0)
4370         continue;
4371       wanted[reg] = id;
4372       nfound++;
4373     }
4374
4375   /* Designate each requested charset whose register currently holds
4376      something else.  */
4377   for (reg = 0; nfound > 0 && reg < 4; reg++)
4378     if (wanted[reg] >= 0
4379         && CODING_ISO_DESIGNATION (coding, reg) != wanted[reg])
4380       {
4381         struct charset *charset = CHARSET_FROM_ID (wanted[reg]);
4382         ENCODE_DESIGNATION (charset, reg, coding);
4383       }
4384
4385   return dst - start;
4386 }
4387
4388 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the CODING->charbuf_used elements of CODING->charbuf by an
        ISO2022 variant, writing at CODING->destination + CODING->produced
        with CODING->destination + CODING->dst_bytes as the end of the
        destination.  Before returning at the end of the body, record
        CODING_RESULT_SUCCESS, save the beginning-of-line designation
        state in CODING_ISO_BOL (coding), add the number of produced
        chars to CODING->produced_char and set CODING->produced.  The
        value returned there is always 0.
        NOTE(review): ASSURE_DESTINATION is defined elsewhere;
        presumably it leaves the function or the loop when fewer than
        SAFE_ROOM bytes remain in the destination -- confirm at its
        definition.  */
4389
4390 static bool
4391 encode_coding_iso_2022 (struct coding_system *coding)
4392 {
4393   bool multibytep = coding->dst_multibyte;
4394   int *charbuf = coding->charbuf;
4395   int *charbuf_end = charbuf + coding->charbuf_used;
4396   unsigned char *dst = coding->destination + coding->produced;
4397   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4398   int safe_room = 16;   /* argument of every ASSURE_DESTINATION below */
       /* True when designation sequences must be produced before the
          next character: CODING has CODING_ISO_FLAG_DESIGNATE_AT_BOL and
          the state saved in CODING_ISO_BOL (coding) is set.  */
4399   bool bol_designation
4400     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4401        && CODING_ISO_BOL (coding));
4402   ptrdiff_t produced_chars = 0;
4403   Lisp_Object attrs, eol_type, charset_list;
4404   bool ascii_compatible;
4405   int c;
4406   int preferred_charset_id = -1;   /* set by a charset annotation */
4407
4408   CODING_GET_INFO (coding, attrs, charset_list);
4409   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4410   if (VECTORP (eol_type))
4411     eol_type = Qunix;
4412
4413   setup_iso_safe_charsets (attrs);
4414   /* Charset list may have been changed.  */
4415   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4416   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4417
       /* ASCII characters are emitted as is only when the coding system
          is marked ASCII compatible and uses neither designation nor
          locking shift.  */
4418   ascii_compatible
4419     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4420        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4421                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4422
4423   while (charbuf < charbuf_end)
4424     {
4425       ASSURE_DESTINATION (safe_room);
4426
4427       if (bol_designation)
4428         {
4429           /* We have to produce designation sequences if any now.  */
4430           unsigned char desig_buf[16];
4431           ptrdiff_t nbytes;
4432           ptrdiff_t offset;
4433
               /* The sequences are first built in DESIG_BUF.  If a
                  charset map was loaded meanwhile and
                  coding_change_destination reports a nonzero offset,
                  DST and DST_END are shifted by it before the copy.  */
4434           charset_map_loaded = 0;
4435           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4436                                               desig_buf);
4437           if (charset_map_loaded
4438               && (offset = coding_change_destination (coding)))
4439             {
4440               dst += offset;
4441               dst_end += offset;
4442             }
4443           memcpy (dst, desig_buf, nbytes);
4444           dst += nbytes;
4445           /* We are sure that designation sequences are all ASCII bytes.  */
4446           produced_chars += nbytes;
4447           bol_designation = 0;
4448           ASSURE_DESTINATION (safe_room);
4449         }
4450
4451       c = *charbuf++;
4452
4453       if (c < 0)
4454         {
4455           /* Handle an annotation.  *CHARBUF is now the element that
                  follows C; it selects the kind of annotation.  */
4456           switch (*charbuf)
4457             {
4458             case CODING_ANNOTATE_COMPOSITION_MASK:
4459               /* Not yet implemented.  */
4460               break;
4461             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Remember the annotated charset id, but only if it
                      is a member of CHARSET_LIST.  */
4462               preferred_charset_id = charbuf[2];
4463               if (preferred_charset_id >= 0
4464                   && NILP (Fmemq (make_number (preferred_charset_id),
4465                                   charset_list)))
4466                 preferred_charset_id = -1;
4467               break;
4468             default:
4469               emacs_abort ();
4470             }
               /* C itself is already consumed; skip -C - 1 more elements
                  (presumably -C is the length of the whole annotation).  */
4471           charbuf += -c - 1;
4472           continue;
4473         }
4474
4475       /* Now encode the character C.  */
4476       if (c < 0x20 || c == 0x7F)
4477         {
               /* A control character is emitted as is, after the state
                  changes below.  At an end of line (a newline, or a CR
                  when EOL_TYPE is Qmac): with RESET_AT_EOL, produce codes
                  that restore the initial state; with INIT_AT_BOL, reset
                  the recorded designations to the initial ones without
                  producing anything; and request designation at the next
                  line start if DESIGNATE_AT_BOL is set.  */
4478           if (c == '\n'
4479               || (c == '\r' && EQ (eol_type, Qmac)))
4480             {
4481               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4482                 ENCODE_RESET_PLANE_AND_REGISTER ();
4483               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4484                 {
4485                   int i;
4486
4487                   for (i = 0; i < 4; i++)
4488                     CODING_ISO_DESIGNATION (coding, i)
4489                       = CODING_ISO_INITIAL (coding, i);
4490                 }
4491               bol_designation = ((CODING_ISO_FLAGS (coding)
4492                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4493                                  != 0);
4494             }
4495           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4496             ENCODE_RESET_PLANE_AND_REGISTER ();
4497           EMIT_ONE_ASCII_BYTE (c);
4498         }
4499       else if (ASCII_CHAR_P (c))
4500         {
4501           if (ascii_compatible)
4502             EMIT_ONE_ASCII_BYTE (c);
4503           else
4504             {
4505               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4506               ENCODE_ISO_CHARACTER (charset, c);
4507             }
4508         }
4509       else if (CHAR_BYTE8_P (c))
4510         {
               /* A raw 8-bit byte is emitted without any shift or
                  designation.  */
4511           c = CHAR_TO_BYTE8 (c);
4512           EMIT_ONE_BYTE (c);
4513         }
4514       else
4515         {
4516           struct charset *charset;
4517
               /* Try the charset preferred by the last charset
                  annotation first; otherwise, or if it cannot encode C,
                  search CHARSET_LIST.  */
4518           if (preferred_charset_id >= 0)
4519             {
4520               bool result;
4521
4522               charset = CHARSET_FROM_ID (preferred_charset_id);
4523               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4524               if (! result)
4525                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4526                                      NULL, charset);
4527             }
4528           else
4529             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4530                                  NULL, charset);
               /* No charset was found for C: substitute
                  CODING_INHIBIT_CHARACTER_SUBSTITUTION as ASCII in safe
                  encoding mode, else the default character of CODING.
                  NOTE(review): after the second lookup CHARSET is used
                  without a null check -- confirm that the default
                  character always has a charset in CHARSET_LIST.  */
4531           if (!charset)
4532             {
4533               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4534                 {
4535                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4536                   charset = CHARSET_FROM_ID (charset_ascii);
4537                 }
4538               else
4539                 {
4540                   c = coding->default_char;
4541                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4542                                        charset_list, NULL, charset);
4543                 }
4544             }
4545           ENCODE_ISO_CHARACTER (charset, c);
4546         }
4547     }
4548
       /* At the end of the last block, restore the initial state if
          CODING resets at end of line.  */
4549   if (coding->mode & CODING_MODE_LAST_BLOCK
4550       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4551     {
4552       ASSURE_DESTINATION (safe_room);
4553       ENCODE_RESET_PLANE_AND_REGISTER ();
4554     }
4555   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4556   CODING_ISO_BOL (coding) = bol_designation;
4557   coding->produced_char += produced_chars;
4558   coding->produced = dst - coding->destination;
4559   return 0;
4560 }
4561
4562 \f
/*** 8. SJIS and BIG5 handlers ***/
4564
4565 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4566    quite widely.  So, for the moment, Emacs supports them in the bare
4567    C code.  But, in the future, they may be supported only by CCL.  */
4568
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
4575
4576    --- CODE RANGE of SJIS ---
4577    (character set)      (range)
4578    ASCII                0x00 .. 0x7F
4579    KATAKANA-JISX0201    0xA0 .. 0xDF
4580    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4581             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4582    -------------------------------
4583
4584 */
4585
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.
4589
4590    --- CODE RANGE of BIG5 ---
4591    (character set)      (range)
4592    ASCII                0x00 .. 0x7F
4593    Big5 (1st byte)      0xA1 .. 0xFE
4594         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4595    --------------------------
4596
4597   */
4598
4599 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4600    Return true if a text is encoded in SJIS.  */
4601
4602 static bool
4603 detect_coding_sjis (struct coding_system *coding,
4604                     struct coding_detection_info *detect_info)
4605 {
4606   const unsigned char *src = coding->source, *src_base;
4607   const unsigned char *src_end = coding->source + coding->src_bytes;
4608   bool multibytep = coding->src_multibyte;
4609   ptrdiff_t consumed_chars = 0;
4610   int found = 0;
4611   int c;
4612   Lisp_Object attrs, charset_list;
4613   int max_first_byte_of_2_byte_code;
4614
4615   CODING_GET_INFO (coding, attrs, charset_list);
4616   max_first_byte_of_2_byte_code
4617     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4618
4619   detect_info->checked |= CATEGORY_MASK_SJIS;
4620   /* A coding system of this category is always ASCII compatible.  */
4621   src += coding->head_ascii;
4622
4623   while (1)
4624     {
4625       src_base = src;
4626       ONE_MORE_BYTE (c);
4627       if (c < 0x80)
4628         continue;
4629       if ((c >= 0x81 && c <= 0x9F)
4630           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4631         {
4632           ONE_MORE_BYTE (c);
4633           if (c < 0x40 || c == 0x7F || c > 0xFC)
4634             break;
4635           found = CATEGORY_MASK_SJIS;
4636         }
4637       else if (c >= 0xA0 && c < 0xE0)
4638         found = CATEGORY_MASK_SJIS;
4639       else
4640         break;
4641     }
4642   detect_info->rejected |= CATEGORY_MASK_SJIS;
4643   return 0;
4644
4645  no_more_source:
4646   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4647     {
4648       detect_info->rejected |= CATEGORY_MASK_SJIS;
4649       return 0;
4650     }
4651   detect_info->found |= found;
4652   return 1;
4653 }
4654
4655 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4656    Return true if a text is encoded in BIG5.  */
4657
4658 static bool
4659 detect_coding_big5 (struct coding_system *coding,
4660                     struct coding_detection_info *detect_info)
4661 {
4662   const unsigned char *src = coding->source, *src_base;
4663   const unsigned char *src_end = coding->source + coding->src_bytes;
4664   bool multibytep = coding->src_multibyte;
4665   ptrdiff_t consumed_chars = 0;
4666   int found = 0;
4667   int c;
4668
4669   detect_info->checked |= CATEGORY_MASK_BIG5;
4670   /* A coding system of this category is always ASCII compatible.  */
4671   src += coding->head_ascii;
4672
4673   while (1)
4674     {
4675       src_base = src;
4676       ONE_MORE_BYTE (c);
4677       if (c < 0x80)
4678         continue;
4679       if (c >= 0xA1)
4680         {
4681           ONE_MORE_BYTE (c);
4682           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4683             return 0;
4684           found = CATEGORY_MASK_BIG5;
4685         }
4686       else
4687         break;
4688     }
4689   detect_info->rejected |= CATEGORY_MASK_BIG5;
4690   return 0;
4691
4692  no_more_source:
4693   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4694     {
4695       detect_info->rejected |= CATEGORY_MASK_BIG5;
4696       return 0;
4697     }
4698   detect_info->found |= found;
4699   return 1;
4700 }
4701
4702 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4703
4704 static void
4705 decode_coding_sjis (struct coding_system *coding)
4706 {
4707   const unsigned char *src = coding->source + coding->consumed;
4708   const unsigned char *src_end = coding->source + coding->src_bytes;
4709   const unsigned char *src_base;
4710   int *charbuf = coding->charbuf + coding->charbuf_used;
4711   /* We may produce one charset annotation in one loop and one more at
4712      the end.  */
4713   int *charbuf_end
4714     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4715   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4716   bool multibytep = coding->src_multibyte;
4717   struct charset *charset_roman, *charset_kanji, *charset_kana;
4718   struct charset *charset_kanji2;
4719   Lisp_Object attrs, charset_list, val;
4720   ptrdiff_t char_offset = coding->produced_char;
4721   ptrdiff_t last_offset = char_offset;
4722   int last_id = charset_ascii;
4723   bool eol_dos
4724     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4725   int byte_after_cr = -1;
4726
4727   CODING_GET_INFO (coding, attrs, charset_list);
4728
4729   val = charset_list;
4730   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4731   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4732   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4733   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4734
4735   while (1)
4736     {
4737       int c, c1;
4738       struct charset *charset;
4739
4740       src_base = src;
4741       consumed_chars_base = consumed_chars;
4742
4743       if (charbuf >= charbuf_end)
4744         {
4745           if (byte_after_cr >= 0)
4746             src_base--;
4747           break;
4748         }
4749
4750       if (byte_after_cr >= 0)
4751         c = byte_after_cr, byte_after_cr = -1;
4752       else
4753         ONE_MORE_BYTE (c);
4754       if (c < 0)
4755         goto invalid_code;
4756       if (c < 0x80)
4757         {
4758           if (eol_dos && c == '\r')
4759             ONE_MORE_BYTE (byte_after_cr);
4760           charset = charset_roman;
4761         }
4762       else if (c == 0x80 || c == 0xA0)
4763         goto invalid_code;
4764       else if (c >= 0xA1 && c <= 0xDF)
4765         {
4766           /* SJIS -> JISX0201-Kana */
4767           c &= 0x7F;
4768           charset = charset_kana;
4769         }
4770       else if (c <= 0xEF)
4771         {
4772           /* SJIS -> JISX0208 */
4773           ONE_MORE_BYTE (c1);
4774           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4775             goto invalid_code;
4776           c = (c << 8) | c1;
4777           SJIS_TO_JIS (c);
4778           charset = charset_kanji;
4779         }
4780       else if (c <= 0xFC && charset_kanji2)
4781         {
4782           /* SJIS -> JISX0213-2 */
4783           ONE_MORE_BYTE (c1);
4784           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4785             goto invalid_code;
4786           c = (c << 8) | c1;
4787           SJIS_TO_JIS2 (c);
4788           charset = charset_kanji2;
4789         }
4790       else
4791         goto invalid_code;
4792       if (charset->id != charset_ascii
4793           && last_id != charset->id)
4794         {
4795           if (last_id != charset_ascii)
4796             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4797           last_id = charset->id;
4798           last_offset = char_offset;
4799         }
4800       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4801       *charbuf++ = c;
4802       char_offset++;
4803       continue;
4804
4805     invalid_code:
4806       src = src_base;
4807       consumed_chars = consumed_chars_base;
4808       ONE_MORE_BYTE (c);
4809       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4810       char_offset++;
4811     }
4812
4813  no_more_source:
4814   if (last_id != charset_ascii)
4815     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4816   coding->consumed_char += consumed_chars_base;
4817   coding->consumed = src_base - coding->source;
4818   coding->charbuf_used = charbuf - coding->charbuf;
4819 }
4820
4821 static void
4822 decode_coding_big5 (struct coding_system *coding)
4823 {
4824   const unsigned char *src = coding->source + coding->consumed;
4825   const unsigned char *src_end = coding->source + coding->src_bytes;
4826   const unsigned char *src_base;
4827   int *charbuf = coding->charbuf + coding->charbuf_used;
4828   /* We may produce one charset annotation in one loop and one more at
4829      the end.  */
4830   int *charbuf_end
4831     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4832   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4833   bool multibytep = coding->src_multibyte;
4834   struct charset *charset_roman, *charset_big5;
4835   Lisp_Object attrs, charset_list, val;
4836   ptrdiff_t char_offset = coding->produced_char;
4837   ptrdiff_t last_offset = char_offset;
4838   int last_id = charset_ascii;
4839   bool eol_dos
4840     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4841   int byte_after_cr = -1;
4842
4843   CODING_GET_INFO (coding, attrs, charset_list);
4844   val = charset_list;
4845   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4846   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4847
4848   while (1)
4849     {
4850       int c, c1;
4851       struct charset *charset;
4852
4853       src_base = src;
4854       consumed_chars_base = consumed_chars;
4855
4856       if (charbuf >= charbuf_end)
4857         {
4858           if (byte_after_cr >= 0)
4859             src_base--;
4860           break;
4861         }
4862
4863       if (byte_after_cr >= 0)
4864         c = byte_after_cr, byte_after_cr = -1;
4865       else
4866         ONE_MORE_BYTE (c);
4867
4868       if (c < 0)
4869         goto invalid_code;
4870       if (c < 0x80)
4871         {
4872           if (eol_dos && c == '\r')
4873             ONE_MORE_BYTE (byte_after_cr);
4874           charset = charset_roman;
4875         }
4876       else
4877         {
4878           /* BIG5 -> Big5 */
4879           if (c < 0xA1 || c > 0xFE)
4880             goto invalid_code;
4881           ONE_MORE_BYTE (c1);
4882           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4883             goto invalid_code;
4884           c = c << 8 | c1;
4885           charset = charset_big5;
4886         }
4887       if (charset->id != charset_ascii
4888           && last_id != charset->id)
4889         {
4890           if (last_id != charset_ascii)
4891             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4892           last_id = charset->id;
4893           last_offset = char_offset;
4894         }
4895       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4896       *charbuf++ = c;
4897       char_offset++;
4898       continue;
4899
4900     invalid_code:
4901       src = src_base;
4902       consumed_chars = consumed_chars_base;
4903       ONE_MORE_BYTE (c);
4904       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4905       char_offset++;
4906     }
4907
4908  no_more_source:
4909   if (last_id != charset_ascii)
4910     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4911   coding->consumed_char += consumed_chars_base;
4912   coding->consumed = src_base - coding->source;
4913   coding->charbuf_used = charbuf - coding->charbuf;
4914 }
4915
4916 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4917    This function can encode charsets `ascii', `katakana-jisx0201',
4918    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4919    are sure that all these charsets are registered as official charset
4920    (i.e. do not have extended leading-codes).  Characters of other
4921    charsets are produced without any encoding.  */
4922
4923 static bool
4924 encode_coding_sjis (struct coding_system *coding)
4925 {
4926   bool multibytep = coding->dst_multibyte;
4927   int *charbuf = coding->charbuf;
4928   int *charbuf_end = charbuf + coding->charbuf_used;
4929   unsigned char *dst = coding->destination + coding->produced;
4930   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4931   int safe_room = 4;
4932   ptrdiff_t produced_chars = 0;
4933   Lisp_Object attrs, charset_list, val;
4934   bool ascii_compatible;
4935   struct charset *charset_kanji, *charset_kana;
4936   struct charset *charset_kanji2;
4937   int c;
4938
4939   CODING_GET_INFO (coding, attrs, charset_list);
4940   val = XCDR (charset_list);
4941   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4942   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4943   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4944
4945   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4946
4947   while (charbuf < charbuf_end)
4948     {
4949       ASSURE_DESTINATION (safe_room);
4950       c = *charbuf++;
4951       /* Now encode the character C.  */
4952       if (ASCII_CHAR_P (c) && ascii_compatible)
4953         EMIT_ONE_ASCII_BYTE (c);
4954       else if (CHAR_BYTE8_P (c))
4955         {
4956           c = CHAR_TO_BYTE8 (c);
4957           EMIT_ONE_BYTE (c);
4958         }
4959       else
4960         {
4961           unsigned code;
4962           struct charset *charset;
4963           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4964                                &code, charset);
4965
4966           if (!charset)
4967             {
4968               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4969                 {
4970                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4971                   charset = CHARSET_FROM_ID (charset_ascii);
4972                 }
4973               else
4974                 {
4975                   c = coding->default_char;
4976                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4977                                        charset_list, &code, charset);
4978                 }
4979             }
4980           if (code == CHARSET_INVALID_CODE (charset))
4981             emacs_abort ();
4982           if (charset == charset_kanji)
4983             {
4984               int c1, c2;
4985               JIS_TO_SJIS (code);
4986               c1 = code >> 8, c2 = code & 0xFF;
4987               EMIT_TWO_BYTES (c1, c2);
4988             }
4989           else if (charset == charset_kana)
4990             EMIT_ONE_BYTE (code | 0x80);
4991           else if (charset_kanji2 && charset == charset_kanji2)
4992             {
4993               int c1, c2;
4994
4995               c1 = code >> 8;
4996               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4997                   || c1 == 0x28
4998                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4999                 {
5000                   JIS_TO_SJIS2 (code);
5001                   c1 = code >> 8, c2 = code & 0xFF;
5002                   EMIT_TWO_BYTES (c1, c2);
5003                 }
5004               else
5005                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5006             }
5007           else
5008             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5009         }
5010     }
5011   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5012   coding->produced_char += produced_chars;
5013   coding->produced = dst - coding->destination;
5014   return 0;
5015 }
5016
5017 static bool
5018 encode_coding_big5 (struct coding_system *coding)
5019 {
5020   bool multibytep = coding->dst_multibyte;
5021   int *charbuf = coding->charbuf;
5022   int *charbuf_end = charbuf + coding->charbuf_used;
5023   unsigned char *dst = coding->destination + coding->produced;
5024   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5025   int safe_room = 4;
5026   ptrdiff_t produced_chars = 0;
5027   Lisp_Object attrs, charset_list, val;
5028   bool ascii_compatible;
5029   struct charset *charset_big5;
5030   int c;
5031
5032   CODING_GET_INFO (coding, attrs, charset_list);
5033   val = XCDR (charset_list);
5034   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5035   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5036
5037   while (charbuf < charbuf_end)
5038     {
5039       ASSURE_DESTINATION (safe_room);
5040       c = *charbuf++;
5041       /* Now encode the character C.  */
5042       if (ASCII_CHAR_P (c) && ascii_compatible)
5043         EMIT_ONE_ASCII_BYTE (c);
5044       else if (CHAR_BYTE8_P (c))
5045         {
5046           c = CHAR_TO_BYTE8 (c);
5047           EMIT_ONE_BYTE (c);
5048         }
5049       else
5050         {
5051           unsigned code;
5052           struct charset *charset;
5053           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5054                                &code, charset);
5055
5056           if (! charset)
5057             {
5058               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5059                 {
5060                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5061                   charset = CHARSET_FROM_ID (charset_ascii);
5062                 }
5063               else
5064                 {
5065                   c = coding->default_char;
5066                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5067                                        charset_list, &code, charset);
5068                 }
5069             }
5070           if (code == CHARSET_INVALID_CODE (charset))
5071             emacs_abort ();
5072           if (charset == charset_big5)
5073             {
5074               int c1, c2;
5075
5076               c1 = code >> 8, c2 = code & 0xFF;
5077               EMIT_TWO_BYTES (c1, c2);
5078             }
5079           else
5080             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5081         }
5082     }
5083   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5084   coding->produced_char += produced_chars;
5085   coding->produced = dst - coding->destination;
5086   return 0;
5087 }
5088
5089 \f
/*** 9. CCL handlers ***/
5091
5092 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5093    Return true if a text is encoded in a coding system of which
5094    encoder/decoder are written in CCL program.  */
5095
5096 static bool
5097 detect_coding_ccl (struct coding_system *coding,
5098                    struct coding_detection_info *detect_info)
5099 {
5100   const unsigned char *src = coding->source, *src_base;
5101   const unsigned char *src_end = coding->source + coding->src_bytes;
5102   bool multibytep = coding->src_multibyte;
5103   ptrdiff_t consumed_chars = 0;
5104   int found = 0;
5105   unsigned char *valids;
5106   ptrdiff_t head_ascii = coding->head_ascii;
5107   Lisp_Object attrs;
5108
5109   detect_info->checked |= CATEGORY_MASK_CCL;
5110
5111   coding = &coding_categories[coding_category_ccl];
5112   valids = CODING_CCL_VALIDS (coding);
5113   attrs = CODING_ID_ATTRS (coding->id);
5114   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5115     src += head_ascii;
5116
5117   while (1)
5118     {
5119       int c;
5120
5121       src_base = src;
5122       ONE_MORE_BYTE (c);
5123       if (c < 0 || ! valids[c])
5124         break;
5125       if ((valids[c] > 1))
5126         found = CATEGORY_MASK_CCL;
5127     }
5128   detect_info->rejected |= CATEGORY_MASK_CCL;
5129   return 0;
5130
5131  no_more_source:
5132   detect_info->found |= found;
5133   return 1;
5134 }
5135
5136 static void
5137 decode_coding_ccl (struct coding_system *coding)
5138 {
5139   const unsigned char *src = coding->source + coding->consumed;
5140   const unsigned char *src_end = coding->source + coding->src_bytes;
5141   int *charbuf = coding->charbuf + coding->charbuf_used;
5142   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5143   ptrdiff_t consumed_chars = 0;
5144   bool multibytep = coding->src_multibyte;
5145   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5146   int source_charbuf[1024];
5147   int source_byteidx[1025];
5148   Lisp_Object attrs, charset_list;
5149
5150   CODING_GET_INFO (coding, attrs, charset_list);
5151
5152   while (1)
5153     {
5154       const unsigned char *p = src;
5155       ptrdiff_t offset;
5156       int i = 0;
5157
5158       if (multibytep)
5159         {
5160           while (i < 1024 && p < src_end)
5161             {
5162               source_byteidx[i] = p - src;
5163               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5164             }
5165           source_byteidx[i] = p - src;
5166         }
5167       else
5168         while (i < 1024 && p < src_end)
5169           source_charbuf[i++] = *p++;
5170
5171       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5172         ccl->last_block = true;
5173       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5174       charset_map_loaded = 0;
5175       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5176                   charset_list);
5177       if (charset_map_loaded
5178           && (offset = coding_change_source (coding)))
5179         {
5180           p += offset;
5181           src += offset;
5182           src_end += offset;
5183         }
5184       charbuf += ccl->produced;
5185       if (multibytep)
5186         src += source_byteidx[ccl->consumed];
5187       else
5188         src += ccl->consumed;
5189       consumed_chars += ccl->consumed;
5190       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5191         break;
5192     }
5193
5194   switch (ccl->status)
5195     {
5196     case CCL_STAT_SUSPEND_BY_SRC:
5197       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5198       break;
5199     case CCL_STAT_SUSPEND_BY_DST:
5200       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5201       break;
5202     case CCL_STAT_QUIT:
5203     case CCL_STAT_INVALID_CMD:
5204       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5205       break;
5206     default:
5207       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5208       break;
5209     }
5210   coding->consumed_char += consumed_chars;
5211   coding->consumed = src - coding->source;
5212   coding->charbuf_used = charbuf - coding->charbuf;
5213 }
5214
5215 static bool
5216 encode_coding_ccl (struct coding_system *coding)
5217 {
5218   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5219   bool multibytep = coding->dst_multibyte;
5220   int *charbuf = coding->charbuf;
5221   int *charbuf_end = charbuf + coding->charbuf_used;
5222   unsigned char *dst = coding->destination + coding->produced;
5223   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5224   int destination_charbuf[1024];
5225   ptrdiff_t produced_chars = 0;
5226   int i;
5227   Lisp_Object attrs, charset_list;
5228
5229   CODING_GET_INFO (coding, attrs, charset_list);
5230   if (coding->consumed_char == coding->src_chars
5231       && coding->mode & CODING_MODE_LAST_BLOCK)
5232     ccl->last_block = true;
5233
5234   do
5235     {
5236       ptrdiff_t offset;
5237
5238       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5239       charset_map_loaded = 0;
5240       ccl_driver (ccl, charbuf, destination_charbuf,
5241                   charbuf_end - charbuf, 1024, charset_list);
5242       if (charset_map_loaded
5243           && (offset = coding_change_destination (coding)))
5244         dst += offset;
5245       if (multibytep)
5246         {
5247           ASSURE_DESTINATION (ccl->produced * 2);
5248           for (i = 0; i < ccl->produced; i++)
5249             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5250         }
5251       else
5252         {
5253           ASSURE_DESTINATION (ccl->produced);
5254           for (i = 0; i < ccl->produced; i++)
5255             *dst++ = destination_charbuf[i] & 0xFF;
5256           produced_chars += ccl->produced;
5257         }
5258       charbuf += ccl->consumed;
5259       if (ccl->status == CCL_STAT_QUIT
5260           || ccl->status == CCL_STAT_INVALID_CMD)
5261         break;
5262     }
5263   while (charbuf < charbuf_end);
5264
5265   switch (ccl->status)
5266     {
5267     case CCL_STAT_SUSPEND_BY_SRC:
5268       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5269       break;
5270     case CCL_STAT_SUSPEND_BY_DST:
5271       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5272       break;
5273     case CCL_STAT_QUIT:
5274     case CCL_STAT_INVALID_CMD:
5275       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5276       break;
5277     default:
5278       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5279       break;
5280     }
5281
5282   coding->produced_char += produced_chars;
5283   coding->produced = dst - coding->destination;
5284   return 0;
5285 }
5286
5287 \f
5288 /*** 10, 11. no-conversion handlers ***/
5289
5290 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5291
5292 static void
5293 decode_coding_raw_text (struct coding_system *coding)
5294 {
5295   bool eol_dos
5296     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5297
5298   coding->chars_at_source = 1;
5299   coding->consumed_char = coding->src_chars;
5300   coding->consumed = coding->src_bytes;
5301   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5302     {
5303       coding->consumed_char--;
5304       coding->consumed--;
5305       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5306     }
5307   else
5308     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5309 }
5310
5311 static bool
5312 encode_coding_raw_text (struct coding_system *coding)
5313 {
5314   bool multibytep = coding->dst_multibyte;
5315   int *charbuf = coding->charbuf;
5316   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5317   unsigned char *dst = coding->destination + coding->produced;
5318   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5319   ptrdiff_t produced_chars = 0;
5320   int c;
5321
5322   if (multibytep)
5323     {
5324       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5325
5326       if (coding->src_multibyte)
5327         while (charbuf < charbuf_end)
5328           {
5329             ASSURE_DESTINATION (safe_room);
5330             c = *charbuf++;
5331             if (ASCII_CHAR_P (c))
5332               EMIT_ONE_ASCII_BYTE (c);
5333             else if (CHAR_BYTE8_P (c))
5334               {
5335                 c = CHAR_TO_BYTE8 (c);
5336                 EMIT_ONE_BYTE (c);
5337               }
5338             else
5339               {
5340                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5341
5342                 CHAR_STRING_ADVANCE (c, p1);
5343                 do
5344                   {
5345                     EMIT_ONE_BYTE (*p0);
5346                     p0++;
5347                   }
5348                 while (p0 < p1);
5349               }
5350           }
5351       else
5352         while (charbuf < charbuf_end)
5353           {
5354             ASSURE_DESTINATION (safe_room);
5355             c = *charbuf++;
5356             EMIT_ONE_BYTE (c);
5357           }
5358     }
5359   else
5360     {
5361       if (coding->src_multibyte)
5362         {
5363           int safe_room = MAX_MULTIBYTE_LENGTH;
5364
5365           while (charbuf < charbuf_end)
5366             {
5367               ASSURE_DESTINATION (safe_room);
5368               c = *charbuf++;
5369               if (ASCII_CHAR_P (c))
5370                 *dst++ = c;
5371               else if (CHAR_BYTE8_P (c))
5372                 *dst++ = CHAR_TO_BYTE8 (c);
5373               else
5374                 CHAR_STRING_ADVANCE (c, dst);
5375             }
5376         }
5377       else
5378         {
5379           ASSURE_DESTINATION (charbuf_end - charbuf);
5380           while (charbuf < charbuf_end && dst < dst_end)
5381             *dst++ = *charbuf++;
5382         }
5383       produced_chars = dst - (coding->destination + coding->produced);
5384     }
5385   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5386   coding->produced_char += produced_chars;
5387   coding->produced = dst - coding->destination;
5388   return 0;
5389 }
5390
5391 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5392    Return true if a text is encoded in a charset-based coding system.  */
5393
5394 static bool
5395 detect_coding_charset (struct coding_system *coding,
5396                        struct coding_detection_info *detect_info)
5397 {
5398   const unsigned char *src = coding->source, *src_base;
5399   const unsigned char *src_end = coding->source + coding->src_bytes;
5400   bool multibytep = coding->src_multibyte;
5401   ptrdiff_t consumed_chars = 0;
5402   Lisp_Object attrs, valids, name;
5403   int found = 0;
5404   ptrdiff_t head_ascii = coding->head_ascii;
5405   bool check_latin_extra = 0;
5406
5407   detect_info->checked |= CATEGORY_MASK_CHARSET;
5408
5409   coding = &coding_categories[coding_category_charset];
5410   attrs = CODING_ID_ATTRS (coding->id);
5411   valids = AREF (attrs, coding_attr_charset_valids);
5412   name = CODING_ID_NAME (coding->id);
5413   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5414                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5415       || strncmp (SSDATA (SYMBOL_NAME (name)),
5416                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5417     check_latin_extra = 1;
5418
5419   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5420     src += head_ascii;
5421
5422   while (1)
5423     {
5424       int c;
5425       Lisp_Object val;
5426       struct charset *charset;
5427       int dim, idx;
5428
5429       src_base = src;
5430       ONE_MORE_BYTE (c);
5431       if (c < 0)
5432         continue;
5433       val = AREF (valids, c);
5434       if (NILP (val))
5435         break;
5436       if (c >= 0x80)
5437         {
5438           if (c < 0xA0
5439               && check_latin_extra
5440               && (!VECTORP (Vlatin_extra_code_table)
5441                   || NILP (AREF (Vlatin_extra_code_table, c))))
5442             break;
5443           found = CATEGORY_MASK_CHARSET;
5444         }
5445       if (INTEGERP (val))
5446         {
5447           charset = CHARSET_FROM_ID (XFASTINT (val));
5448           dim = CHARSET_DIMENSION (charset);
5449           for (idx = 1; idx < dim; idx++)
5450             {
5451               if (src == src_end)
5452                 goto too_short;
5453               ONE_MORE_BYTE (c);
5454               if (c < charset->code_space[(dim - 1 - idx) * 4]
5455                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5456                 break;
5457             }
5458           if (idx < dim)
5459             break;
5460         }
5461       else
5462         {
5463           idx = 1;
5464           for (; CONSP (val); val = XCDR (val))
5465             {
5466               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5467               dim = CHARSET_DIMENSION (charset);
5468               while (idx < dim)
5469                 {
5470                   if (src == src_end)
5471                     goto too_short;
5472                   ONE_MORE_BYTE (c);
5473                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5474                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5475                     break;
5476                   idx++;
5477                 }
5478               if (idx == dim)
5479                 {
5480                   val = Qnil;
5481                   break;
5482                 }
5483             }
5484           if (CONSP (val))
5485             break;
5486         }
5487     }
5488  too_short:
5489   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5490   return 0;
5491
5492  no_more_source:
5493   detect_info->found |= found;
5494   return 1;
5495 }
5496
5497 static void
5498 decode_coding_charset (struct coding_system *coding)
5499 {
       /* Decode the bytes of CODING->source, beginning at offset
          CODING->consumed, and append the resulting characters to
          CODING->charbuf.  Each lead byte is looked up in the
          coding_attr_charset_valids vector of CODING's attributes to
          find the charset (or the list of candidate charsets) used to
          decode it.  Before returning, CODING->consumed_char,
          CODING->consumed and CODING->charbuf_used are updated.

          NOTE(review): `no_more_source' is not the target of any
          visible goto, and src_end, multibytep and consumed_chars are
          never touched directly here; presumably ONE_MORE_BYTE uses
          them and jumps to that label when the source runs out --
          confirm against the macro definition.  */
5500   const unsigned char *src = coding->source + coding->consumed;
5501   const unsigned char *src_end = coding->source + coding->src_bytes;
5502   const unsigned char *src_base;
5503   int *charbuf = coding->charbuf + coding->charbuf_used;
5504   /* We may produce one charset annotation in one loop and one more at
5505      the end.  */
5506   int *charbuf_end
5507     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5508   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5509   bool multibytep = coding->src_multibyte;
5510   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5511   Lisp_Object valids;
       /* CHAR_OFFSET counts the characters produced so far; LAST_OFFSET
          and LAST_ID describe the start and the charset of the current
          run of characters, for which one annotation is emitted.  */
5512   ptrdiff_t char_offset = coding->produced_char;
5513   ptrdiff_t last_offset = char_offset;
5514   int last_id = charset_ascii;
5515   bool eol_dos
5516     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR (only when EOL_DOS), or -1.  */
5517   int byte_after_cr = -1;
5518
5519   valids = AREF (attrs, coding_attr_charset_valids);
5520
5521   while (1)
5522     {
5523       int c;
5524       Lisp_Object val;
5525       struct charset *charset;
5526       int dim;
5527       int len = 1;
5528       unsigned code;
5529
           /* Remember where this character starts so that the position
              can be restored at `invalid_code' and reported at
              `no_more_source'.  */
5530       src_base = src;
5531       consumed_chars_base = consumed_chars;
5532
5533       if (charbuf >= charbuf_end)
5534         {
               /* The look-ahead byte has not been decoded yet; step back
                  so that it is not reported as consumed.  */
5535           if (byte_after_cr >= 0)
5536             src_base--;
5537           break;
5538         }
5539
5540       if (byte_after_cr >= 0)
5541         {
               /* Use the byte that was read ahead in the previous
                  iteration.  */
5542           c = byte_after_cr;
5543           byte_after_cr = -1;
5544         }
5545       else
5546         {
5547           ONE_MORE_BYTE (c);
               /* With DOS eol conversion, read one byte ahead after each
                  CR and keep it for the next iteration.  */
5548           if (eol_dos && c == '\r')
5549             ONE_MORE_BYTE (byte_after_cr);
5550         }
5551       if (c < 0)
5552         goto invalid_code;
5553       code = c;
5554
           /* VAL is a charset ID, a list of charset IDs, or anything
              else when C cannot start a character.  */
5555       val = AREF (valids, c);
5556       if (! INTEGERP (val) && ! CONSP (val))
5557         goto invalid_code;
5558       if (INTEGERP (val))
5559         {
5560           charset = CHARSET_FROM_ID (XFASTINT (val));
5561           dim = CHARSET_DIMENSION (charset);
               /* Collect the remaining bytes of a DIM-byte code, most
                  significant byte first.  */
5562           while (len < dim)
5563             {
5564               ONE_MORE_BYTE (c);
5565               code = (code << 8) | c;
5566               len++;
5567             }
5568           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5569                               charset, code, c);
5570         }
5571       else
5572         {
5573           /* VAL is a list of charset IDs.  It is assured that the
5574              list is sorted by charset dimensions (smaller one
5575              comes first).  */
5576           while (CONSP (val))
5577             {
5578               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5579               dim = CHARSET_DIMENSION (charset);
5580               while (len < dim)
5581                 {
5582                   ONE_MORE_BYTE (c);
5583                   code = (code << 8) | c;
5584                   len++;
5585                 }
5586               CODING_DECODE_CHAR (coding, src, src_base,
5587                                   src_end, charset, code, c);
                   /* A non-negative C ends the search; otherwise try the
                      next candidate charset, extending CODE if that
                      charset has a larger dimension.  */
5588               if (c >= 0)
5589                 break;
5590               val = XCDR (val);
5591             }
5592         }
5593       if (c < 0)
5594         goto invalid_code;
           /* When a non-ASCII charset different from the previous one
              shows up, record the annotation for the run that just
              ended (unless that run was ASCII) and start a new run.  */
5595       if (charset->id != charset_ascii
5596           && last_id != charset->id)
5597         {
5598           if (last_id != charset_ascii)
5599             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5600           last_id = charset->id;
5601           last_offset = char_offset;
5602         }
5603
5604       *charbuf++ = c;
5605       char_offset++;
5606       continue;
5607
5608     invalid_code:
           /* Rewind to SRC_BASE and pass a single unit through: a
              negative C is stored negated, an ASCII byte as is, and any
              other byte via BYTE8_TO_CHAR.
              NOTE(review): when C was taken from byte_after_cr, SRC_BASE
              already points past that byte, so the byte re-read here
              looks like the following one -- TODO confirm whether that
              is intended.  */
5609       src = src_base;
5610       consumed_chars = consumed_chars_base;
5611       ONE_MORE_BYTE (c);
5612       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5613       char_offset++;
5614     }
5615
5616  no_more_source:
       /* Record the annotation for the last non-ASCII run, then report
          progress up to the start of the character that was being
          processed (SRC_BASE / CONSUMED_CHARS_BASE).  */
5617   if (last_id != charset_ascii)
5618     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5619   coding->consumed_char += consumed_chars_base;
5620   coding->consumed = src_base - coding->source;
5621   coding->charbuf_used = charbuf - coding->charbuf;
5622 }
5623
5624 static bool
5625 encode_coding_charset (struct coding_system *coding)
5626 {
5627   bool multibytep = coding->dst_multibyte;
5628   int *charbuf = coding->charbuf;
5629   int *charbuf_end = charbuf + coding->charbuf_used;
5630   unsigned char *dst = coding->destination + coding->produced;
5631   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5632   int safe_room = MAX_MULTIBYTE_LENGTH;
5633   ptrdiff_t produced_chars = 0;
5634   Lisp_Object attrs, charset_list;
5635   bool ascii_compatible;
5636   int c;
5637
5638   CODING_GET_INFO (coding, attrs, charset_list);
5639   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5640
5641   while (charbuf < charbuf_end)
5642     {
5643       struct charset *charset;
5644       unsigned code;
5645
5646       ASSURE_DESTINATION (safe_room);
5647       c = *charbuf++;
5648       if (ascii_compatible && ASCII_CHAR_P (c))
5649         EMIT_ONE_ASCII_BYTE (c);
5650       else if (CHAR_BYTE8_P (c))
5651         {
5652           c = CHAR_TO_BYTE8 (c);
5653           EMIT_ONE_BYTE (c);
5654         }
5655       else
5656         {
5657           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5658                                &code, charset);
5659
5660           if (charset)
5661             {
5662               if (CHARSET_DIMENSION (charset) == 1)
5663                 EMIT_ONE_BYTE (code);
5664               else if (CHARSET_DIMENSION (charset) == 2)
5665                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5666               else if (CHARSET_DIMENSION (charset) == 3)
5667                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5668               else
5669                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5670                                  (code >> 8) & 0xFF, code & 0xFF);
5671             }
5672           else
5673             {
5674               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5675                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5676               else
5677                 c = coding->default_char;
5678               EMIT_ONE_BYTE (c);
5679             }
5680         }
5681     }
5682
5683   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5684   coding->produced_char += produced_chars;
5685   coding->produced = dst - coding->destination;
5686   return 0;
5687 }
5688
5689 \f
5690 /*** 10. C library functions ***/
5691
5692 /* Setup coding context CODING from information about CODING_SYSTEM.
5693    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5694    CODING_SYSTEM is invalid, signal an error.  */
5695
5696 void
5697 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5698 {
5699   Lisp_Object attrs;
5700   Lisp_Object eol_type;
5701   Lisp_Object coding_type;
5702   Lisp_Object val;
5703
5704   if (NILP (coding_system))
5705     coding_system = Qundecided;
5706
5707   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5708
5709   attrs = CODING_ID_ATTRS (coding->id);
5710   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5711
5712   coding->mode = 0;
5713   if (VECTORP (eol_type))
5714     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5715                             | CODING_REQUIRE_DETECTION_MASK);
5716   else if (! EQ (eol_type, Qunix))
5717     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5718                             | CODING_REQUIRE_ENCODING_MASK);
5719   else
5720     coding->common_flags = 0;
5721   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5722     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5723   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5724     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5725   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5726     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5727
5728   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5729   coding->max_charset_id = SCHARS (val) - 1;
5730   coding->safe_charsets = SDATA (val);
5731   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5732   coding->carryover_bytes = 0;
5733   coding->raw_destination = 0;
5734
5735   coding_type = CODING_ATTR_TYPE (attrs);
5736   if (EQ (coding_type, Qundecided))
5737     {
5738       coding->detector = NULL;
5739       coding->decoder = decode_coding_raw_text;
5740       coding->encoder = encode_coding_raw_text;
5741       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5742       coding->spec.undecided.inhibit_nbd
5743         = (encode_inhibit_flag
5744            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5745       coding->spec.undecided.inhibit_ied
5746         = (encode_inhibit_flag
5747            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5748       coding->spec.undecided.prefer_utf_8
5749         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5750     }
5751   else if (EQ (coding_type, Qiso_2022))
5752     {
5753       int i;
5754       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5755
5756       /* Invoke graphic register 0 to plane 0.  */
5757       CODING_ISO_INVOCATION (coding, 0) = 0;
5758       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5759       CODING_ISO_INVOCATION (coding, 1)
5760         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5761       /* Setup the initial status of designation.  */
5762       for (i = 0; i < 4; i++)
5763         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5764       /* Not single shifting initially.  */
5765       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5766       /* Beginning of buffer should also be regarded as bol. */
5767       CODING_ISO_BOL (coding) = 1;
5768       coding->detector = detect_coding_iso_2022;
5769       coding->decoder = decode_coding_iso_2022;
5770       coding->encoder = encode_coding_iso_2022;
5771       if (flags & CODING_ISO_FLAG_SAFE)
5772         coding->mode |= CODING_MODE_SAFE_ENCODING;
5773       coding->common_flags
5774         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5775             | CODING_REQUIRE_FLUSHING_MASK);
5776       if (flags & CODING_ISO_FLAG_COMPOSITION)
5777         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5778       if (flags & CODING_ISO_FLAG_DESIGNATION)
5779         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5780       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5781         {
5782           setup_iso_safe_charsets (attrs);
5783           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5784           coding->max_charset_id = SCHARS (val) - 1;
5785           coding->safe_charsets = SDATA (val);
5786         }
5787       CODING_ISO_FLAGS (coding) = flags;
5788       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5789       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5790       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5791       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5792     }
5793   else if (EQ (coding_type, Qcharset))
5794     {
5795       coding->detector = detect_coding_charset;
5796       coding->decoder = decode_coding_charset;
5797       coding->encoder = encode_coding_charset;
5798       coding->common_flags
5799         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5800     }
5801   else if (EQ (coding_type, Qutf_8))
5802     {
5803       val = AREF (attrs, coding_attr_utf_bom);
5804       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5805                                    : EQ (val, Qt) ? utf_with_bom
5806                                    : utf_without_bom);
5807       coding->detector = detect_coding_utf_8;
5808       coding->decoder = decode_coding_utf_8;
5809       coding->encoder = encode_coding_utf_8;
5810       coding->common_flags
5811         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5812       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5813         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5814     }
5815   else if (EQ (coding_type, Qutf_16))
5816     {
5817       val = AREF (attrs, coding_attr_utf_bom);
5818       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5819                                     : EQ (val, Qt) ? utf_with_bom
5820                                     : utf_without_bom);
5821       val = AREF (attrs, coding_attr_utf_16_endian);
5822       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5823                                        : utf_16_little_endian);
5824       CODING_UTF_16_SURROGATE (coding) = 0;
5825       coding->detector = detect_coding_utf_16;
5826       coding->decoder = decode_coding_utf_16;
5827       coding->encoder = encode_coding_utf_16;
5828       coding->common_flags
5829         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5830       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5831         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5832     }
5833   else if (EQ (coding_type, Qccl))
5834     {
5835       coding->detector = detect_coding_ccl;
5836       coding->decoder = decode_coding_ccl;
5837       coding->encoder = encode_coding_ccl;
5838       coding->common_flags
5839         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5840             | CODING_REQUIRE_FLUSHING_MASK);
5841     }
5842   else if (EQ (coding_type, Qemacs_mule))
5843     {
5844       coding->detector = detect_coding_emacs_mule;
5845       coding->decoder = decode_coding_emacs_mule;
5846       coding->encoder = encode_coding_emacs_mule;
5847       coding->common_flags
5848         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5850           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5851         {
5852           Lisp_Object tail, safe_charsets;
5853           int max_charset_id = 0;
5854
5855           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5856                tail = XCDR (tail))
5857             if (max_charset_id < XFASTINT (XCAR (tail)))
5858               max_charset_id = XFASTINT (XCAR (tail));
5859           safe_charsets = make_uninit_string (max_charset_id + 1);
5860           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5861           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5862                tail = XCDR (tail))
5863             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5864           coding->max_charset_id = max_charset_id;
5865           coding->safe_charsets = SDATA (safe_charsets);
5866         }
5867       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5868       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5869     }
5870   else if (EQ (coding_type, Qshift_jis))
5871     {
5872       coding->detector = detect_coding_sjis;
5873       coding->decoder = decode_coding_sjis;
5874       coding->encoder = encode_coding_sjis;
5875       coding->common_flags
5876         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5877     }
5878   else if (EQ (coding_type, Qbig5))
5879     {
5880       coding->detector = detect_coding_big5;
5881       coding->decoder = decode_coding_big5;
5882       coding->encoder = encode_coding_big5;
5883       coding->common_flags
5884         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5885     }
5886   else                          /* EQ (coding_type, Qraw_text) */
5887     {
5888       coding->detector = NULL;
5889       coding->decoder = decode_coding_raw_text;
5890       coding->encoder = encode_coding_raw_text;
5891       if (! EQ (eol_type, Qunix))
5892         {
5893           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5894           if (! VECTORP (eol_type))
5895             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5896         }
5897
5898     }
5899
5900   return;
5901 }
5902
5903 /* Return a list of charsets supported by CODING.  */
5904
5905 Lisp_Object
5906 coding_charset_list (struct coding_system *coding)
5907 {
5908   Lisp_Object attrs, charset_list;
5909
5910   CODING_GET_INFO (coding, attrs, charset_list);
5911   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5912     {
5913       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5914
5915       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5916         charset_list = Viso_2022_charset_list;
5917     }
5918   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5919     {
5920       charset_list = Vemacs_mule_charset_list;
5921     }
5922   return charset_list;
5923 }
5924
5925
5926 /* Return a list of charsets supported by CODING-SYSTEM.  */
5927
5928 Lisp_Object
5929 coding_system_charset_list (Lisp_Object coding_system)
5930 {
5931   ptrdiff_t id;
5932   Lisp_Object attrs, charset_list;
5933
5934   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5935   attrs = CODING_ID_ATTRS (id);
5936
5937   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5938     {
5939       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5940
5941       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5942         charset_list = Viso_2022_charset_list;
5943       else
5944         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5945     }
5946   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5947     {
5948       charset_list = Vemacs_mule_charset_list;
5949     }
5950   else
5951     {
5952       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5953     }
5954   return charset_list;
5955 }
5956
5957
5958 /* Return raw-text or one of its subsidiaries that has the same
5959    eol_type as CODING-SYSTEM.  */
5960
5961 Lisp_Object
5962 raw_text_coding_system (Lisp_Object coding_system)
5963 {
5964   Lisp_Object spec, attrs;
5965   Lisp_Object eol_type, raw_text_eol_type;
5966
5967   if (NILP (coding_system))
5968     return Qraw_text;
5969   spec = CODING_SYSTEM_SPEC (coding_system);
5970   attrs = AREF (spec, 0);
5971
5972   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5973     return coding_system;
5974
5975   eol_type = AREF (spec, 2);
5976   if (VECTORP (eol_type))
5977     return Qraw_text;
5978   spec = CODING_SYSTEM_SPEC (Qraw_text);
5979   raw_text_eol_type = AREF (spec, 2);
5980   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5981           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5982           : AREF (raw_text_eol_type, 2));
5983 }
5984
5985 /* Return true if CODING corresponds to raw-text coding-system.  */
5986
5987 bool
5988 raw_text_coding_system_p (struct coding_system *coding)
5989 {
5990   return (coding->decoder == decode_coding_raw_text
5991           && coding->encoder == encode_coding_raw_text) ? true : false;
5992 }
5993
5994
5995 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5996    the subsidiary that has the same eol-spec as PARENT (if it is not
5997    nil and specifies end-of-line format) or the system's setting
5998    (system_eol_type).  */
5999
6000 Lisp_Object
6001 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6002 {
6003   Lisp_Object spec, eol_type;
6004
6005   if (NILP (coding_system))
6006     coding_system = Qraw_text;
6007   else
6008     CHECK_CODING_SYSTEM (coding_system);
6009   spec = CODING_SYSTEM_SPEC (coding_system);
6010   eol_type = AREF (spec, 2);
6011   if (VECTORP (eol_type))
6012     {
6013       Lisp_Object parent_eol_type;
6014
6015       if (! NILP (parent))
6016         {
6017           Lisp_Object parent_spec;
6018
6019           CHECK_CODING_SYSTEM (parent);
6020           parent_spec = CODING_SYSTEM_SPEC (parent);
6021           parent_eol_type = AREF (parent_spec, 2);
6022           if (VECTORP (parent_eol_type))
6023             parent_eol_type = system_eol_type;
6024         }
6025       else
6026         parent_eol_type = system_eol_type;
6027       if (EQ (parent_eol_type, Qunix))
6028         coding_system = AREF (eol_type, 0);
6029       else if (EQ (parent_eol_type, Qdos))
6030         coding_system = AREF (eol_type, 1);
6031       else if (EQ (parent_eol_type, Qmac))
6032         coding_system = AREF (eol_type, 2);
6033     }
6034   return coding_system;
6035 }
6036
6037
6038 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6039    decided for writing to a process.  If not, complement them, and
6040    return a new coding system.  */
6041
6042 Lisp_Object
6043 complement_process_encoding_system (Lisp_Object coding_system)
6044 {
6045   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6046   Lisp_Object spec, attrs;
6047   int i;
6048
6049   for (i = 0; i < 3; i++)
6050     {
6051       if (i == 1)
6052         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6053       else if (i == 2)
6054         coding_system = preferred_coding_system ();
6055       spec = CODING_SYSTEM_SPEC (coding_system);
6056       if (NILP (spec))
6057         continue;
6058       attrs = AREF (spec, 0);
6059       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6060         coding_base = CODING_ATTR_BASE_NAME (attrs);
6061       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6062         eol_base = coding_system;
6063       if (! NILP (coding_base) && ! NILP (eol_base))
6064         break;
6065     }
6066
6067   if (i > 0)
6068     /* The original CODING_SYSTEM didn't specify text-conversion or
6069        eol-conversion.  Be sure that we return a fully complemented
6070        coding system.  */
6071     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6072   return coding_system;
6073 }
6074
6075
6076 /* Emacs has a mechanism to automatically detect a coding system if it
6077    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6078    it's impossible to distinguish some coding systems accurately
6079    because they use the same range of codes.  So, at first, coding
6080    systems are grouped into the following categories:
6081
6082    o coding-category-emacs-mule
6083
6084         The category for a coding system which has the same code range
6085         as Emacs' internal format.  Assigned the coding-system (Lisp
6086         symbol) `emacs-mule' by default.
6087
6088    o coding-category-sjis
6089
6090         The category for a coding system which has the same code range
6091         as SJIS.  Assigned the coding-system (Lisp
6092         symbol) `japanese-shift-jis' by default.
6093
6094    o coding-category-iso-7
6095
6096         The category for a coding system which has the same code range
6097         as ISO2022 of 7-bit environment.  This doesn't use any locking
6098         shift and single shift functions.  This can encode/decode all
6099         charsets.  Assigned the coding-system (Lisp symbol)
6100         `iso-2022-7bit' by default.
6101
6102    o coding-category-iso-7-tight
6103
6104         Same as coding-category-iso-7 except that this can
6105         encode/decode only the specified charsets.
6106
6107    o coding-category-iso-8-1
6108
6109         The category for a coding system which has the same code range
6110         as ISO2022 of 8-bit environment and graphic plane 1 used only
6111         for DIMENSION1 charset.  This doesn't use any locking shift
6112         and single shift functions.  Assigned the coding-system (Lisp
6113         symbol) `iso-latin-1' by default.
6114
6115    o coding-category-iso-8-2
6116
6117         The category for a coding system which has the same code range
6118         as ISO2022 of 8-bit environment and graphic plane 1 used only
6119         for DIMENSION2 charset.  This doesn't use any locking shift
6120         and single shift functions.  Assigned the coding-system (Lisp
6121         symbol) `japanese-iso-8bit' by default.
6122
6123    o coding-category-iso-7-else
6124
6125         The category for a coding system which has the same code range
6126         as ISO2022 of 7-bit environment but uses locking shift or
6127         single shift functions.  Assigned the coding-system (Lisp
6128         symbol) `iso-2022-7bit-lock' by default.
6129
6130    o coding-category-iso-8-else
6131
6132         The category for a coding system which has the same code range
6133         as ISO2022 of 8-bit environment but uses locking shift or
6134         single shift functions.  Assigned the coding-system (Lisp
6135         symbol) `iso-2022-8bit-ss2' by default.
6136
6137    o coding-category-big5
6138
6139         The category for a coding system which has the same code range
6140         as BIG5.  Assigned the coding-system (Lisp symbol)
6141         `cn-big5' by default.
6142
6143    o coding-category-utf-8
6144
6145         The category for a coding system which has the same code range
6146         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6147         symbol) `utf-8' by default.
6148
6149    o coding-category-utf-16-be
6150
6151         The category for a coding system in which a text has an
6152         Unicode signature (cf. Unicode Standard) in the order of BIG
6153         endian at the head.  Assigned the coding-system (Lisp symbol)
6154         `utf-16-be' by default.
6155
6156    o coding-category-utf-16-le
6157
6158         The category for a coding system in which a text has an
6159         Unicode signature (cf. Unicode Standard) in the order of
6160         LITTLE endian at the head.  Assigned the coding-system (Lisp
6161         symbol) `utf-16-le' by default.
6162
6163    o coding-category-ccl
6164
6165         The category for a coding system of which encoder/decoder is
6166         written in CCL programs.  The default value is nil, i.e., no
6167         coding system is assigned.
6168
6169    o coding-category-binary
6170
6171         The category for a coding system not categorized in any of the
6172         above.  Assigned the coding-system (Lisp symbol)
6173         `no-conversion' by default.
6174
6175    Each of them is a Lisp symbol and the value is an actual
6176    `coding-system's (this is also a Lisp symbol) assigned by a user.
6177    What Emacs does actually is to detect a category of coding system.
6178    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6179    decide only one possible category, it selects a category of the
6180    highest priority.  Priorities of categories are also specified by a
6181    user in a Lisp variable `coding-category-list'.
6182
6183 */
6184
6185 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6186                                            int eol_seen);
6187
6188
6189 /* Return the number of ASCII characters at the head of the source.
6190    By side effects, set coding->head_ascii and update
6191    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6192    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6193    reliable only when all the source bytes are ASCII.  */
6194
6195 static ptrdiff_t
6196 check_ascii (struct coding_system *coding)
6197 {
6198   const unsigned char *src, *end;
6199   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6200   int eol_seen = coding->eol_seen;
6201
6202   coding_set_source (coding);
6203   src = coding->source;
6204   end = src + coding->src_bytes;
6205
6206   if (inhibit_eol_conversion
6207       || SYMBOLP (eol_type))
6208     {
6209       /* We don't have to check EOL format.  */
6210       while (src < end && !( *src & 0x80))
6211         {
6212           if (*src++ == '\n')
6213             eol_seen |= EOL_SEEN_LF;
6214         }
6215     }
6216   else
6217     {
6218       end--;                /* We look ahead one byte for "CR LF".  */
6219       while (src < end)
6220         {
6221           int c = *src;
6222
6223           if (c & 0x80)
6224             break;
6225           src++;
6226           if (c == '\r')
6227             {
6228               if (*src == '\n')
6229                 {
6230                   eol_seen |= EOL_SEEN_CRLF;
6231                   src++;
6232                 }
6233               else
6234                 eol_seen |= EOL_SEEN_CR;
6235             }
6236           else if (c == '\n')
6237             eol_seen |= EOL_SEEN_LF;
6238         }
6239       if (src == end)
6240         {
6241           int c = *src;
6242
6243           /* All bytes but the last one C are ASCII.  */
6244           if (! (c & 0x80))
6245             {
6246               if (c == '\r')
6247                 eol_seen |= EOL_SEEN_CR;
6248               else if (c  == '\n')
6249                 eol_seen |= EOL_SEEN_LF;
6250               src++;
6251             }
6252         }
6253     }
6254   coding->head_ascii = src - coding->source;
6255   coding->eol_seen = eol_seen;
6256   return (coding->head_ascii);
6257 }
6258
6259
6260 /* Return the number of characters at the source if all the bytes are
6261    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6262    effects, update coding->eol_seen.  The value of coding->eol_seen is
6263    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6264    the value is reliable only when all the source bytes are valid
6265    UTF-8.  */
6266
6267 static ptrdiff_t
6268 check_utf_8 (struct coding_system *coding)
6269 {
6270   const unsigned char *src, *end;
6271   int eol_seen;
6272   ptrdiff_t nchars = coding->head_ascii;
6273
6274   if (coding->head_ascii < 0)
6275     check_ascii (coding);
6276   else
6277     coding_set_source (coding);
6278   src = coding->source + coding->head_ascii;
6279   /* We look ahead one byte for CR LF.  */
6280   end = coding->source + coding->src_bytes - 1;
6281   eol_seen = coding->eol_seen;
6282   while (src < end)
6283     {
6284       int c = *src;
6285
6286       if (UTF_8_1_OCTET_P (*src))
6287         {
6288           src++;
6289           if (c < 0x20)
6290             {
6291               if (c == '\r')
6292                 {
6293                   if (*src == '\n')
6294                     {
6295                       eol_seen |= EOL_SEEN_CRLF;
6296                       src++;
6297                       nchars++;
6298                     }
6299                   else
6300                     eol_seen |= EOL_SEEN_CR;
6301                 }
6302               else if (c == '\n')
6303                 eol_seen |= EOL_SEEN_LF;
6304             }
6305         }
6306       else if (UTF_8_2_OCTET_LEADING_P (c))
6307         {
6308           if (c < 0xC2          /* overlong sequence */
6309               || src + 1 >= end
6310               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6311             return -1;
6312           src += 2;
6313         }
6314       else if (UTF_8_3_OCTET_LEADING_P (c))
6315         {
6316           if (src + 2 >= end
6317               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6318                     && UTF_8_EXTRA_OCTET_P (src[2])))
6319             return -1;
6320           c = (((c & 0xF) << 12)
6321                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6322           if (c < 0x800                       /* overlong sequence */
6323               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6324             return -1;
6325           src += 3;
6326         }
6327       else if (UTF_8_4_OCTET_LEADING_P (c))
6328         {
6329           if (src + 3 >= end
6330               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6331                     && UTF_8_EXTRA_OCTET_P (src[2])
6332                     && UTF_8_EXTRA_OCTET_P (src[3])))
6333             return -1;
6334           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6335                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6336           if (c < 0x10000       /* overlong sequence */
6337               || c >= 0x110000) /* non-Unicode character  */
6338             return -1;
6339           src += 4;
6340         }
6341       else
6342         return -1;
6343       nchars++;
6344     }
6345
6346   if (src == end)
6347     {
6348       if (! UTF_8_1_OCTET_P (*src))
6349         return -1;
6350       nchars++;
6351       if (*src == '\r')
6352         eol_seen |= EOL_SEEN_CR;
6353       else if (*src  == '\n')
6354         eol_seen |= EOL_SEEN_LF;
6355     }
6356   coding->eol_seen = eol_seen;
6357   return nchars;
6358 }
6359
6360
6361 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6362    SOURCE is encoded.  If CATEGORY is one of
6363    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6364    two-byte, else they are encoded by one-byte.
6365
6366    Return one of EOL_SEEN_XXX.  */
6367
6368 #define MAX_EOL_CHECK_COUNT 3
6369
6370 static int
6371 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6372             enum coding_category category)
6373 {
6374   const unsigned char *src = source, *src_end = src + src_bytes;
6375   unsigned char c;
6376   int total  = 0;
6377   int eol_seen = EOL_SEEN_NONE;
6378
6379   if ((1 << category) & CATEGORY_MASK_UTF_16)
6380     {
6381       bool msb = category == (coding_category_utf_16_le
6382                               | coding_category_utf_16_le_nosig);
6383       bool lsb = !msb;
6384
6385       while (src + 1 < src_end)
6386         {
6387           c = src[lsb];
6388           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6389             {
6390               int this_eol;
6391
6392               if (c == '\n')
6393                 this_eol = EOL_SEEN_LF;
6394               else if (src + 3 >= src_end
6395                        || src[msb + 2] != 0
6396                        || src[lsb + 2] != '\n')
6397                 this_eol = EOL_SEEN_CR;
6398               else
6399                 {
6400                   this_eol = EOL_SEEN_CRLF;
6401                   src += 2;
6402                 }
6403
6404               if (eol_seen == EOL_SEEN_NONE)
6405                 /* This is the first end-of-line.  */
6406                 eol_seen = this_eol;
6407               else if (eol_seen != this_eol)
6408                 {
6409                   /* The found type is different from what found before.
6410                      Allow for stray ^M characters in DOS EOL files.  */
6411                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6412                       || (eol_seen == EOL_SEEN_CRLF
6413                           && this_eol == EOL_SEEN_CR))
6414                     eol_seen = EOL_SEEN_CRLF;
6415                   else
6416                     {
6417                       eol_seen = EOL_SEEN_LF;
6418                       break;
6419                     }
6420                 }
6421               if (++total == MAX_EOL_CHECK_COUNT)
6422                 break;
6423             }
6424           src += 2;
6425         }
6426     }
6427   else
6428     while (src < src_end)
6429       {
6430         c = *src++;
6431         if (c == '\n' || c == '\r')
6432           {
6433             int this_eol;
6434
6435             if (c == '\n')
6436               this_eol = EOL_SEEN_LF;
6437             else if (src >= src_end || *src != '\n')
6438               this_eol = EOL_SEEN_CR;
6439             else
6440               this_eol = EOL_SEEN_CRLF, src++;
6441
6442             if (eol_seen == EOL_SEEN_NONE)
6443               /* This is the first end-of-line.  */
6444               eol_seen = this_eol;
6445             else if (eol_seen != this_eol)
6446               {
6447                 /* The found type is different from what found before.
6448                    Allow for stray ^M characters in DOS EOL files.  */
6449                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6450                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6451                   eol_seen = EOL_SEEN_CRLF;
6452                 else
6453                   {
6454                     eol_seen = EOL_SEEN_LF;
6455                     break;
6456                   }
6457               }
6458             if (++total == MAX_EOL_CHECK_COUNT)
6459               break;
6460           }
6461       }
6462   return eol_seen;
6463 }
6464
6465
6466 static Lisp_Object
6467 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6468 {
6469   Lisp_Object eol_type;
6470
6471   eol_type = CODING_ID_EOL_TYPE (coding->id);
6472   if (! VECTORP (eol_type))
6473     /* Already adjusted.  */
6474     return eol_type;
6475   if (eol_seen & EOL_SEEN_LF)
6476     {
6477       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6478       eol_type = Qunix;
6479     }
6480   else if (eol_seen & EOL_SEEN_CRLF)
6481     {
6482       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6483       eol_type = Qdos;
6484     }
6485   else if (eol_seen & EOL_SEEN_CR)
6486     {
6487       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6488       eol_type = Qmac;
6489     }
6490   return eol_type;
6491 }
6492
6493 /* Detect how a text specified in CODING is encoded.  If a coding
6494    system is detected, update fields of CODING by the detected coding
6495    system.  */
6496
6497 static void
6498 detect_coding (struct coding_system *coding)
6499 {
6500   const unsigned char *src, *src_end;
6501   unsigned int saved_mode = coding->mode;
6502   Lisp_Object found = Qnil;
6503   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6504
6505   coding->consumed = coding->consumed_char = 0;
6506   coding->produced = coding->produced_char = 0;
6507   coding_set_source (coding);
6508
6509   src_end = coding->source + coding->src_bytes;
6510
6511   coding->eol_seen = EOL_SEEN_NONE;
6512   /* If we have not yet decided the text encoding type, detect it
6513      now.  */
6514   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6515     {
6516       int c, i;
6517       struct coding_detection_info detect_info;
6518       bool null_byte_found = 0, eight_bit_found = 0;
6519       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6520                                        inhibit_null_byte_detection);
6521       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6522                                        inhibit_iso_escape_detection);
6523       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6524
6525       coding->head_ascii = 0;
6526       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6527       for (src = coding->source; src < src_end; src++)
6528         {
6529           c = *src;
6530           if (c & 0x80)
6531             {
6532               eight_bit_found = 1;
6533               if (null_byte_found)
6534                 break;
6535             }
6536           else if (c < 0x20)
6537             {
6538               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6539                   && ! inhibit_ied
6540                   && ! detect_info.checked)
6541                 {
6542                   if (detect_coding_iso_2022 (coding, &detect_info))
6543                     {
6544                       /* We have scanned the whole data.  */
6545                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6546                         {
6547                           /* We didn't find an 8-bit code.  We may
6548                              have found a null-byte, but it's very
6549                              rare that a binary file conforms to
6550                              ISO-2022.  */
6551                           src = src_end;
6552                           coding->head_ascii = src - coding->source;
6553                         }
6554                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6555                       break;
6556                     }
6557                 }
6558               else if (! c && !inhibit_nbd)
6559                 {
6560                   null_byte_found = 1;
6561                   if (eight_bit_found)
6562                     break;
6563                 }
6564               else if (! disable_ascii_optimization
6565                        && ! inhibit_eol_conversion)
6566                 {
6567                   if (c == '\r')
6568                     {
6569                       if (src < src_end && src[1] == '\n')
6570                         {
6571                           coding->eol_seen |= EOL_SEEN_CRLF;
6572                           src++;
6573                           if (! eight_bit_found)
6574                             coding->head_ascii++;
6575                         }
6576                       else
6577                         coding->eol_seen |= EOL_SEEN_CR;
6578                     }
6579                   else if (c == '\n')
6580                     {
6581                       coding->eol_seen |= EOL_SEEN_LF;
6582                     }
6583                 }
6584
6585               if (! eight_bit_found)
6586                 coding->head_ascii++;
6587             }
6588           else if (! eight_bit_found)
6589             coding->head_ascii++;
6590         }
6591
6592       if (null_byte_found || eight_bit_found
6593           || coding->head_ascii < coding->src_bytes
6594           || detect_info.found)
6595         {
6596           enum coding_category category;
6597           struct coding_system *this;
6598
6599           if (coding->head_ascii == coding->src_bytes)
6600             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6601             for (i = 0; i < coding_category_raw_text; i++)
6602               {
6603                 category = coding_priorities[i];
6604                 this = coding_categories + category;
6605                 if (detect_info.found & (1 << category))
6606                   break;
6607               }
6608           else
6609             {
6610               if (null_byte_found)
6611                 {
6612                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6613                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6614                 }
6615               else if (prefer_utf_8
6616                        && detect_coding_utf_8 (coding, &detect_info))
6617                 {
6618                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6619                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6620                 }
6621               for (i = 0; i < coding_category_raw_text; i++)
6622                 {
6623                   category = coding_priorities[i];
6624                   this = coding_categories + category;
6625                   /* Some of this->detector (e.g. detect_coding_sjis)
6626                      require this information.  */
6627                   coding->id = this->id;
6628                   if (this->id < 0)
6629                     {
6630                       /* No coding system of this category is defined.  */
6631                       detect_info.rejected |= (1 << category);
6632                     }
6633                   else if (category >= coding_category_raw_text)
6634                     continue;
6635                   else if (detect_info.checked & (1 << category))
6636                     {
6637                       if (detect_info.found & (1 << category))
6638                         break;
6639                     }
6640                   else if ((*(this->detector)) (coding, &detect_info)
6641                            && detect_info.found & (1 << category))
6642                     break;
6643                 }
6644             }
6645
6646           if (i < coding_category_raw_text)
6647             {
6648               if (category == coding_category_utf_8_auto)
6649                 {
6650                   Lisp_Object coding_systems;
6651
6652                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6653                                          coding_attr_utf_bom);
6654                   if (CONSP (coding_systems))
6655                     {
6656                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6657                         found = XCAR (coding_systems);
6658                       else
6659                         found = XCDR (coding_systems);
6660                     }
6661                   else
6662                     found = CODING_ID_NAME (this->id);
6663                 }
6664               else if (category == coding_category_utf_16_auto)
6665                 {
6666                   Lisp_Object coding_systems;
6667
6668                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6669                                          coding_attr_utf_bom);
6670                   if (CONSP (coding_systems))
6671                     {
6672                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6673                         found = XCAR (coding_systems);
6674                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6675                         found = XCDR (coding_systems);
6676                     }
6677                   else
6678                     found = CODING_ID_NAME (this->id);
6679                 }
6680               else
6681                 found = CODING_ID_NAME (this->id);
6682             }
6683           else if (null_byte_found)
6684             found = Qno_conversion;
6685           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6686                    == CATEGORY_MASK_ANY)
6687             found = Qraw_text;
6688           else if (detect_info.rejected)
6689             for (i = 0; i < coding_category_raw_text; i++)
6690               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6691                 {
6692                   this = coding_categories + coding_priorities[i];
6693                   found = CODING_ID_NAME (this->id);
6694                   break;
6695                 }
6696         }
6697     }
6698   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6699            == coding_category_utf_8_auto)
6700     {
6701       Lisp_Object coding_systems;
6702       struct coding_detection_info detect_info;
6703
6704       coding_systems
6705         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6706       detect_info.found = detect_info.rejected = 0;
6707       if (check_ascii (coding) == coding->src_bytes)
6708         {
6709           if (CONSP (coding_systems))
6710             found = XCDR (coding_systems);
6711         }
6712       else
6713         {
6714           if (CONSP (coding_systems)
6715               && detect_coding_utf_8 (coding, &detect_info))
6716             {
6717               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6718                 found = XCAR (coding_systems);
6719               else
6720                 found = XCDR (coding_systems);
6721             }
6722         }
6723     }
6724   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6725            == coding_category_utf_16_auto)
6726     {
6727       Lisp_Object coding_systems;
6728       struct coding_detection_info detect_info;
6729
6730       coding_systems
6731         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6732       detect_info.found = detect_info.rejected = 0;
6733       coding->head_ascii = 0;
6734       if (CONSP (coding_systems)
6735           && detect_coding_utf_16 (coding, &detect_info))
6736         {
6737           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6738             found = XCAR (coding_systems);
6739           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6740             found = XCDR (coding_systems);
6741         }
6742     }
6743
6744   if (! NILP (found))
6745     {
6746       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6747                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6748                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6749                            : EOL_SEEN_LF);
6750
6751       setup_coding_system (found, coding);
6752       if (specified_eol != EOL_SEEN_NONE)
6753         adjust_coding_eol_type (coding, specified_eol);
6754     }
6755
6756   coding->mode = saved_mode;
6757 }
6758
6759
6760 static void
6761 decode_eol (struct coding_system *coding)
6762 {
6763   Lisp_Object eol_type;
6764   unsigned char *p, *pbeg, *pend;
6765
6766   eol_type = CODING_ID_EOL_TYPE (coding->id);
6767   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6768     return;
6769
6770   if (NILP (coding->dst_object))
6771     pbeg = coding->destination;
6772   else
6773     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6774   pend = pbeg + coding->produced;
6775
6776   if (VECTORP (eol_type))
6777     {
6778       int eol_seen = EOL_SEEN_NONE;
6779
6780       for (p = pbeg; p < pend; p++)
6781         {
6782           if (*p == '\n')
6783             eol_seen |= EOL_SEEN_LF;
6784           else if (*p == '\r')
6785             {
6786               if (p + 1 < pend && *(p + 1) == '\n')
6787                 {
6788                   eol_seen |= EOL_SEEN_CRLF;
6789                   p++;
6790                 }
6791               else
6792                 eol_seen |= EOL_SEEN_CR;
6793             }
6794         }
6795       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6796       if ((eol_seen & EOL_SEEN_CRLF) != 0
6797           && (eol_seen & EOL_SEEN_CR) != 0
6798           && (eol_seen & EOL_SEEN_LF) == 0)
6799         eol_seen = EOL_SEEN_CRLF;
6800       else if (eol_seen != EOL_SEEN_NONE
6801           && eol_seen != EOL_SEEN_LF
6802           && eol_seen != EOL_SEEN_CRLF
6803           && eol_seen != EOL_SEEN_CR)
6804         eol_seen = EOL_SEEN_LF;
6805       if (eol_seen != EOL_SEEN_NONE)
6806         eol_type = adjust_coding_eol_type (coding, eol_seen);
6807     }
6808
6809   if (EQ (eol_type, Qmac))
6810     {
6811       for (p = pbeg; p < pend; p++)
6812         if (*p == '\r')
6813           *p = '\n';
6814     }
6815   else if (EQ (eol_type, Qdos))
6816     {
6817       ptrdiff_t n = 0;
6818       ptrdiff_t pos = coding->dst_pos;
6819       ptrdiff_t pos_byte = coding->dst_pos_byte;
6820       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6821
6822       /* This assertion is here instead of code, now deleted, that
6823          handled the NILP case, which no longer happens with the
6824          current codebase.  */
6825       eassert (!NILP (coding->dst_object));
6826
6827       while (pos_byte < pos_end)
6828         {
6829           int incr;
6830
6831           p = BYTE_POS_ADDR (pos_byte);
6832           if (coding->dst_multibyte)
6833             incr = BYTES_BY_CHAR_HEAD (*p);
6834           else
6835             incr = 1;
6836
6837           if (*p == '\r' && p[1] == '\n')
6838             {
6839               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6840               n++;
6841               pos_end--;
6842             }
6843           pos++;
6844           pos_byte += incr;
6845         }
6846       coding->produced -= n;
6847       coding->produced_char -= n;
6848     }
6849 }
6850
6851
6852 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6853    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6854    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6855 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6856
6857 /* Return a translation table (or list of them) from coding system
6858    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6859    not ENCODEP). */
6860
6861 static Lisp_Object
6862 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6863 {
6864   Lisp_Object standard, translation_table;
6865   Lisp_Object val;
6866
6867   if (NILP (Venable_character_translation))
6868     {
6869       if (max_lookup)
6870         *max_lookup = 0;
6871       return Qnil;
6872     }
6873   if (encodep)
6874     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6875       standard = Vstandard_translation_table_for_encode;
6876   else
6877     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6878       standard = Vstandard_translation_table_for_decode;
6879   if (NILP (translation_table))
6880     translation_table = standard;
6881   else
6882     {
6883       if (SYMBOLP (translation_table))
6884         translation_table = Fget (translation_table, Qtranslation_table);
6885       else if (CONSP (translation_table))
6886         {
6887           translation_table = Fcopy_sequence (translation_table);
6888           for (val = translation_table; CONSP (val); val = XCDR (val))
6889             if (SYMBOLP (XCAR (val)))
6890               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6891         }
6892       if (CHAR_TABLE_P (standard))
6893         {
6894           if (CONSP (translation_table))
6895             translation_table = nconc2 (translation_table, list1 (standard));
6896           else
6897             translation_table = list2 (translation_table, standard);
6898         }
6899     }
6900
6901   if (max_lookup)
6902     {
6903       *max_lookup = 1;
6904       if (CHAR_TABLE_P (translation_table)
6905           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6906         {
6907           val = XCHAR_TABLE (translation_table)->extras[1];
6908           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6909             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6910         }
6911       else if (CONSP (translation_table))
6912         {
6913           Lisp_Object tail;
6914
6915           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6916             if (CHAR_TABLE_P (XCAR (tail))
6917                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6918               {
6919                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6920                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6921                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6922               }
6923         }
6924     }
6925   return translation_table;
6926 }
6927
/* Look up character C in TABLE and leave the result in C and TRANS.
   C and TRANS must be lvalues; TABLE is evaluated exactly once.

   If TABLE is a char-table and C maps to a character, C is replaced
   by that character and TRANS is set to Qnil; otherwise TRANS is set
   to whatever C maps to (possibly Qnil).

   If TABLE is a list, its char-table elements are consulted in
   order.  A character result replaces C, and the lookup continues in
   the following tables with the new C.  The first non-nil result
   that is not a character stops the search and is left in TRANS.

   If TABLE is neither, TRANS is set to Qnil and C is unchanged.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_table_ = (table);                                \
                                                                        \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (lookup_table_))                                   \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (lookup_table_, (c));                  \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (lookup_table_))                                     \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);        \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6952
6953
6954 /* Return a translation of character(s) at BUF according to TRANS.
6955    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
6956    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
6957    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
6958    found, or Qt if BUF is too short to lookup characters in FROM.  As
6959    a side effect, if a translation is found, *NCHARS is set to the
6960    number of characters being translated.  */
6961
6962 static Lisp_Object
6963 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6964 {
6965   if (INTEGERP (trans) || VECTORP (trans))
6966     {
6967       *nchars = 1;
6968       return trans;
6969     }
6970   for (; CONSP (trans); trans = XCDR (trans))
6971     {
6972       Lisp_Object val = XCAR (trans);
6973       Lisp_Object from = XCAR (val);
6974       ptrdiff_t len = ASIZE (from);
6975       ptrdiff_t i;
6976
6977       for (i = 0; i < len; i++)
6978         {
6979           if (buf + i == buf_end)
6980             return Qt;
6981           if (XINT (AREF (from, i)) != buf[i])
6982             break;
6983         }
6984       if (i == len)
6985         {
6986           *nchars = len;
6987           return XCDR (val);
6988         }
6989     }
6990   return Qnil;
6991 }
6992
6993
6994 static int
6995 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6996                bool last_block)
6997 {
6998   unsigned char *dst = coding->destination + coding->produced;
6999   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7000   ptrdiff_t produced;
7001   ptrdiff_t produced_chars = 0;
7002   int carryover = 0;
7003
7004   if (! coding->chars_at_source)
7005     {
7006       /* Source characters are in coding->charbuf.  */
7007       int *buf = coding->charbuf;
7008       int *buf_end = buf + coding->charbuf_used;
7009
7010       if (EQ (coding->src_object, coding->dst_object)
7011           && ! NILP (coding->dst_object))
7012         {
7013           eassert (growable_destination (coding));
7014           coding_set_source (coding);
7015           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7016         }
7017
7018       while (buf < buf_end)
7019         {
7020           int c = *buf;
7021           ptrdiff_t i;
7022
7023           if (c >= 0)
7024             {
7025               ptrdiff_t from_nchars = 1, to_nchars = 1;
7026               Lisp_Object trans = Qnil;
7027
7028               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7029               if (! NILP (trans))
7030                 {
7031                   trans = get_translation (trans, buf, buf_end, &from_nchars);
7032                   if (INTEGERP (trans))
7033                     c = XINT (trans);
7034                   else if (VECTORP (trans))
7035                     {
7036                       to_nchars = ASIZE (trans);
7037                       c = XINT (AREF (trans, 0));
7038                     }
7039                   else if (EQ (trans, Qt) && ! last_block)
7040                     break;
7041                 }
7042
7043               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7044                 {
7045                   eassert (growable_destination (coding));
7046                   ptrdiff_t dst_size;
7047                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7048                                           &dst_size)
7049                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7050                     memory_full (SIZE_MAX);
7051                   dst = alloc_destination (coding, dst_size, dst);
7052                   if (EQ (coding->src_object, coding->dst_object))
7053                     {
7054                       coding_set_source (coding);
7055                       dst_end = (((unsigned char *) coding->source)
7056                                  + coding->consumed);
7057                     }
7058                   else
7059                     dst_end = coding->destination + coding->dst_bytes;
7060                 }
7061
7062               for (i = 0; i < to_nchars; i++)
7063                 {
7064                   if (i > 0)
7065                     c = XINT (AREF (trans, i));
7066                   if (coding->dst_multibyte
7067                       || ! CHAR_BYTE8_P (c))
7068                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7069                   else
7070                     *dst++ = CHAR_TO_BYTE8 (c);
7071                 }
7072               produced_chars += to_nchars;
7073               buf += from_nchars;
7074             }
7075           else
7076             /* This is an annotation datum.  (-C) is the length.  */
7077             buf += -c;
7078         }
7079       carryover = buf_end - buf;
7080     }
7081   else
7082     {
7083       /* Source characters are at coding->source.  */
7084       const unsigned char *src = coding->source;
7085       const unsigned char *src_end = src + coding->consumed;
7086
7087       if (EQ (coding->dst_object, coding->src_object))
7088         {
7089           eassert (growable_destination (coding));
7090           dst_end = (unsigned char *) src;
7091         }
7092       if (coding->src_multibyte != coding->dst_multibyte)
7093         {
7094           if (coding->src_multibyte)
7095             {
7096               bool multibytep = 1;
7097               ptrdiff_t consumed_chars = 0;
7098
7099               while (1)
7100                 {
7101                   const unsigned char *src_base = src;
7102                   int c;
7103
7104                   ONE_MORE_BYTE (c);
7105                   if (dst == dst_end)
7106                     {
7107                       eassert (growable_destination (coding));
7108                       if (EQ (coding->src_object, coding->dst_object))
7109                         dst_end = (unsigned char *) src;
7110                       if (dst == dst_end)
7111                         {
7112                           ptrdiff_t offset = src - coding->source;
7113
7114                           dst = alloc_destination (coding, src_end - src + 1,
7115                                                    dst);
7116                           dst_end = coding->destination + coding->dst_bytes;
7117                           coding_set_source (coding);
7118                           src = coding->source + offset;
7119                           src_end = coding->source + coding->consumed;
7120                           if (EQ (coding->src_object, coding->dst_object))
7121                             dst_end = (unsigned char *) src;
7122                         }
7123                     }
7124                   *dst++ = c;
7125                   produced_chars++;
7126                 }
7127             no_more_source:
7128               ;
7129             }
7130           else
7131             while (src < src_end)
7132               {
7133                 bool multibytep = 1;
7134                 int c = *src++;
7135
7136                 if (dst >= dst_end - 1)
7137                   {
7138                     eassert (growable_destination (coding));
7139                     if (EQ (coding->src_object, coding->dst_object))
7140                       dst_end = (unsigned char *) src;
7141                     if (dst >= dst_end - 1)
7142                       {
7143                         ptrdiff_t offset = src - coding->source;
7144                         ptrdiff_t more_bytes;
7145
7146                         if (EQ (coding->src_object, coding->dst_object))
7147                           more_bytes = ((src_end - src) / 2) + 2;
7148                         else
7149                           more_bytes = src_end - src + 2;
7150                         dst = alloc_destination (coding, more_bytes, dst);
7151                         dst_end = coding->destination + coding->dst_bytes;
7152                         coding_set_source (coding);
7153                         src = coding->source + offset;
7154                         src_end = coding->source + coding->consumed;
7155                         if (EQ (coding->src_object, coding->dst_object))
7156                           dst_end = (unsigned char *) src;
7157                       }
7158                   }
7159                 EMIT_ONE_BYTE (c);
7160               }
7161         }
7162       else
7163         {
7164           if (!EQ (coding->src_object, coding->dst_object))
7165             {
7166               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7167
7168               if (require > 0)
7169                 {
7170                   ptrdiff_t offset = src - coding->source;
7171
7172                   dst = alloc_destination (coding, require, dst);
7173                   coding_set_source (coding);
7174                   src = coding->source + offset;
7175                   src_end = coding->source + coding->consumed;
7176                 }
7177             }
7178           produced_chars = coding->consumed_char;
7179           while (src < src_end)
7180             *dst++ = *src++;
7181         }
7182     }
7183
7184   produced = dst - (coding->destination + coding->produced);
7185   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7186     insert_from_gap (produced_chars, produced, 0);
7187   coding->produced += produced;
7188   coding->produced_char += produced_chars;
7189   return carryover;
7190 }
7191
7192 /* Compose text in CODING->object according to the annotation data at
7193    CHARBUF.  CHARBUF is an array:
7194      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7195  */
7196
7197 static void
7198 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7199 {
7200   int len;
7201   ptrdiff_t to;
7202   enum composition_method method;
7203   Lisp_Object components;
7204
7205   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7206   to = pos + charbuf[2];
7207   method = (enum composition_method) (charbuf[4]);
7208
7209   if (method == COMPOSITION_RELATIVE)
7210     components = Qnil;
7211   else
7212     {
7213       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7214       int i, j;
7215
7216       if (method == COMPOSITION_WITH_RULE)
7217         len = charbuf[2] * 3 - 2;
7218       charbuf += MAX_ANNOTATION_LENGTH;
7219       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7220       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7221         {
7222           if (charbuf[i] >= 0)
7223             args[j] = make_number (charbuf[i]);
7224           else
7225             {
7226               i++;
7227               args[j] = make_number (charbuf[i] % 0x100);
7228             }
7229         }
7230       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7231     }
7232   compose_text (pos, to, components, Qnil, coding->dst_object);
7233 }
7234
7235
7236 /* Put `charset' property on text in CODING->object according to
7237    the annotation data at CHARBUF.  CHARBUF is an array:
7238      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7239  */
7240
7241 static void
7242 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7243 {
7244   ptrdiff_t from = pos - charbuf[2];
7245   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7246
7247   Fput_text_property (make_number (from), make_number (pos),
7248                       Qcharset, CHARSET_NAME (charset),
7249                       coding->dst_object);
7250 }
7251
/* Upper limit on the number of units allocated for coding->charbuf
   by ALLOC_CONVERSION_WORK_AREA.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING: set CODING->charbuf to
   a buffer of SIZE + MAX_CHARBUF_EXTRA_SIZE ints, capped at
   MAX_CHARBUF_SIZE, and record the unit count in
   CODING->charbuf_size.

   The buffer comes from SAFE_ALLOCA; the users in this file declare
   USE_SAFE_ALLOCA beforehand and call SAFE_FREE when done.

   CODING is parenthesized in the expansion so that any pointer
   expression, not just a plain identifier, can be passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7265
7266 static void
7267 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7268 {
7269   int *charbuf = coding->charbuf;
7270   int *charbuf_end = charbuf + coding->charbuf_used;
7271
7272   if (NILP (coding->dst_object))
7273     return;
7274
7275   while (charbuf < charbuf_end)
7276     {
7277       if (*charbuf >= 0)
7278         pos++, charbuf++;
7279       else
7280         {
7281           int len = -*charbuf;
7282
7283           if (len > 2)
7284             switch (charbuf[1])
7285               {
7286               case CODING_ANNOTATE_COMPOSITION_MASK:
7287                 produce_composition (coding, charbuf, pos);
7288                 break;
7289               case CODING_ANNOTATE_CHARSET_MASK:
7290                 produce_charset (coding, charbuf, pos);
7291                 break;
7292               default:
7293                 break;
7294               }
7295           charbuf += len;
7296         }
7297     }
7298 }
7299
7300 /* Decode the data at CODING->src_object into CODING->dst_object.
7301    CODING->src_object is a buffer, a string, or nil.
7302    CODING->dst_object is a buffer.
7303
7304    If CODING->src_object is a buffer, it must be the current buffer.
7305    In this case, if CODING->src_pos is positive, it is a position of
7306    the source text in the buffer, otherwise, the source text is in the
7307    gap area of the buffer, and CODING->src_pos specifies the offset of
7308    the text from GPT (which must be the same as PT).  If this is the
7309    same buffer as CODING->dst_object, CODING->src_pos must be
7310    negative.
7311
7312    If CODING->src_object is a string, CODING->src_pos is an index to
7313    that string.
7314
7315    If CODING->src_object is nil, CODING->source must already point to
7316    the non-relocatable memory area.  In this case, CODING->src_pos is
7317    an offset from CODING->source.
7318
7319    The decoded data is inserted at the current point of the buffer
7320    CODING->dst_object.
7321 */
7322
7323 static void
7324 decode_coding (struct coding_system *coding)
7325 {
7326   Lisp_Object attrs;
7327   Lisp_Object undo_list;
7328   Lisp_Object translation_table;
7329   struct ccl_spec cclspec;
7330   int carryover;
7331   int i;
7332
7333   USE_SAFE_ALLOCA;
7334
7335   if (BUFFERP (coding->src_object)
7336       && coding->src_pos > 0
7337       && coding->src_pos < GPT
7338       && coding->src_pos + coding->src_chars > GPT)
7339     move_gap_both (coding->src_pos, coding->src_pos_byte);
7340
7341   undo_list = Qt;
7342   if (BUFFERP (coding->dst_object))
7343     {
7344       set_buffer_internal (XBUFFER (coding->dst_object));
7345       if (GPT != PT)
7346         move_gap_both (PT, PT_BYTE);
7347
7348       /* We must disable undo_list in order to record the whole insert
7349          transaction via record_insert at the end.  But doing so also
7350          disables the recording of the first change to the undo_list.
7351          Therefore we check for first change here and record it via
7352          record_first_change if needed.  */
7353       if (MODIFF <= SAVE_MODIFF)
7354         record_first_change ();
7355
7356       undo_list = BVAR (current_buffer, undo_list);
7357       bset_undo_list (current_buffer, Qt);
7358     }
7359
7360   coding->consumed = coding->consumed_char = 0;
7361   coding->produced = coding->produced_char = 0;
7362   coding->chars_at_source = 0;
7363   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7364
7365   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7366
7367   attrs = CODING_ID_ATTRS (coding->id);
7368   translation_table = get_translation_table (attrs, 0, NULL);
7369
7370   carryover = 0;
7371   if (coding->decoder == decode_coding_ccl)
7372     {
7373       coding->spec.ccl = &cclspec;
7374       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7375     }
7376   do
7377     {
7378       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7379
7380       coding_set_source (coding);
7381       coding->annotated = 0;
7382       coding->charbuf_used = carryover;
7383       (*(coding->decoder)) (coding);
7384       coding_set_destination (coding);
7385       carryover = produce_chars (coding, translation_table, 0);
7386       if (coding->annotated)
7387         produce_annotation (coding, pos);
7388       for (i = 0; i < carryover; i++)
7389         coding->charbuf[i]
7390           = coding->charbuf[coding->charbuf_used - carryover + i];
7391     }
7392   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7393          || (coding->consumed < coding->src_bytes
7394              && (coding->result == CODING_RESULT_SUCCESS
7395                  || coding->result == CODING_RESULT_INVALID_SRC)));
7396
7397   if (carryover > 0)
7398     {
7399       coding_set_destination (coding);
7400       coding->charbuf_used = carryover;
7401       produce_chars (coding, translation_table, 1);
7402     }
7403
7404   coding->carryover_bytes = 0;
7405   if (coding->consumed < coding->src_bytes)
7406     {
7407       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7408       const unsigned char *src;
7409
7410       coding_set_source (coding);
7411       coding_set_destination (coding);
7412       src = coding->source + coding->consumed;
7413
7414       if (coding->mode & CODING_MODE_LAST_BLOCK)
7415         {
7416           /* Flush out unprocessed data as binary chars.  We are sure
7417              that the number of data is less than the size of
7418              coding->charbuf.  */
7419           coding->charbuf_used = 0;
7420           coding->chars_at_source = 0;
7421
7422           while (nbytes-- > 0)
7423             {
7424               int c = *src++;
7425
7426               if (c & 0x80)
7427                 c = BYTE8_TO_CHAR (c);
7428               coding->charbuf[coding->charbuf_used++] = c;
7429             }
7430           produce_chars (coding, Qnil, 1);
7431         }
7432       else
7433         {
7434           /* Record unprocessed bytes in coding->carryover.  We are
7435              sure that the number of data is less than the size of
7436              coding->carryover.  */
7437           unsigned char *p = coding->carryover;
7438
7439           if (nbytes > sizeof coding->carryover)
7440             nbytes = sizeof coding->carryover;
7441           coding->carryover_bytes = nbytes;
7442           while (nbytes-- > 0)
7443             *p++ = *src++;
7444         }
7445       coding->consumed = coding->src_bytes;
7446     }
7447
7448   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7449       && !inhibit_eol_conversion)
7450     decode_eol (coding);
7451   if (BUFFERP (coding->dst_object))
7452     {
7453       bset_undo_list (current_buffer, undo_list);
7454       record_insert (coding->dst_pos, coding->produced_char);
7455     }
7456
7457   SAFE_FREE ();
7458 }
7459
7460
7461 /* Extract an annotation datum from a composition starting at POS and
7462    ending before LIMIT of CODING->src_object (buffer or string), store
7463    the data in BUF, set *STOP to a starting position of the next
7464    composition (if any) or to LIMIT, and return the address of the
7465    next element of BUF.
7466
7467    If such an annotation is not found, set *STOP to a starting
7468    position of a composition after POS (if any) or to LIMIT, and
7469    return BUF.  */
7470
7471 static int *
7472 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7473                                struct coding_system *coding, int *buf,
7474                                ptrdiff_t *stop)
7475 {
7476   ptrdiff_t start, end;
7477   Lisp_Object prop;
7478
7479   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7480       || end > limit)
7481     *stop = limit;
7482   else if (start > pos)
7483     *stop = start;
7484   else
7485     {
7486       if (start == pos)
7487         {
7488           /* We found a composition.  Store the corresponding
7489              annotation data in BUF.  */
7490           int *head = buf;
7491           enum composition_method method = composition_method (prop);
7492           int nchars = COMPOSITION_LENGTH (prop);
7493
7494           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7495           if (method != COMPOSITION_RELATIVE)
7496             {
7497               Lisp_Object components;
7498               ptrdiff_t i, len, i_byte;
7499
7500               components = COMPOSITION_COMPONENTS (prop);
7501               if (VECTORP (components))
7502                 {
7503                   len = ASIZE (components);
7504                   for (i = 0; i < len; i++)
7505                     *buf++ = XINT (AREF (components, i));
7506                 }
7507               else if (STRINGP (components))
7508                 {
7509                   len = SCHARS (components);
7510                   i = i_byte = 0;
7511                   while (i < len)
7512                     {
7513                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7514                       buf++;
7515                     }
7516                 }
7517               else if (INTEGERP (components))
7518                 {
7519                   len = 1;
7520                   *buf++ = XINT (components);
7521                 }
7522               else if (CONSP (components))
7523                 {
7524                   for (len = 0; CONSP (components);
7525                        len++, components = XCDR (components))
7526                     *buf++ = XINT (XCAR (components));
7527                 }
7528               else
7529                 emacs_abort ();
7530               *head -= len;
7531             }
7532         }
7533
7534       if (find_composition (end, limit, &start, &end, &prop,
7535                             coding->src_object)
7536           && end <= limit)
7537         *stop = start;
7538       else
7539         *stop = limit;
7540     }
7541   return buf;
7542 }
7543
7544
7545 /* Extract an annotation datum from a text property `charset' at POS of
7546    CODING->src_object (buffer of string), store the data in BUF, set
7547    *STOP to the position where the value of `charset' property changes
7548    (limiting by LIMIT), and return the address of the next element of
7549    BUF.
7550
7551    If the property value is nil, set *STOP to the position where the
7552    property value is non-nil (limiting by LIMIT), and return BUF.  */
7553
7554 static int *
7555 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7556                            struct coding_system *coding, int *buf,
7557                            ptrdiff_t *stop)
7558 {
7559   Lisp_Object val, next;
7560   int id;
7561
7562   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7563   if (! NILP (val) && CHARSETP (val))
7564     id = XINT (CHARSET_SYMBOL_ID (val));
7565   else
7566     id = -1;
7567   ADD_CHARSET_DATA (buf, 0, id);
7568   next = Fnext_single_property_change (make_number (pos), Qcharset,
7569                                        coding->src_object,
7570                                        make_number (limit));
7571   *stop = XINT (next);
7572   return buf;
7573 }
7574
7575
7576 static void
7577 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7578                int max_lookup)
7579 {
7580   int *buf = coding->charbuf;
7581   int *buf_end = coding->charbuf + coding->charbuf_size;
7582   const unsigned char *src = coding->source + coding->consumed;
7583   const unsigned char *src_end = coding->source + coding->src_bytes;
7584   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7585   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7586   bool multibytep = coding->src_multibyte;
7587   Lisp_Object eol_type;
7588   int c;
7589   ptrdiff_t stop, stop_composition, stop_charset;
7590   int *lookup_buf = NULL;
7591
7592   if (! NILP (translation_table))
7593     lookup_buf = alloca (sizeof (int) * max_lookup);
7594
7595   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7596   if (VECTORP (eol_type))
7597     eol_type = Qunix;
7598
7599   /* Note: composition handling is not yet implemented.  */
7600   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7601
7602   if (NILP (coding->src_object))
7603     stop = stop_composition = stop_charset = end_pos;
7604   else
7605     {
7606       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7607         stop = stop_composition = pos;
7608       else
7609         stop = stop_composition = end_pos;
7610       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7611         stop = stop_charset = pos;
7612       else
7613         stop_charset = end_pos;
7614     }
7615
7616   /* Compensate for CRLF and conversion.  */
7617   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7618   while (buf < buf_end)
7619     {
7620       Lisp_Object trans;
7621
7622       if (pos == stop)
7623         {
7624           if (pos == end_pos)
7625             break;
7626           if (pos == stop_composition)
7627             buf = handle_composition_annotation (pos, end_pos, coding,
7628                                                  buf, &stop_composition);
7629           if (pos == stop_charset)
7630             buf = handle_charset_annotation (pos, end_pos, coding,
7631                                              buf, &stop_charset);
7632           stop = (stop_composition < stop_charset
7633                   ? stop_composition : stop_charset);
7634         }
7635
7636       if (! multibytep)
7637         {
7638           int bytes;
7639
7640           if (coding->encoder == encode_coding_raw_text
7641               || coding->encoder == encode_coding_ccl)
7642             c = *src++, pos++;
7643           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7644             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7645           else
7646             c = BYTE8_TO_CHAR (*src), src++, pos++;
7647         }
7648       else
7649         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7650       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7651         c = '\n';
7652       if (! EQ (eol_type, Qunix))
7653         {
7654           if (c == '\n')
7655             {
7656               if (EQ (eol_type, Qdos))
7657                 *buf++ = '\r';
7658               else
7659                 c = '\r';
7660             }
7661         }
7662
7663       trans = Qnil;
7664       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7665       if (NILP (trans))
7666         *buf++ = c;
7667       else
7668         {
7669           ptrdiff_t from_nchars = 1, to_nchars = 1;
7670           int *lookup_buf_end;
7671           const unsigned char *p = src;
7672           int i;
7673
7674           lookup_buf[0] = c;
7675           for (i = 1; i < max_lookup && p < src_end; i++)
7676             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7677           lookup_buf_end = lookup_buf + i;
7678           trans = get_translation (trans, lookup_buf, lookup_buf_end,
7679                                    &from_nchars);
7680           if (INTEGERP (trans))
7681             c = XINT (trans);
7682           else if (VECTORP (trans))
7683             {
7684               to_nchars = ASIZE (trans);
7685               if (buf_end - buf < to_nchars)
7686                 break;
7687               c = XINT (AREF (trans, 0));
7688             }
7689           else
7690             break;
7691           *buf++ = c;
7692           for (i = 1; i < to_nchars; i++)
7693             *buf++ = XINT (AREF (trans, i));
7694           for (i = 1; i < from_nchars; i++, pos++)
7695             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7696         }
7697     }
7698
7699   coding->consumed = src - coding->source;
7700   coding->consumed_char = pos - coding->src_pos;
7701   coding->charbuf_used = buf - coding->charbuf;
7702   coding->chars_at_source = 0;
7703 }
7704
7705
7706 /* Encode the text at CODING->src_object into CODING->dst_object.
7707    CODING->src_object is a buffer or a string.
7708    CODING->dst_object is a buffer or nil.
7709
7710    If CODING->src_object is a buffer, it must be the current buffer.
7711    In this case, if CODING->src_pos is positive, it is a position of
7712    the source text in the buffer, otherwise. the source text is in the
7713    gap area of the buffer, and coding->src_pos specifies the offset of
7714    the text from GPT (which must be the same as PT).  If this is the
7715    same buffer as CODING->dst_object, CODING->src_pos must be
7716    negative and CODING should not have `pre-write-conversion'.
7717
7718    If CODING->src_object is a string, CODING should not have
7719    `pre-write-conversion'.
7720
7721    If CODING->dst_object is a buffer, the encoded data is inserted at
7722    the current point of that buffer.
7723
7724    If CODING->dst_object is nil, the encoded data is placed at the
7725    memory area specified by CODING->destination.  */
7726
7727 static void
7728 encode_coding (struct coding_system *coding)
7729 {
7730   Lisp_Object attrs;
7731   Lisp_Object translation_table;
7732   int max_lookup;
7733   struct ccl_spec cclspec;
7734
7735   USE_SAFE_ALLOCA;
7736
7737   attrs = CODING_ID_ATTRS (coding->id);
7738   if (coding->encoder == encode_coding_raw_text)
7739     translation_table = Qnil, max_lookup = 0;
7740   else
7741     translation_table = get_translation_table (attrs, 1, &max_lookup);
7742
7743   if (BUFFERP (coding->dst_object))
7744     {
7745       set_buffer_internal (XBUFFER (coding->dst_object));
7746       coding->dst_multibyte
7747         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7748     }
7749
7750   coding->consumed = coding->consumed_char = 0;
7751   coding->produced = coding->produced_char = 0;
7752   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7753
7754   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7755
7756   if (coding->encoder == encode_coding_ccl)
7757     {
7758       coding->spec.ccl = &cclspec;
7759       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7760     }
7761   do {
7762     coding_set_source (coding);
7763     consume_chars (coding, translation_table, max_lookup);
7764     coding_set_destination (coding);
7765     (*(coding->encoder)) (coding);
7766   } while (coding->consumed_char < coding->src_chars);
7767
7768   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7769     insert_from_gap (coding->produced_char, coding->produced, 0);
7770
7771   SAFE_FREE ();
7772 }
7773
7774
/* Name (or base name) of work buffer for code conversion.  The
   reused work buffer is created under exactly this name; additional
   work buffers get names generated from it by
   Fgenerate_new_buffer_name.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   make_conversion_work_buffer when it hands that buffer out, and
   cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
7787
7788
7789 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7790    multibyteness of returning buffer.  */
7791
7792 static Lisp_Object
7793 make_conversion_work_buffer (bool multibyte)
7794 {
7795   Lisp_Object name, workbuf;
7796   struct buffer *current;
7797
7798   if (reused_workbuf_in_use)
7799     {
7800       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7801       workbuf = Fget_buffer_create (name);
7802     }
7803   else
7804     {
7805       reused_workbuf_in_use = 1;
7806       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7807         Vcode_conversion_reused_workbuf
7808           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7809       workbuf = Vcode_conversion_reused_workbuf;
7810     }
7811   current = current_buffer;
7812   set_buffer_internal (XBUFFER (workbuf));
7813   /* We can't allow modification hooks to run in the work buffer.  For
7814      instance, directory_files_internal assumes that file decoding
7815      doesn't compile new regexps.  */
7816   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7817   Ferase_buffer ();
7818   bset_undo_list (current_buffer, Qt);
7819   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7820   set_buffer_internal (current);
7821   return workbuf;
7822 }
7823
7824
7825 static void
7826 code_conversion_restore (Lisp_Object arg)
7827 {
7828   Lisp_Object current, workbuf;
7829
7830   current = XCAR (arg);
7831   workbuf = XCDR (arg);
7832   if (! NILP (workbuf))
7833     {
7834       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7835         reused_workbuf_in_use = 0;
7836       else
7837         Fkill_buffer (workbuf);
7838     }
7839   set_buffer_internal (XBUFFER (current));
7840 }
7841
7842 Lisp_Object
7843 code_conversion_save (bool with_work_buf, bool multibyte)
7844 {
7845   Lisp_Object workbuf = Qnil;
7846
7847   if (with_work_buf)
7848     workbuf = make_conversion_work_buffer (multibyte);
7849   record_unwind_protect (code_conversion_restore,
7850                          Fcons (Fcurrent_buffer (), workbuf));
7851   return workbuf;
7852 }
7853
7854 static void
7855 coding_restore_undo_list (Lisp_Object arg)
7856 {
7857   Lisp_Object undo_list = XCAR (arg);
7858   struct buffer *buf = XBUFFER (XCDR (arg));
7859
7860   bset_undo_list (buf, undo_list);
7861 }
7862
7863 void
7864 decode_coding_gap (struct coding_system *coding,
7865                    ptrdiff_t chars, ptrdiff_t bytes)
7866 {
7867   ptrdiff_t count = SPECPDL_INDEX ();
7868   Lisp_Object attrs;
7869
7870   coding->src_object = Fcurrent_buffer ();
7871   coding->src_chars = chars;
7872   coding->src_bytes = bytes;
7873   coding->src_pos = -chars;
7874   coding->src_pos_byte = -bytes;
7875   coding->src_multibyte = chars < bytes;
7876   coding->dst_object = coding->src_object;
7877   coding->dst_pos = PT;
7878   coding->dst_pos_byte = PT_BYTE;
7879   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7880
7881   coding->head_ascii = -1;
7882   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7883   coding->eol_seen = EOL_SEEN_NONE;
7884   if (CODING_REQUIRE_DETECTION (coding))
7885     detect_coding (coding);
7886   attrs = CODING_ID_ATTRS (coding->id);
7887   if (! disable_ascii_optimization
7888       && ! coding->src_multibyte
7889       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7890       && NILP (CODING_ATTR_POST_READ (attrs))
7891       && NILP (get_translation_table (attrs, 0, NULL)))
7892     {
7893       chars = coding->head_ascii;
7894       if (chars < 0)
7895         chars = check_ascii (coding);
7896       if (chars != bytes)
7897         {
7898           /* There exists a non-ASCII byte.  */
7899           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7900               && coding->detected_utf8_bytes == coding->src_bytes)
7901             {
7902               if (coding->detected_utf8_chars >= 0)
7903                 chars = coding->detected_utf8_chars;
7904               else
7905                 chars = check_utf_8 (coding);
7906               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7907                   && coding->head_ascii == 0
7908                   && coding->source[0] == UTF_8_BOM_1
7909                   && coding->source[1] == UTF_8_BOM_2
7910                   && coding->source[2] == UTF_8_BOM_3)
7911                 {
7912                   chars--;
7913                   bytes -= 3;
7914                   coding->src_bytes -= 3;
7915                 }
7916             }
7917           else
7918             chars = -1;
7919         }
7920       if (chars >= 0)
7921         {
7922           Lisp_Object eol_type;
7923
7924           eol_type = CODING_ID_EOL_TYPE (coding->id);
7925           if (VECTORP (eol_type))
7926             {
7927               if (coding->eol_seen != EOL_SEEN_NONE)
7928                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7929             }
7930           if (EQ (eol_type, Qmac))
7931             {
7932               unsigned char *src_end = GAP_END_ADDR;
7933               unsigned char *src = src_end - coding->src_bytes;
7934
7935               while (src < src_end)
7936                 {
7937                   if (*src++ == '\r')
7938                     src[-1] = '\n';
7939                 }
7940             }
7941           else if (EQ (eol_type, Qdos))
7942             {
7943               unsigned char *src = GAP_END_ADDR;
7944               unsigned char *src_beg = src - coding->src_bytes;
7945               unsigned char *dst = src;
7946               ptrdiff_t diff;
7947
7948               while (src_beg < src)
7949                 {
7950                   *--dst = *--src;
7951                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7952                     src--;
7953                 }
7954               diff = dst - src;
7955               bytes -= diff;
7956               chars -= diff;
7957             }
7958           coding->produced = bytes;
7959           coding->produced_char = chars;
7960           insert_from_gap (chars, bytes, 1);
7961           return;
7962         }
7963     }
7964   code_conversion_save (0, 0);
7965
7966   coding->mode |= CODING_MODE_LAST_BLOCK;
7967   current_buffer->text->inhibit_shrinking = 1;
7968   decode_coding (coding);
7969   current_buffer->text->inhibit_shrinking = 0;
7970
7971   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7972     {
7973       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7974       Lisp_Object val;
7975       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
7976       ptrdiff_t count1 = SPECPDL_INDEX ();
7977
7978       record_unwind_protect (coding_restore_undo_list,
7979                              Fcons (undo_list, Fcurrent_buffer ()));
7980       bset_undo_list (current_buffer, Qt);
7981       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7982       val = call1 (CODING_ATTR_POST_READ (attrs),
7983                    make_number (coding->produced_char));
7984       CHECK_NATNUM (val);
7985       coding->produced_char += Z - prev_Z;
7986       coding->produced += Z_BYTE - prev_Z_BYTE;
7987       unbind_to (count1, Qnil);
7988     }
7989
7990   unbind_to (count, Qnil);
7991 }
7992
7993
7994 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7995    SRC_OBJECT into DST_OBJECT by coding context CODING.
7996
7997    SRC_OBJECT is a buffer, a string, or Qnil.
7998
7999    If it is a buffer, the text is at point of the buffer.  FROM and TO
8000    are positions in the buffer.
8001
8002    If it is a string, the text is at the beginning of the string.
8003    FROM and TO are indices to the string.
8004
8005    If it is nil, the text is at coding->source.  FROM and TO are
8006    indices to coding->source.
8007
8008    DST_OBJECT is a buffer, Qt, or Qnil.
8009
8010    If it is a buffer, the decoded text is inserted at point of the
8011    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8012    is deleted.
8013
8014    If it is Qt, a string is made from the decoded text, and
8015    set in CODING->dst_object.
8016
8017    If it is Qnil, the decoded text is stored at CODING->destination.
8018    The caller must allocate CODING->dst_bytes bytes at
8019    CODING->destination by xmalloc.  If the decoded text is longer than
8020    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8021  */
8022
8023 void
8024 decode_coding_object (struct coding_system *coding,
8025                       Lisp_Object src_object,
8026                       ptrdiff_t from, ptrdiff_t from_byte,
8027                       ptrdiff_t to, ptrdiff_t to_byte,
8028                       Lisp_Object dst_object)
8029 {
8030   ptrdiff_t count = SPECPDL_INDEX ();  /* Unwound by unbind_to at the end.  */
8031   unsigned char *destination UNINIT;
8032   ptrdiff_t dst_bytes UNINIT;
8033   ptrdiff_t chars = to - from;
8034   ptrdiff_t bytes = to_byte - from_byte;
8035   Lisp_Object attrs;
8036   ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;  /* saved_pt stays -1 unless point is moved below.  */
8037   bool need_marker_adjustment = 0;
8038   Lisp_Object old_deactivate_mark;
8039   /* Vdeactivate_mark is saved here and put back just before returning.  */
8040   old_deactivate_mark = Vdeactivate_mark;
8041   /* With a nil DST_OBJECT, remember the caller's output area and its size.  */
8042   if (NILP (dst_object))
8043     {
8044       destination = coding->destination;
8045       dst_bytes = coding->dst_bytes;
8046     }
8047   /* Describe the source; it counts as multibyte when it has fewer chars than bytes.  */
8048   coding->src_object = src_object;
8049   coding->src_chars = chars;
8050   coding->src_bytes = bytes;
8051   coding->src_multibyte = chars < bytes;
8052   /* Record where the source starts; a buffer source is made current and its gap moved to FROM.  */
8053   if (STRINGP (src_object))
8054     {
8055       coding->src_pos = from;
8056       coding->src_pos_byte = from_byte;
8057     }
8058   else if (BUFFERP (src_object))
8059     {
8060       set_buffer_internal (XBUFFER (src_object));
8061       if (from != GPT)
8062         move_gap_both (from, from_byte);
8063       if (EQ (src_object, dst_object))
8064         {
8065           struct Lisp_Marker *tail;
8066           /* Flag markers at FROM (insertion-type ones) or at TO (the others).  */
8067           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8068             {
8069               tail->need_adjustment
8070                 = tail->charpos == (tail->insertion_type ? from : to);
8071               need_marker_adjustment |= tail->need_adjustment;
8072             }
8073           saved_pt = PT, saved_pt_byte = PT_BYTE;
8074           TEMP_SET_PT_BOTH (from, from_byte);
8075           current_buffer->text->inhibit_shrinking = 1;  /* Reset to 0 at the end.  */
8076           del_range_both (from, from_byte, to, to_byte, 1);
8077           coding->src_pos = -chars;
8078           coding->src_pos_byte = -bytes;
8079         }
8080       else
8081         {
8082           coding->src_pos = from;
8083           coding->src_pos_byte = from_byte;
8084         }
8085     }
8086   /* Run detect_coding if CODING_REQUIRE_DETECTION asks for it, then read the attributes of coding->id.  */
8087   if (CODING_REQUIRE_DETECTION (coding))
8088     detect_coding (coding);
8089   attrs = CODING_ID_ATTRS (coding->id);
8090   /* Destination: a buffer from code_conversion_save, DST_OBJECT itself, or none (nil).  */
8091   if (EQ (dst_object, Qt)
8092       || (! NILP (CODING_ATTR_POST_READ (attrs))
8093           && NILP (dst_object)))
8094     {
8095       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8096       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8097       coding->dst_pos = BEG;
8098       coding->dst_pos_byte = BEG_BYTE;
8099     }
8100   else if (BUFFERP (dst_object))
8101     {
8102       code_conversion_save (0, 0);
8103       coding->dst_object = dst_object;
8104       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8105       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8106       coding->dst_multibyte
8107         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8108     }
8109   else
8110     {
8111       code_conversion_save (0, 0);
8112       coding->dst_object = Qnil;
8113       /* Most callers presume this will return a multibyte result, and they
8114          won't use `binary' or `raw-text' anyway, so let's not worry about
8115          CODING_FOR_UNIBYTE.  */
8116       coding->dst_multibyte = 1;
8117     }
8118   /* Do the conversion.  */
8119   decode_coding (coding);
8120   /* Make the destination buffer, if there is one, current.  */
8121   if (BUFFERP (coding->dst_object))
8122     set_buffer_internal (XBUFFER (coding->dst_object));
8123   /* Call the post-read function with point at coding->dst_pos and the buffer's undo list set to t.  */
8124   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8125     {
8126       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8127       Lisp_Object val;
8128       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
8129       ptrdiff_t count1 = SPECPDL_INDEX ();
8130       /* The old undo list and the buffer are handed to coding_restore_undo_list on unwind.  */
8131       record_unwind_protect (coding_restore_undo_list,
8132                              Fcons (undo_list, Fcurrent_buffer ()));
8133       bset_undo_list (current_buffer, Qt);
8134       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8135       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8136                         make_number (coding->produced_char));
8137       CHECK_NATNUM (val);
8138       coding->produced_char += Z - prev_Z;  /* Account for the change in Z.  */
8139       coding->produced += Z_BYTE - prev_Z_BYTE;
8140       unbind_to (count1, Qnil);
8141     }
8142   /* Hand over the result: a string for Qt, or bytes copied to CODING->destination for nil.  */
8143   if (EQ (dst_object, Qt))
8144     {
8145       coding->dst_object = Fbuffer_string ();
8146     }
8147   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8148     {
8149       set_buffer_internal (XBUFFER (coding->dst_object));
8150       if (dst_bytes < coding->produced)  /* NOTE(review): no copy is made otherwise -- confirm intended.  */
8151         {
8152           eassert (coding->produced > 0);
8153           destination = xrealloc (destination, coding->produced);
8154           if (BEGV < GPT && GPT < BEGV + coding->produced_char)  /* Gap lies within the text to copy.  */
8155             move_gap_both (BEGV, BEGV_BYTE);
8156           memcpy (destination, BEGV_ADDR, coding->produced);
8157           coding->destination = destination;
8158         }
8159     }
8160
8161   if (saved_pt >= 0)
8162     {
8163       /* This is the case of:
8164          (BUFFERP (src_object) && EQ (src_object, dst_object))
8165          As we have moved PT while replacing the original buffer
8166          contents, we must recover it now.  */
8167       set_buffer_internal (XBUFFER (src_object));
8168       current_buffer->text->inhibit_shrinking = 0;
8169       if (saved_pt < from)  /* Point was before the converted text.  */
8170         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8171       else if (saved_pt < from + chars)  /* Point was inside it.  */
8172         TEMP_SET_PT_BOTH (from, from_byte);
8173       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8174         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8175                           saved_pt_byte + (coding->produced - bytes));
8176       else  /* Multibyte disabled: the byte delta is used for both positions.  */
8177         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8178                           saved_pt_byte + (coding->produced - bytes));
8179       /* Put each flagged marker at the start or the end of the decoded text.  */
8180       if (need_marker_adjustment)
8181         {
8182           struct Lisp_Marker *tail;
8183
8184           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8185             if (tail->need_adjustment)
8186               {
8187                 tail->need_adjustment = 0;
8188                 if (tail->insertion_type)
8189                   {
8190                     tail->bytepos = from_byte;
8191                     tail->charpos = from;
8192                   }
8193                 else
8194                   {
8195                     tail->bytepos = from_byte + coding->produced;
8196                     tail->charpos
8197                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8198                          ? tail->bytepos : from + coding->produced_char);
8199                   }
8200               }
8201         }
8202     }
8203   /* Restore the mark state; CODING->dst_object is passed through unbind_to.  */
8204   Vdeactivate_mark = old_deactivate_mark;
8205   unbind_to (count, coding->dst_object);
8206 }
8207 /* Encode the text FROM..TO (bytes FROM_BYTE..TO_BYTE) of SRC_OBJECT by coding context CODING.  */
8208 /* DST_OBJECT selects the output: a buffer, or Qt for a unibyte string put in CODING->dst_object.  */
8209 void
8210 encode_coding_object (struct coding_system *coding,
8211                       Lisp_Object src_object,
8212                       ptrdiff_t from, ptrdiff_t from_byte,
8213                       ptrdiff_t to, ptrdiff_t to_byte,
8214                       Lisp_Object dst_object)
8215 {
8216   ptrdiff_t count = SPECPDL_INDEX ();  /* Unwound by unbind_to at the end.  */
8217   ptrdiff_t chars = to - from;
8218   ptrdiff_t bytes = to_byte - from_byte;
8219   Lisp_Object attrs;
8220   ptrdiff_t saved_pt = -1, saved_pt_byte;  /* saved_pt stays -1 unless converting in place.  */
8221   bool need_marker_adjustment = 0;
8222   bool kill_src_buffer = 0;
8223   Lisp_Object old_deactivate_mark;
8224   /* Vdeactivate_mark is saved here and put back just before returning.  */
8225   old_deactivate_mark = Vdeactivate_mark;
8226   /* Describe the source as given; the pre-write branch below recomputes these fields.  */
8227   coding->src_object = src_object;
8228   coding->src_chars = chars;
8229   coding->src_bytes = bytes;
8230   coding->src_multibyte = chars < bytes;
8231
8232   attrs = CODING_ID_ATTRS (coding->id);
8233   /* Same source and destination: flag markers at FROM (insertion-type ones) or at TO (the others).  */
8234   if (EQ (src_object, dst_object))
8235     {
8236       struct Lisp_Marker *tail;
8237       /* NOTE(review): scans current_buffer's markers before any buffer switch -- presumably SRC_OBJECT is current; confirm.  */
8238       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8239         {
8240           tail->need_adjustment
8241             = tail->charpos == (tail->insertion_type ? from : to);
8242           need_marker_adjustment |= tail->need_adjustment;
8243         }
8244     }
8245   /* With a pre-write function, copy the source into a buffer from code_conversion_save and call it there.  */
8246   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8247     {
8248       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8249       set_buffer_internal (XBUFFER (coding->src_object));
8250       if (STRINGP (src_object))
8251         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8252       else if (BUFFERP (src_object))
8253         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8254       else
8255         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8256       /* In-place conversion: the original text is deleted once it has been copied.  */
8257       if (EQ (src_object, dst_object))
8258         {
8259           set_buffer_internal (XBUFFER (src_object));
8260           saved_pt = PT, saved_pt_byte = PT_BYTE;
8261           del_range_both (from, from_byte, to, to_byte, 1);
8262           set_buffer_internal (XBUFFER (coding->src_object));
8263         }
8264       /* Encode from whichever buffer is current after the call; if it differs, it is killed at the end.  */
8265       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8266                   make_number (BEG), make_number (Z));
8267       if (XBUFFER (coding->src_object) != current_buffer)
8268         kill_src_buffer = 1;
8269       coding->src_object = Fcurrent_buffer ();
8270       if (BEG != GPT)
8271         move_gap_both (BEG, BEG_BYTE);
8272       coding->src_chars = Z - BEG;
8273       coding->src_bytes = Z_BYTE - BEG_BYTE;
8274       coding->src_pos = BEG;
8275       coding->src_pos_byte = BEG_BYTE;
8276       coding->src_multibyte = Z < Z_BYTE;
8277     }
8278   else if (STRINGP (src_object))
8279     {
8280       code_conversion_save (0, 0);
8281       coding->src_pos = from;
8282       coding->src_pos_byte = from_byte;
8283     }
8284   else if (BUFFERP (src_object))
8285     {
8286       code_conversion_save (0, 0);
8287       set_buffer_internal (XBUFFER (src_object));
8288       if (EQ (src_object, dst_object))
8289         {
8290           saved_pt = PT, saved_pt_byte = PT_BYTE;
8291           coding->src_object = del_range_1 (from, to, 1, 1);  /* The value of del_range_1 is the new source.  */
8292           coding->src_pos = 0;
8293           coding->src_pos_byte = 0;
8294         }
8295       else
8296         {
8297           if (from < GPT && to >= GPT)
8298             move_gap_both (from, from_byte);
8299           coding->src_pos = from;
8300           coding->src_pos_byte = from_byte;
8301         }
8302     }
8303   else
8304     {
8305       code_conversion_save (0, 0);
8306       coding->src_pos = from;
8307       coding->src_pos_byte = from_byte;
8308     }
8309   /* Set up the destination: a buffer position, a freshly xmalloc'ed area for Qt, or nil.  */
8310   if (BUFFERP (dst_object))
8311     {
8312       coding->dst_object = dst_object;
8313       if (EQ (src_object, dst_object))
8314         {
8315           coding->dst_pos = from;
8316           coding->dst_pos_byte = from_byte;
8317         }
8318       else
8319         {
8320           struct buffer *current = current_buffer;
8321           /* Read DST_OBJECT's point and move its gap there, then switch back.  */
8322           set_buffer_temp (XBUFFER (dst_object));
8323           coding->dst_pos = PT;
8324           coding->dst_pos_byte = PT_BYTE;
8325           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8326           set_buffer_temp (current);
8327         }
8328       coding->dst_multibyte
8329         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8330     }
8331   else if (EQ (dst_object, Qt))
8332     {
8333       ptrdiff_t dst_bytes = max (1, coding->src_chars);  /* At least one byte.  */
8334       coding->dst_object = Qnil;
8335       coding->destination = xmalloc (dst_bytes);
8336       coding->dst_bytes = dst_bytes;
8337       coding->dst_multibyte = 0;
8338     }
8339   else
8340     {
8341       coding->dst_object = Qnil;
8342       coding->dst_multibyte = 0;
8343     }
8344   /* Do the conversion.  */
8345   encode_coding (coding);
8346   /* For Qt, turn the output into a Lisp string unless raw_destination is set.  */
8347   if (EQ (dst_object, Qt))
8348     {
8349       if (BUFFERP (coding->dst_object))
8350         coding->dst_object = Fbuffer_string ();
8351       else if (coding->raw_destination)
8352         /* This is used to avoid creating huge Lisp string.
8353            NOTE: caller who sets `raw_destination' is also
8354            responsible for freeing `destination' buffer.  */
8355         coding->dst_object = Qnil;
8356       else
8357         {
8358           coding->dst_object
8359             = make_unibyte_string ((char *) coding->destination,
8360                                    coding->produced);
8361           xfree (coding->destination);
8362         }
8363     }
8364
8365   if (saved_pt >= 0)
8366     {
8367       /* This is the case of:
8368          (BUFFERP (src_object) && EQ (src_object, dst_object))
8369          As we have moved PT while replacing the original buffer
8370          contents, we must recover it now.  */
8371       set_buffer_internal (XBUFFER (src_object));
8372       if (saved_pt < from)  /* Point was before the converted text.  */
8373         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8374       else if (saved_pt < from + chars)  /* Point was inside it.  */
8375         TEMP_SET_PT_BOTH (from, from_byte);
8376       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8377         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8378                           saved_pt_byte + (coding->produced - bytes));
8379       else  /* Multibyte disabled: the byte delta is used for both positions.  */
8380         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8381                           saved_pt_byte + (coding->produced - bytes));
8382       /* Put each flagged marker at the start or the end of the encoded text.  */
8383       if (need_marker_adjustment)
8384         {
8385           struct Lisp_Marker *tail;
8386
8387           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8388             if (tail->need_adjustment)
8389               {
8390                 tail->need_adjustment = 0;
8391                 if (tail->insertion_type)
8392                   {
8393                     tail->bytepos = from_byte;
8394                     tail->charpos = from;
8395                   }
8396                 else
8397                   {
8398                     tail->bytepos = from_byte + coding->produced;
8399                     tail->charpos
8400                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8401                          ? tail->bytepos : from + coding->produced_char);
8402                   }
8403               }
8404         }
8405     }
8406   /* Kill the buffer the pre-write function left current, if it was not the one set up above.  */
8407   if (kill_src_buffer)
8408     Fkill_buffer (coding->src_object);
8409   /* Restore the mark state and unwind.  */
8410   Vdeactivate_mark = old_deactivate_mark;
8411   unbind_to (count, Qnil);
8412 }
8413
8414
8415 Lisp_Object
8416 preferred_coding_system (void)
8417 {
8418   /* Report the name of the system stored for category coding_priorities[0].  */
8419   int top_id = (coding_categories + coding_priorities[0])->id;
8420   return CODING_ID_NAME (top_id);
8421 }
8422
8423 #if defined (WINDOWSNT) || defined (CYGWIN)
8424
8425 Lisp_Object
8426 from_unicode (Lisp_Object str)
8427 {
8428   CHECK_STRING (str);
8429
8430   /* A unibyte string with an odd number of bytes loses its last byte
8431      before the conversion.  */
8432   bool odd_unibyte = ! STRING_MULTIBYTE (str) && (SBYTES (str) & 1) != 0;
8433   if (odd_unibyte)
8434     str = Fsubstring (str, make_number (0), make_number (-1));
8435   return code_convert_string_norecord (str, Qutf_16le, 0);
8436 }
8437
8438 Lisp_Object
8439 from_unicode_buffer (const wchar_t *wstr)
8440 {
8441   /* One extra byte: one of the two final null bytes comes for free.  */
8442   ptrdiff_t nbytes = wcslen (wstr) * sizeof (wchar_t) + 1;
8443   AUTO_STRING_WITH_LEN (raw, (char *) wstr, nbytes);
8444   return from_unicode (raw);
8445 }
8446
8447 wchar_t *
8448 to_unicode (Lisp_Object str, Lisp_Object *buf)
8449 {
8450   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8451   ptrdiff_t nbytes = SBYTES (*buf);
8452
8453   /* The result must be _doubly_ zero terminated, i.e. end with a full
8454      utf-16le null character.  Strings already end with a single zero
8455      byte, so copy the converted bytes into a string one byte longer
8456      and store one additional zero in the extra byte.  */
8457   Lisp_Object padded = make_uninit_string (nbytes + 1);
8458   memcpy (SDATA (padded), SDATA (*buf), nbytes);
8459   SDATA (padded)[nbytes] = '\0';
8460   *buf = padded;
8461   return WCSDATA (*buf);
8462 }
8463
8464 #endif /* WINDOWSNT || CYGWIN */
8465
8466 \f
8467 #ifdef emacs
8468 /*** 11. Emacs Lisp library functions ***/
8469
8470 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8471        doc: /* Return t if OBJECT is nil or a coding-system.
8472 See the documentation of `define-coding-system' for information
8473 about coding-system objects.  */)
8474   (Lisp_Object object)
8475 {
8476   /* nil, or anything whose CODING_SYSTEM_ID is non-negative, qualifies.  */
8477   if (NILP (object) || CODING_SYSTEM_ID (object) >= 0)
8478     return Qt;
8479   /* So does a symbol with a non-nil Qcoding_system_define_form property.  */
8480   bool has_define_form = (SYMBOLP (object)
8481                           && ! NILP (Fget (object, Qcoding_system_define_form)));
8482   return has_define_form ? Qt : Qnil;
8483 }
8484
8485 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8486        Sread_non_nil_coding_system, 1, 1, 0,
8487        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8488   (Lisp_Object prompt)
8489 {
8490   for (;;)  /* Ask again for as long as the input is empty.  */
8491     {
8492       Lisp_Object input
8493         = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8494                             Qcoding_system_history, Qnil, Qnil);
8495       if (SCHARS (input) != 0)
8496         return Fintern (input, Qnil);
8497     }
8498 }
8499
8500 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8501        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8502 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8503 Ignores case when completing coding systems (all Emacs coding systems
8504 are lower-case).  */)
8505   (Lisp_Object prompt, Lisp_Object default_coding_system)
8506 {
8507   ptrdiff_t specpdl_mark = SPECPDL_INDEX ();
8508   /* A symbol default is handed to the reader as its name string.  */
8509   Lisp_Object fallback = (SYMBOLP (default_coding_system)
8510                           ? SYMBOL_NAME (default_coding_system)
8511                           : default_coding_system);
8512   specbind (Qcompletion_ignore_case, Qt);
8513   Lisp_Object input
8514     = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8515                         Qcoding_system_history, fallback, Qnil);
8516   unbind_to (specpdl_mark, Qnil);
8517   return SCHARS (input) == 0 ? Qnil : Fintern (input, Qnil);
8518 }
8519
8520 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8521        1, 1, 0,
8522        doc: /* Check validity of CODING-SYSTEM.
8523 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8524 It is valid if it is nil or a symbol defined as a coding system by the
8525 function `define-coding-system'.  */)
8526   (Lisp_Object coding_system)
8527 {
8528   /* Evaluate a stored definition form, clearing the property first.  */
8529   Lisp_Object pending = Fget (coding_system, Qcoding_system_define_form);
8530   if (! NILP (pending))
8531     {
8532       Fput (coding_system, Qcoding_system_define_form, Qnil);
8533       safe_eval (pending);
8534     }
8535
8536   if (NILP (Fcoding_system_p (coding_system)))
8537     xsignal1 (Qcoding_system_error, coding_system);
8538   return coding_system;
8539 }
8540
8541 \f
8542 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8543    HIGHEST, return the coding system of the highest
8544    priority among the detected coding systems.  Otherwise return a
8545    list of detected coding systems sorted by their priorities.  If
8546    MULTIBYTEP, it is assumed that the bytes are in correct
8547    multibyte form but contain only ASCII and eight-bit chars.
8548    Otherwise, the bytes are raw bytes.
8549
8550    CODING-SYSTEM controls the detection as below:
8551
8552    If it is nil, detect both text-format and eol-format.  If the
8553    text-format part of CODING-SYSTEM is already specified
8554    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8555    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8556    detect only text-format.  */
8557
8558 Lisp_Object
8559 detect_coding_system (const unsigned char *src,
8560                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8561                       bool highest, bool multibytep,
8562                       Lisp_Object coding_system)
8563 {
8564   const unsigned char *src_end = src + src_bytes;
8565   Lisp_Object attrs, eol_type;
8566   Lisp_Object val = Qnil;
8567   struct coding_system coding;
8568   ptrdiff_t id;
8569   struct coding_detection_info detect_info;
8570   enum coding_category base_category;
8571   bool null_byte_found = 0, eight_bit_found = 0;
8572
8573   if (NILP (coding_system))
8574     coding_system = Qundecided;
8575   setup_coding_system (coding_system, &coding);
8576   attrs = CODING_ID_ATTRS (coding.id);
8577   eol_type = CODING_ID_EOL_TYPE (coding.id);
8578   coding_system = CODING_ATTR_BASE_NAME (attrs);
8579
8580   coding.source = src;
8581   coding.src_chars = src_chars;
8582   coding.src_bytes = src_bytes;
8583   coding.src_multibyte = multibytep;
8584   coding.consumed = 0;
8585   coding.mode |= CODING_MODE_LAST_BLOCK;
8586   coding.head_ascii = 0;
8587
8588   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8589
8590   /* At first, detect text-format if necessary.  */
8591   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8592   if (base_category == coding_category_undecided)
8593     {
8594       enum coding_category category UNINIT;
8595       struct coding_system *this UNINIT;
8596       int c, i;
8597       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8598                                        inhibit_null_byte_detection);
8599       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8600                                        inhibit_iso_escape_detection);
8601       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8602
8603       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8604       for (; src < src_end; src++)
8605         {
8606           c = *src;
8607           if (c & 0x80)
8608             {
8609               eight_bit_found = 1;
8610               if (null_byte_found)
8611                 break;
8612             }
8613           else if (c < 0x20)
8614             {
8615               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8616                   && ! inhibit_ied
8617                   && ! detect_info.checked)
8618                 {
8619                   if (detect_coding_iso_2022 (&coding, &detect_info))
8620                     {
8621                       /* We have scanned the whole data.  */
8622                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8623                         {
8624                           /* We didn't find an 8-bit code.  We may
8625                              have found a null-byte, but it's very
8626                              rare that a binary file conforms to
8627                              ISO-2022.  */
8628                           src = src_end;
8629                           coding.head_ascii = src - coding.source;
8630                         }
8631                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8632                       break;
8633                     }
8634                 }
8635               else if (! c && !inhibit_nbd)
8636                 {
8637                   null_byte_found = 1;
8638                   if (eight_bit_found)
8639                     break;
8640                 }
8641               if (! eight_bit_found)
8642                 coding.head_ascii++;
8643             }
8644           else if (! eight_bit_found)
8645             coding.head_ascii++;
8646         }
8647
8648       if (null_byte_found || eight_bit_found
8649           || coding.head_ascii < coding.src_bytes
8650           || detect_info.found)
8651         {
8652           if (coding.head_ascii == coding.src_bytes)
8653             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8654             for (i = 0; i < coding_category_raw_text; i++)
8655               {
8656                 category = coding_priorities[i];
8657                 this = coding_categories + category;
8658                 if (detect_info.found & (1 << category))
8659                   break;
8660               }
8661           else
8662             {
8663               if (null_byte_found)
8664                 {
8665                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8666                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8667                 }
8668               else if (prefer_utf_8
8669                        && detect_coding_utf_8 (&coding, &detect_info))
8670                 {
8671                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8672                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8673                 }
8674               for (i = 0; i < coding_category_raw_text; i++)
8675                 {
8676                   category = coding_priorities[i];
8677                   this = coding_categories + category;
8678
8679                   if (this->id < 0)
8680                     {
8681                       /* No coding system of this category is defined.  */
8682                       detect_info.rejected |= (1 << category);
8683                     }
8684                   else if (category >= coding_category_raw_text)
8685                     continue;
8686                   else if (detect_info.checked & (1 << category))
8687                     {
8688                       if (highest
8689                           && (detect_info.found & (1 << category)))
8690                         break;
8691                     }
8692                   else if ((*(this->detector)) (&coding, &detect_info)
8693                            && highest
8694                            && (detect_info.found & (1 << category)))
8695                     {
8696                       if (category == coding_category_utf_16_auto)
8697                         {
8698                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8699                             category = coding_category_utf_16_le;
8700                           else
8701                             category = coding_category_utf_16_be;
8702                         }
8703                       break;
8704                     }
8705                 }
8706             }
8707         }
8708
8709       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8710           || null_byte_found)
8711         {
8712           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8713           id = CODING_SYSTEM_ID (Qno_conversion);
8714           val = list1 (make_number (id));
8715         }
8716       else if (! detect_info.rejected && ! detect_info.found)
8717         {
8718           detect_info.found = CATEGORY_MASK_ANY;
8719           id = coding_categories[coding_category_undecided].id;
8720           val = list1 (make_number (id));
8721         }
8722       else if (highest)
8723         {
8724           if (detect_info.found)
8725             {
8726               detect_info.found = 1 << category;
8727               val = list1 (make_number (this->id));
8728             }
8729           else
8730             for (i = 0; i < coding_category_raw_text; i++)
8731               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8732                 {
8733                   detect_info.found = 1 << coding_priorities[i];
8734                   id = coding_categories[coding_priorities[i]].id;
8735                   val = list1 (make_number (id));
8736                   break;
8737                 }
8738         }
8739       else
8740         {
8741           int mask = detect_info.rejected | detect_info.found;
8742           int found = 0;
8743
8744           for (i = coding_category_raw_text - 1; i >= 0; i--)
8745             {
8746               category = coding_priorities[i];
8747               if (! (mask & (1 << category)))
8748                 {
8749                   found |= 1 << category;
8750                   id = coding_categories[category].id;
8751                   if (id >= 0)
8752                     val = list1 (make_number (id));
8753                 }
8754             }
8755           for (i = coding_category_raw_text - 1; i >= 0; i--)
8756             {
8757               category = coding_priorities[i];
8758               if (detect_info.found & (1 << category))
8759                 {
8760                   id = coding_categories[category].id;
8761                   val = Fcons (make_number (id), val);
8762                 }
8763             }
8764           detect_info.found |= found;
8765         }
8766     }
8767   else if (base_category == coding_category_utf_8_auto)
8768     {
8769       if (detect_coding_utf_8 (&coding, &detect_info))
8770         {
8771           struct coding_system *this;
8772
8773           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8774             this = coding_categories + coding_category_utf_8_sig;
8775           else
8776             this = coding_categories + coding_category_utf_8_nosig;
8777           val = list1 (make_number (this->id));
8778         }
8779     }
8780   else if (base_category == coding_category_utf_16_auto)
8781     {
8782       if (detect_coding_utf_16 (&coding, &detect_info))
8783         {
8784           struct coding_system *this;
8785
8786           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8787             this = coding_categories + coding_category_utf_16_le;
8788           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8789             this = coding_categories + coding_category_utf_16_be;
8790           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8791             this = coding_categories + coding_category_utf_16_be_nosig;
8792           else
8793             this = coding_categories + coding_category_utf_16_le_nosig;
8794           val = list1 (make_number (this->id));
8795         }
8796     }
8797   else
8798     {
8799       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8800       val = list1 (make_number (coding.id));
8801     }
8802
8803   /* Then, detect eol-format if necessary.  */
8804   {
8805     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8806     Lisp_Object tail;
8807
8808     if (VECTORP (eol_type))
8809       {
8810         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8811           {
8812             if (null_byte_found)
8813               normal_eol = EOL_SEEN_LF;
8814             else
8815               normal_eol = detect_eol (coding.source, src_bytes,
8816                                        coding_category_raw_text);
8817           }
8818         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8819                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8820           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8821                                       coding_category_utf_16_be);
8822         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8823                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8824           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8825                                       coding_category_utf_16_le);
8826       }
8827     else
8828       {
8829         if (EQ (eol_type, Qunix))
8830           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8831         else if (EQ (eol_type, Qdos))
8832           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8833         else
8834           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8835       }
8836
8837     for (tail = val; CONSP (tail); tail = XCDR (tail))
8838       {
8839         enum coding_category category;
8840         int this_eol;
8841
8842         id = XINT (XCAR (tail));
8843         attrs = CODING_ID_ATTRS (id);
8844         category = XINT (CODING_ATTR_CATEGORY (attrs));
8845         eol_type = CODING_ID_EOL_TYPE (id);
8846         if (VECTORP (eol_type))
8847           {
8848             if (category == coding_category_utf_16_be
8849                 || category == coding_category_utf_16_be_nosig)
8850               this_eol = utf_16_be_eol;
8851             else if (category == coding_category_utf_16_le
8852                      || category == coding_category_utf_16_le_nosig)
8853               this_eol = utf_16_le_eol;
8854             else
8855               this_eol = normal_eol;
8856
8857             if (this_eol == EOL_SEEN_LF)
8858               XSETCAR (tail, AREF (eol_type, 0));
8859             else if (this_eol == EOL_SEEN_CRLF)
8860               XSETCAR (tail, AREF (eol_type, 1));
8861             else if (this_eol == EOL_SEEN_CR)
8862               XSETCAR (tail, AREF (eol_type, 2));
8863             else
8864               XSETCAR (tail, CODING_ID_NAME (id));
8865           }
8866         else
8867           XSETCAR (tail, CODING_ID_NAME (id));
8868       }
8869   }
8870
8871   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8872 }
8873
8874
8875 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8876        2, 3, 0,
8877        doc: /* Detect coding system of the text in the region between START and END.
8878 Return a list of possible coding systems ordered by priority.
8879 The coding systems to try and their priorities follows what
8880 the function `coding-system-priority-list' (which see) returns.
8881
8882 If only ASCII characters are found (except for such ISO-2022 control
8883 characters as ESC), it returns a list of single element `undecided'
8884 or its subsidiary coding system according to a detected end-of-line
8885 format.
8886
8887 If optional argument HIGHEST is non-nil, return the coding system of
8888 highest priority.  */)
8889   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8890 {
8891   ptrdiff_t from, to;
8892   ptrdiff_t from_byte, to_byte;
8893
8894   validate_region (&start, &end);
8895   from = XINT (start), to = XINT (end);
8896   from_byte = CHAR_TO_BYTE (from);
8897   to_byte = CHAR_TO_BYTE (to);
8898
8899   if (from < GPT && to >= GPT)
8900     move_gap_both (to, to_byte);
8901
8902   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8903                                to - from, to_byte - from_byte,
8904                                !NILP (highest),
8905                                !NILP (BVAR (current_buffer
8906                                       , enable_multibyte_characters)),
8907                                Qnil);
8908 }
8909
8910 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8911        1, 2, 0,
8912        doc: /* Detect coding system of the text in STRING.
8913 Return a list of possible coding systems ordered by priority.
8914 The coding systems to try and their priorities follows what
8915 the function `coding-system-priority-list' (which see) returns.
8916
8917 If only ASCII characters are found (except for such ISO-2022 control
8918 characters as ESC), it returns a list of single element `undecided'
8919 or its subsidiary coding system according to a detected end-of-line
8920 format.
8921
8922 If optional argument HIGHEST is non-nil, return the coding system of
8923 highest priority.  */)
8924   (Lisp_Object string, Lisp_Object highest)
8925 {
8926   CHECK_STRING (string);
8927
8928   return detect_coding_system (SDATA (string),
8929                                SCHARS (string), SBYTES (string),
8930                                !NILP (highest), STRING_MULTIBYTE (string),
8931                                Qnil);
8932 }
8933
8934
8935 static bool
8936 char_encodable_p (int c, Lisp_Object attrs)
8937 {
8938   Lisp_Object tail;
8939   struct charset *charset;
8940   Lisp_Object translation_table;
8941
8942   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8943   if (! NILP (translation_table))
8944     c = translate_char (translation_table, c);
8945   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8946        CONSP (tail); tail = XCDR (tail))
8947     {
8948       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8949       if (CHAR_CHARSET_P (c, charset))
8950         break;
8951     }
8952   return (! NILP (tail));
8953 }
8954
8955
8956 /* Return a list of coding systems that safely encode the text between
8957    START and END.  If EXCLUDE is non-nil, it is a list of coding
8958    systems not to check.  The returned list doesn't contain any such
8959    coding systems.  In any case, if the text contains only ASCII or is
8960    unibyte, return t.  */
8961
8962 DEFUN ("find-coding-systems-region-internal",
8963        Ffind_coding_systems_region_internal,
8964        Sfind_coding_systems_region_internal, 2, 3, 0,
8965        doc: /* Internal use only.  */)
8966   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8967 {
8968   Lisp_Object coding_attrs_list, safe_codings;
8969   ptrdiff_t start_byte, end_byte;
8970   const unsigned char *p, *pbeg, *pend;
8971   int c;
8972   Lisp_Object tail, elt, work_table;
8973
8974   if (STRINGP (start))
8975     {
8976       if (!STRING_MULTIBYTE (start)
8977           || SCHARS (start) == SBYTES (start))
8978         return Qt;
8979       start_byte = 0;
8980       end_byte = SBYTES (start);
8981     }
8982   else
8983     {
8984       CHECK_NUMBER_COERCE_MARKER (start);
8985       CHECK_NUMBER_COERCE_MARKER (end);
8986       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8987         args_out_of_range (start, end);
8988       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8989         return Qt;
8990       start_byte = CHAR_TO_BYTE (XINT (start));
8991       end_byte = CHAR_TO_BYTE (XINT (end));
8992       if (XINT (end) - XINT (start) == end_byte - start_byte)
8993         return Qt;
8994
8995       if (XINT (start) < GPT && XINT (end) > GPT)
8996         {
8997           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8998             move_gap_both (XINT (start), start_byte);
8999           else
9000             move_gap_both (XINT (end), end_byte);
9001         }
9002     }
9003
9004   coding_attrs_list = Qnil;
9005   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9006     if (NILP (exclude)
9007         || NILP (Fmemq (XCAR (tail), exclude)))
9008       {
9009         Lisp_Object attrs;
9010
9011         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9012         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9013           {
9014             ASET (attrs, coding_attr_trans_tbl,
9015                   get_translation_table (attrs, 1, NULL));
9016             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9017           }
9018       }
9019
9020   if (STRINGP (start))
9021     p = pbeg = SDATA (start);
9022   else
9023     p = pbeg = BYTE_POS_ADDR (start_byte);
9024   pend = p + (end_byte - start_byte);
9025
9026   while (p < pend && ASCII_CHAR_P (*p)) p++;
9027   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9028
9029   work_table = Fmake_char_table (Qnil, Qnil);
9030   while (p < pend)
9031     {
9032       if (ASCII_CHAR_P (*p))
9033         p++;
9034       else
9035         {
9036           c = STRING_CHAR_ADVANCE (p);
9037           if (!NILP (char_table_ref (work_table, c)))
9038             /* This character was already checked.  Ignore it.  */
9039             continue;
9040
9041           charset_map_loaded = 0;
9042           for (tail = coding_attrs_list; CONSP (tail);)
9043             {
9044               elt = XCAR (tail);
9045               if (NILP (elt))
9046                 tail = XCDR (tail);
9047               else if (char_encodable_p (c, elt))
9048                 tail = XCDR (tail);
9049               else if (CONSP (XCDR (tail)))
9050                 {
9051                   XSETCAR (tail, XCAR (XCDR (tail)));
9052                   XSETCDR (tail, XCDR (XCDR (tail)));
9053                 }
9054               else
9055                 {
9056                   XSETCAR (tail, Qnil);
9057                   tail = XCDR (tail);
9058                 }
9059             }
9060           if (charset_map_loaded)
9061             {
9062               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9063
9064               if (STRINGP (start))
9065                 pbeg = SDATA (start);
9066               else
9067                 pbeg = BYTE_POS_ADDR (start_byte);
9068               p = pbeg + p_offset;
9069               pend = pbeg + pend_offset;
9070             }
9071           char_table_set (work_table, c, Qt);
9072         }
9073     }
9074
9075   safe_codings = list2 (Qraw_text, Qno_conversion);
9076   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9077     if (! NILP (XCAR (tail)))
9078       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9079
9080   return safe_codings;
9081 }
9082
9083
9084 DEFUN ("unencodable-char-position", Funencodable_char_position,
9085        Sunencodable_char_position, 3, 5, 0,
9086        doc: /* Return position of first un-encodable character in a region.
9087 START and END specify the region and CODING-SYSTEM specifies the
9088 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9089
9090 If optional 4th argument COUNT is non-nil, it specifies at most how
9091 many un-encodable characters to search.  In this case, the value is a
9092 list of positions.
9093
9094 If optional 5th argument STRING is non-nil, it is a string to search
9095 for un-encodable characters.  In that case, START and END are indexes
9096 to the string and treated as in `substring'.  */)
9097   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9098    Lisp_Object count, Lisp_Object string)
9099 {
9100   EMACS_INT n;
9101   struct coding_system coding;
9102   Lisp_Object attrs, charset_list, translation_table;
9103   Lisp_Object positions;
9104   ptrdiff_t from, to;
9105   const unsigned char *p, *stop, *pend;
9106   bool ascii_compatible;
9107
9108   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9109   attrs = CODING_ID_ATTRS (coding.id);
9110   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9111     return Qnil;
9112   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9113   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9114   translation_table = get_translation_table (attrs, 1, NULL);
9115
9116   if (NILP (string))
9117     {
9118       validate_region (&start, &end);
9119       from = XINT (start);
9120       to = XINT (end);
9121       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9122           || (ascii_compatible
9123               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9124         return Qnil;
9125       p = CHAR_POS_ADDR (from);
9126       pend = CHAR_POS_ADDR (to);
9127       if (from < GPT && to >= GPT)
9128         stop = GPT_ADDR;
9129       else
9130         stop = pend;
9131     }
9132   else
9133     {
9134       CHECK_STRING (string);
9135       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9136       if (! STRING_MULTIBYTE (string))
9137         return Qnil;
9138       p = SDATA (string) + string_char_to_byte (string, from);
9139       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9140       if (ascii_compatible && (to - from) == (pend - p))
9141         return Qnil;
9142     }
9143
9144   if (NILP (count))
9145     n = 1;
9146   else
9147     {
9148       CHECK_NATNUM (count);
9149       n = XINT (count);
9150     }
9151
9152   positions = Qnil;
9153   charset_map_loaded = 0;
9154   while (1)
9155     {
9156       int c;
9157
9158       if (ascii_compatible)
9159         while (p < stop && ASCII_CHAR_P (*p))
9160           p++, from++;
9161       if (p >= stop)
9162         {
9163           if (p >= pend)
9164             break;
9165           stop = pend;
9166           p = GAP_END_ADDR;
9167         }
9168
9169       c = STRING_CHAR_ADVANCE (p);
9170       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9171           && ! char_charset (translate_char (translation_table, c),
9172                              charset_list, NULL))
9173         {
9174           positions = Fcons (make_number (from), positions);
9175           n--;
9176           if (n == 0)
9177             break;
9178         }
9179
9180       from++;
9181       if (charset_map_loaded && NILP (string))
9182         {
9183           p = CHAR_POS_ADDR (from);
9184           pend = CHAR_POS_ADDR (to);
9185           if (from < GPT && to >= GPT)
9186             stop = GPT_ADDR;
9187           else
9188             stop = pend;
9189           charset_map_loaded = 0;
9190         }
9191     }
9192
9193   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9194 }
9195
9196
9197 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9198        Scheck_coding_systems_region, 3, 3, 0,
9199        doc: /* Check if the region is encodable by coding systems.
9200
9201 START and END are buffer positions specifying the region.
9202 CODING-SYSTEM-LIST is a list of coding systems to check.
9203
9204 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9205 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9206 whole region, POS0, POS1, ... are buffer positions where non-encodable
9207 characters are found.
9208
9209 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9210 value is nil.
9211
9212 START may be a string.  In that case, check if the string is
9213 encodable, and the value contains indices to the string instead of
9214 buffer positions.  END is ignored.
9215
9216 If the current buffer (or START if it is a string) is unibyte, the value
9217 is nil.  */)
9218   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9219 {
9220   Lisp_Object list;
9221   ptrdiff_t start_byte, end_byte;
9222   ptrdiff_t pos;
9223   const unsigned char *p, *pbeg, *pend;
9224   int c;
9225   Lisp_Object tail, elt, attrs;
9226
9227   if (STRINGP (start))
9228     {
9229       if (!STRING_MULTIBYTE (start)
9230           || SCHARS (start) == SBYTES (start))
9231         return Qnil;
9232       start_byte = 0;
9233       end_byte = SBYTES (start);
9234       pos = 0;
9235     }
9236   else
9237     {
9238       CHECK_NUMBER_COERCE_MARKER (start);
9239       CHECK_NUMBER_COERCE_MARKER (end);
9240       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9241         args_out_of_range (start, end);
9242       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9243         return Qnil;
9244       start_byte = CHAR_TO_BYTE (XINT (start));
9245       end_byte = CHAR_TO_BYTE (XINT (end));
9246       if (XINT (end) - XINT (start) == end_byte - start_byte)
9247         return Qnil;
9248
9249       if (XINT (start) < GPT && XINT (end) > GPT)
9250         {
9251           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9252             move_gap_both (XINT (start), start_byte);
9253           else
9254             move_gap_both (XINT (end), end_byte);
9255         }
9256       pos = XINT (start);
9257     }
9258
9259   list = Qnil;
9260   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9261     {
9262       elt = XCAR (tail);
9263       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9264       ASET (attrs, coding_attr_trans_tbl,
9265             get_translation_table (attrs, 1, NULL));
9266       list = Fcons (list2 (elt, attrs), list);
9267     }
9268
9269   if (STRINGP (start))
9270     p = pbeg = SDATA (start);
9271   else
9272     p = pbeg = BYTE_POS_ADDR (start_byte);
9273   pend = p + (end_byte - start_byte);
9274
9275   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9276   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9277
9278   while (p < pend)
9279     {
9280       if (ASCII_CHAR_P (*p))
9281         p++;
9282       else
9283         {
9284           c = STRING_CHAR_ADVANCE (p);
9285
9286           charset_map_loaded = 0;
9287           for (tail = list; CONSP (tail); tail = XCDR (tail))
9288             {
9289               elt = XCDR (XCAR (tail));
9290               if (! char_encodable_p (c, XCAR (elt)))
9291                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9292             }
9293           if (charset_map_loaded)
9294             {
9295               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9296
9297               if (STRINGP (start))
9298                 pbeg = SDATA (start);
9299               else
9300                 pbeg = BYTE_POS_ADDR (start_byte);
9301               p = pbeg + p_offset;
9302               pend = pbeg + pend_offset;
9303             }
9304         }
9305       pos++;
9306     }
9307
9308   tail = list;
9309   list = Qnil;
9310   for (; CONSP (tail); tail = XCDR (tail))
9311     {
9312       elt = XCAR (tail);
9313       if (CONSP (XCDR (XCDR (elt))))
9314         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9315                       list);
9316     }
9317
9318   return list;
9319 }
9320
9321
9322 static Lisp_Object
9323 code_convert_region (Lisp_Object start, Lisp_Object end,
9324                      Lisp_Object coding_system, Lisp_Object dst_object,
9325                      bool encodep, bool norecord)
9326 {
9327   struct coding_system coding;
9328   ptrdiff_t from, from_byte, to, to_byte;
9329   Lisp_Object src_object;
9330
9331   if (NILP (coding_system))
9332     coding_system = Qno_conversion;
9333   else
9334     CHECK_CODING_SYSTEM (coding_system);
9335   src_object = Fcurrent_buffer ();
9336   if (NILP (dst_object))
9337     dst_object = src_object;
9338   else if (! EQ (dst_object, Qt))
9339     CHECK_BUFFER (dst_object);
9340
9341   validate_region (&start, &end);
9342   from = XFASTINT (start);
9343   from_byte = CHAR_TO_BYTE (from);
9344   to = XFASTINT (end);
9345   to_byte = CHAR_TO_BYTE (to);
9346
9347   setup_coding_system (coding_system, &coding);
9348   coding.mode |= CODING_MODE_LAST_BLOCK;
9349
9350   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9351     {
9352       struct buffer *buf = XBUFFER (dst_object);
9353       ptrdiff_t buf_pt = BUF_PT (buf);
9354
9355       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9356     }
9357
9358   if (encodep)
9359     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9360                           dst_object);
9361   else
9362     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9363                           dst_object);
9364   if (! norecord)
9365     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9366
9367   return (BUFFERP (dst_object)
9368           ? make_number (coding.produced_char)
9369           : coding.dst_object);
9370 }
9371
9372
9373 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9374        3, 4, "r\nzCoding system: ",
9375        doc: /* Decode the current region from the specified coding system.
9376 When called from a program, takes four arguments:
9377         START, END, CODING-SYSTEM, and DESTINATION.
9378 START and END are buffer positions.
9379
9380 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9381 If nil, the region between START and END is replaced by the decoded text.
9382 If buffer, the decoded text is inserted in that buffer after point (point
9383 does not move).
9384 In those cases, the length of the decoded text is returned.
9385 If DESTINATION is t, the decoded text is returned.
9386
9387 This function sets `last-coding-system-used' to the precise coding system
9388 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9389 not fully specified.)  */)
9390   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9391 {
9392   return code_convert_region (start, end, coding_system, destination, 0, 0);
9393 }
9394
9395 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9396        3, 4, "r\nzCoding system: ",
9397        doc: /* Encode the current region by specified coding system.
9398 When called from a program, takes four arguments:
9399         START, END, CODING-SYSTEM and DESTINATION.
9400 START and END are buffer positions.
9401
9402 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9403 If nil, the region between START and END is replace by the encoded text.
9404 If buffer, the encoded text is inserted in that buffer after point (point
9405 does not move).
9406 In those cases, the length of the encoded text is returned.
9407 If DESTINATION is t, the encoded text is returned.
9408
9409 This function sets `last-coding-system-used' to the precise coding system
9410 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9411 not fully specified.)  */)
9412   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9413 {
9414   return code_convert_region (start, end, coding_system, destination, 1, 0);
9415 }
9416
9417 Lisp_Object
9418 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9419                      Lisp_Object dst_object, bool encodep, bool nocopy,
9420                      bool norecord)
9421 {
9422   struct coding_system coding;
9423   ptrdiff_t chars, bytes;
9424
9425   CHECK_STRING (string);
9426   if (NILP (coding_system))
9427     {
9428       if (! norecord)
9429         Vlast_coding_system_used = Qno_conversion;
9430       if (NILP (dst_object))
9431         return (nocopy ? Fcopy_sequence (string) : string);
9432     }
9433
9434   if (NILP (coding_system))
9435     coding_system = Qno_conversion;
9436   else
9437     CHECK_CODING_SYSTEM (coding_system);
9438   if (NILP (dst_object))
9439     dst_object = Qt;
9440   else if (! EQ (dst_object, Qt))
9441     CHECK_BUFFER (dst_object);
9442
9443   setup_coding_system (coding_system, &coding);
9444   coding.mode |= CODING_MODE_LAST_BLOCK;
9445   chars = SCHARS (string);
9446   bytes = SBYTES (string);
9447
9448   if (BUFFERP (dst_object))
9449     {
9450       struct buffer *buf = XBUFFER (dst_object);
9451       ptrdiff_t buf_pt = BUF_PT (buf);
9452
9453       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9454     }
9455
9456   if (encodep)
9457     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9458   else
9459     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9460   if (! norecord)
9461     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9462
9463   return (BUFFERP (dst_object)
9464           ? make_number (coding.produced_char)
9465           : coding.dst_object);
9466 }
9467
9468
9469 /* Encode or decode STRING according to CODING_SYSTEM.
9470    Do not set Vlast_coding_system_used.
9471
9472    This function is called only from macros DECODE_FILE and
9473    ENCODE_FILE, thus we ignore character composition.  */
9474
9475 Lisp_Object
9476 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9477                               bool encodep)
9478 {
9479   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9480 }
9481
9482 /* Encode or decode a file name, to or from a unibyte string suitable
9483    for passing to C library functions.  */
9484 Lisp_Object
9485 decode_file_name (Lisp_Object fname)
9486 {
9487 #ifdef WINDOWSNT
9488   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9489      converts the file names either to UTF-16LE or to the system ANSI
9490      codepage internally, depending on the underlying OS; see w32.c.  */
9491   if (! NILP (Fcoding_system_p (Qutf_8)))
9492     return code_convert_string_norecord (fname, Qutf_8, 0);
9493   return fname;
9494 #else  /* !WINDOWSNT */
9495   if (! NILP (Vfile_name_coding_system))
9496     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9497   else if (! NILP (Vdefault_file_name_coding_system))
9498     return code_convert_string_norecord (fname,
9499                                          Vdefault_file_name_coding_system, 0);
9500   else
9501     return fname;
9502 #endif
9503 }
9504
9505 Lisp_Object
9506 encode_file_name (Lisp_Object fname)
9507 {
9508   /* This is especially important during bootstrap and dumping, when
9509      file-name encoding is not yet known, and therefore any non-ASCII
9510      file names are unibyte strings, and could only be thrashed if we
9511      try to encode them.  */
9512   if (!STRING_MULTIBYTE (fname))
9513     return fname;
9514 #ifdef WINDOWSNT
9515   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9516      converts the file names either to UTF-16LE or to the system ANSI
9517      codepage internally, depending on the underlying OS; see w32.c.  */
9518   if (! NILP (Fcoding_system_p (Qutf_8)))
9519     return code_convert_string_norecord (fname, Qutf_8, 1);
9520   return fname;
9521 #else  /* !WINDOWSNT */
9522   if (! NILP (Vfile_name_coding_system))
9523     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9524   else if (! NILP (Vdefault_file_name_coding_system))
9525     return code_convert_string_norecord (fname,
9526                                          Vdefault_file_name_coding_system, 1);
9527   else
9528     return fname;
9529 #endif
9530 }
9531
9532 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9533        2, 4, 0,
9534        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9535
9536 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9537 if the decoding operation is trivial.
9538
9539 Optional fourth arg BUFFER non-nil means that the decoded text is
9540 inserted in that buffer after point (point does not move).  In this
9541 case, the return value is the length of the decoded text.
9542
9543 This function sets `last-coding-system-used' to the precise coding system
9544 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9545 not fully specified.)  */)
9546   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9547 {
9548   return code_convert_string (string, coding_system, buffer,
9549                               0, ! NILP (nocopy), 0);
9550 }
9551
9552 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9553        2, 4, 0,
9554        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9555
9556 Optional third arg NOCOPY non-nil means it is OK to return STRING
9557 itself if the encoding operation is trivial.
9558
9559 Optional fourth arg BUFFER non-nil means that the encoded text is
9560 inserted in that buffer after point (point does not move).  In this
9561 case, the return value is the length of the encoded text.
9562
9563 This function sets `last-coding-system-used' to the precise coding system
9564 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9565 not fully specified.)  */)
9566   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9567 {
9568   return code_convert_string (string, coding_system, buffer,
9569                               1, ! NILP (nocopy), 0);
9570 }
9571
9572 \f
9573 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9574        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9575 Return the corresponding character.  */)
9576   (Lisp_Object code)
9577 {
9578   Lisp_Object spec, attrs, val;
9579   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9580   EMACS_INT ch;
9581   int c;
9582
9583   CHECK_NATNUM (code);
9584   ch = XFASTINT (code);
9585   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9586   attrs = AREF (spec, 0);
9587
9588   if (ASCII_CHAR_P (ch)
9589       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9590     return code;
9591
9592   val = CODING_ATTR_CHARSET_LIST (attrs);
9593   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9594   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9595   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9596
9597   if (ch <= 0x7F)
9598     {
9599       c = ch;
9600       charset = charset_roman;
9601     }
9602   else if (ch >= 0xA0 && ch < 0xDF)
9603     {
9604       c = ch - 0x80;
9605       charset = charset_kana;
9606     }
9607   else
9608     {
9609       EMACS_INT c1 = ch >> 8;
9610       int c2 = ch & 0xFF;
9611
9612       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9613           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9614         error ("Invalid code: %"pI"d", ch);
9615       c = ch;
9616       SJIS_TO_JIS (c);
9617       charset = charset_kanji;
9618     }
9619   c = DECODE_CHAR (charset, c);
9620   if (c < 0)
9621     error ("Invalid code: %"pI"d", ch);
9622   return make_number (c);
9623 }
9624
9625
9626 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9627        doc: /* Encode a Japanese character CH to shift_jis encoding.
9628 Return the corresponding code in SJIS.  */)
9629   (Lisp_Object ch)
9630 {
9631   Lisp_Object spec, attrs, charset_list;
9632   int c;
9633   struct charset *charset;
9634   unsigned code;
9635
9636   CHECK_CHARACTER (ch);
9637   c = XFASTINT (ch);
9638   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9639   attrs = AREF (spec, 0);
9640
9641   if (ASCII_CHAR_P (c)
9642       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9643     return ch;
9644
9645   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9646   charset = char_charset (c, charset_list, &code);
9647   if (code == CHARSET_INVALID_CODE (charset))
9648     error ("Can't encode by shift_jis encoding: %c", c);
9649   JIS_TO_SJIS (code);
9650
9651   return make_number (code);
9652 }
9653
9654 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9655        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9656 Return the corresponding character.  */)
9657   (Lisp_Object code)
9658 {
9659   Lisp_Object spec, attrs, val;
9660   struct charset *charset_roman, *charset_big5, *charset;
9661   EMACS_INT ch;
9662   int c;
9663
9664   CHECK_NATNUM (code);
9665   ch = XFASTINT (code);
9666   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9667   attrs = AREF (spec, 0);
9668
9669   if (ASCII_CHAR_P (ch)
9670       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9671     return code;
9672
9673   val = CODING_ATTR_CHARSET_LIST (attrs);
9674   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9675   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9676
9677   if (ch <= 0x7F)
9678     {
9679       c = ch;
9680       charset = charset_roman;
9681     }
9682   else
9683     {
9684       EMACS_INT b1 = ch >> 8;
9685       int b2 = ch & 0x7F;
9686       if (b1 < 0xA1 || b1 > 0xFE
9687           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9688         error ("Invalid code: %"pI"d", ch);
9689       c = ch;
9690       charset = charset_big5;
9691     }
9692   c = DECODE_CHAR (charset, c);
9693   if (c < 0)
9694     error ("Invalid code: %"pI"d", ch);
9695   return make_number (c);
9696 }
9697
9698 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9699        doc: /* Encode the Big5 character CH to BIG5 coding system.
9700 Return the corresponding character code in Big5.  */)
9701   (Lisp_Object ch)
9702 {
9703   Lisp_Object spec, attrs, charset_list;
9704   struct charset *charset;
9705   int c;
9706   unsigned code;
9707
9708   CHECK_CHARACTER (ch);
9709   c = XFASTINT (ch);
9710   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9711   attrs = AREF (spec, 0);
9712   if (ASCII_CHAR_P (c)
9713       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9714     return ch;
9715
9716   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9717   charset = char_charset (c, charset_list, &code);
9718   if (code == CHARSET_INVALID_CODE (charset))
9719     error ("Can't encode by Big5 encoding: %c", c);
9720
9721   return make_number (code);
9722 }
9723
9724 \f
9725 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9726        Sset_terminal_coding_system_internal, 1, 2, 0,
9727        doc: /* Internal use only.  */)
9728   (Lisp_Object coding_system, Lisp_Object terminal)
9729 {
9730   struct terminal *term = decode_live_terminal (terminal);
9731   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9732   CHECK_SYMBOL (coding_system);
9733   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9734   /* We had better not send unsafe characters to terminal.  */
9735   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9736   /* Character composition should be disabled.  */
9737   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9738   terminal_coding->src_multibyte = 1;
9739   terminal_coding->dst_multibyte = 0;
9740   tset_charset_list
9741     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9742             ? coding_charset_list (terminal_coding)
9743             : list1 (make_number (charset_ascii))));
9744   return Qnil;
9745 }
9746
9747 DEFUN ("set-safe-terminal-coding-system-internal",
9748        Fset_safe_terminal_coding_system_internal,
9749        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9750        doc: /* Internal use only.  */)
9751   (Lisp_Object coding_system)
9752 {
9753   CHECK_SYMBOL (coding_system);
9754   setup_coding_system (Fcheck_coding_system (coding_system),
9755                        &safe_terminal_coding);
9756   /* Character composition should be disabled.  */
9757   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9758   safe_terminal_coding.src_multibyte = 1;
9759   safe_terminal_coding.dst_multibyte = 0;
9760   return Qnil;
9761 }
9762
9763 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9764        Sterminal_coding_system, 0, 1, 0,
9765        doc: /* Return coding system specified for terminal output on the given terminal.
9766 TERMINAL may be a terminal object, a frame, or nil for the selected
9767 frame's terminal device.  */)
9768   (Lisp_Object terminal)
9769 {
9770   struct coding_system *terminal_coding
9771     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9772   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9773
9774   /* For backward compatibility, return nil if it is `undecided'.  */
9775   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9776 }
9777
9778 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9779        Sset_keyboard_coding_system_internal, 1, 2, 0,
9780        doc: /* Internal use only.  */)
9781   (Lisp_Object coding_system, Lisp_Object terminal)
9782 {
9783   struct terminal *t = decode_live_terminal (terminal);
9784   CHECK_SYMBOL (coding_system);
9785   if (NILP (coding_system))
9786     coding_system = Qno_conversion;
9787   else
9788     Fcheck_coding_system (coding_system);
9789   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9790   /* Character composition should be disabled.  */
9791   TERMINAL_KEYBOARD_CODING (t)->common_flags
9792     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9793   return Qnil;
9794 }
9795
9796 DEFUN ("keyboard-coding-system",
9797        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9798        doc: /* Return coding system specified for decoding keyboard input.  */)
9799   (Lisp_Object terminal)
9800 {
9801   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9802                          (decode_live_terminal (terminal))->id);
9803 }
9804
9805 \f
9806 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9807        Sfind_operation_coding_system,  1, MANY, 0,
9808        doc: /* Choose a coding system for an operation based on the target name.
9809 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9810 DECODING-SYSTEM is the coding system to use for decoding
9811 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9812 for encoding (in case OPERATION does encoding).
9813
9814 The first argument OPERATION specifies an I/O primitive:
9815   For file I/O, `insert-file-contents' or `write-region'.
9816   For process I/O, `call-process', `call-process-region', or `start-process'.
9817   For network I/O, `open-network-stream'.
9818
9819 The remaining arguments should be the same arguments that were passed
9820 to the primitive.  Depending on which primitive, one of those arguments
9821 is selected as the TARGET.  For example, if OPERATION does file I/O,
9822 whichever argument specifies the file name is TARGET.
9823
9824 TARGET has a meaning which depends on OPERATION:
9825   For file I/O, TARGET is a file name (except for the special case below).
9826   For process I/O, TARGET is a process name.
9827   For network I/O, TARGET is a service name or a port number.
9828
9829 This function looks up what is specified for TARGET in
9830 `file-coding-system-alist', `process-coding-system-alist',
9831 or `network-coding-system-alist' depending on OPERATION.
9832 They may specify a coding system, a cons of coding systems,
9833 or a function symbol to call.
9834 In the last case, we call the function with one argument,
9835 which is a list of all the arguments given to this function.
9836 If the function can't decide a coding system, it can return
9837 `undecided' so that the normal code-detection is performed.
9838
9839 If OPERATION is `insert-file-contents', the argument corresponding to
9840 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9841 file name to look up, and BUFFER is a buffer that contains the file's
9842 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9843 function to call for FILENAME, that function should examine the
9844 contents of BUFFER instead of reading the file.
9845
9846 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9847   (ptrdiff_t nargs, Lisp_Object *args)
9848 {
9849   Lisp_Object operation, target_idx, target, val;
9850   register Lisp_Object chain;
9851
9852   if (nargs < 2)
9853     error ("Too few arguments");
9854   operation = args[0];
9855   if (!SYMBOLP (operation)
9856       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9857     error ("Invalid first argument");
9858   if (nargs <= 1 + XFASTINT (target_idx))
9859     error ("Too few arguments for operation `%s'",
9860            SDATA (SYMBOL_NAME (operation)));
9861   target = args[XFASTINT (target_idx) + 1];
9862   if (!(STRINGP (target)
9863         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9864             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9865         || (EQ (operation, Qopen_network_stream)
9866             && (INTEGERP (target) || EQ (target, Qt)))))
9867     error ("Invalid argument %"pI"d of operation `%s'",
9868            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9869   if (CONSP (target))
9870     target = XCAR (target);
9871
9872   chain = ((EQ (operation, Qinsert_file_contents)
9873             || EQ (operation, Qwrite_region))
9874            ? Vfile_coding_system_alist
9875            : (EQ (operation, Qopen_network_stream)
9876               ? Vnetwork_coding_system_alist
9877               : Vprocess_coding_system_alist));
9878   if (NILP (chain))
9879     return Qnil;
9880
9881   for (; CONSP (chain); chain = XCDR (chain))
9882     {
9883       Lisp_Object elt;
9884
9885       elt = XCAR (chain);
9886       if (CONSP (elt)
9887           && ((STRINGP (target)
9888                && STRINGP (XCAR (elt))
9889                && fast_string_match (XCAR (elt), target) >= 0)
9890               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9891         {
9892           val = XCDR (elt);
9893           /* Here, if VAL is both a valid coding system and a valid
9894              function symbol, we return VAL as a coding system.  */
9895           if (CONSP (val))
9896             return val;
9897           if (! SYMBOLP (val))
9898             return Qnil;
9899           if (! NILP (Fcoding_system_p (val)))
9900             return Fcons (val, val);
9901           if (! NILP (Ffboundp (val)))
9902             {
9903               /* We use call1 rather than safe_call1
9904                  so as to get bug reports about functions called here
9905                  which don't handle the current interface.  */
9906               val = call1 (val, Flist (nargs, args));
9907               if (CONSP (val))
9908                 return val;
9909               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9910                 return Fcons (val, val);
9911             }
9912           return Qnil;
9913         }
9914     }
9915   return Qnil;
9916 }
9917
9918 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9919        Sset_coding_system_priority, 0, MANY, 0,
9920        doc: /* Assign higher priority to the coding systems given as arguments.
9921 If multiple coding systems belong to the same category,
9922 all but the first one are ignored.
9923
9924 usage: (set-coding-system-priority &rest coding-systems)  */)
9925   (ptrdiff_t nargs, Lisp_Object *args)
9926 {
9927   ptrdiff_t i, j;
9928   bool changed[coding_category_max];
9929   enum coding_category priorities[coding_category_max];
9930
9931   memset (changed, 0, sizeof changed);
9932
9933   for (i = j = 0; i < nargs; i++)
9934     {
9935       enum coding_category category;
9936       Lisp_Object spec, attrs;
9937
9938       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9939       attrs = AREF (spec, 0);
9940       category = XINT (CODING_ATTR_CATEGORY (attrs));
9941       if (changed[category])
9942         /* Ignore this coding system because a coding system of the
9943            same category already had a higher priority.  */
9944         continue;
9945       changed[category] = 1;
9946       priorities[j++] = category;
9947       if (coding_categories[category].id >= 0
9948           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9949         setup_coding_system (args[i], &coding_categories[category]);
9950       Fset (AREF (Vcoding_category_table, category), args[i]);
9951     }
9952
9953   /* Now we have decided top J priorities.  Reflect the order of the
9954      original priorities to the remaining priorities.  */
9955
9956   for (i = j, j = 0; i < coding_category_max; i++, j++)
9957     {
9958       while (j < coding_category_max
9959              && changed[coding_priorities[j]])
9960         j++;
9961       if (j == coding_category_max)
9962         emacs_abort ();
9963       priorities[i] = coding_priorities[j];
9964     }
9965
9966   memcpy (coding_priorities, priorities, sizeof priorities);
9967
9968   /* Update `coding-category-list'.  */
9969   Vcoding_category_list = Qnil;
9970   for (i = coding_category_max; i-- > 0; )
9971     Vcoding_category_list
9972       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9973                Vcoding_category_list);
9974
9975   return Qnil;
9976 }
9977
9978 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9979        Scoding_system_priority_list, 0, 1, 0,
9980        doc: /* Return a list of coding systems ordered by their priorities.
9981 The list contains a subset of coding systems; i.e. coding systems
9982 assigned to each coding category (see `coding-category-list').
9983
9984 HIGHESTP non-nil means just return the highest priority one.  */)
9985   (Lisp_Object highestp)
9986 {
9987   int i;
9988   Lisp_Object val;
9989
9990   for (i = 0, val = Qnil; i < coding_category_max; i++)
9991     {
9992       enum coding_category category = coding_priorities[i];
9993       int id = coding_categories[category].id;
9994       Lisp_Object attrs;
9995
9996       if (id < 0)
9997         continue;
9998       attrs = CODING_ID_ATTRS (id);
9999       if (! NILP (highestp))
10000         return CODING_ATTR_BASE_NAME (attrs);
10001       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10002     }
10003   return Fnreverse (val);
10004 }
10005
10006 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10007
10008 static Lisp_Object
10009 make_subsidiaries (Lisp_Object base)
10010 {
10011   Lisp_Object subsidiaries;
10012   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10013   USE_SAFE_ALLOCA;
10014   char *buf = SAFE_ALLOCA (base_name_len + 6);
10015   int i;
10016
10017   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10018   subsidiaries = make_uninit_vector (3);
10019   for (i = 0; i < 3; i++)
10020     {
10021       strcpy (buf + base_name_len, suffixes[i]);
10022       ASET (subsidiaries, i, intern (buf));
10023     }
10024   SAFE_FREE ();
10025   return subsidiaries;
10026 }
10027
10028
10029 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10030        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10031        doc: /* For internal use only.
10032 usage: (define-coding-system-internal ...)  */)
10033   (ptrdiff_t nargs, Lisp_Object *args)
10034 {
10035   Lisp_Object name;
10036   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10037   Lisp_Object attrs;            /* Vector of attributes.  */
10038   Lisp_Object eol_type;
10039   Lisp_Object aliases;
10040   Lisp_Object coding_type, charset_list, safe_charsets;
10041   enum coding_category category;
10042   Lisp_Object tail, val;
10043   int max_charset_id = 0;
10044   int i;
10045
10046   if (nargs < coding_arg_max)
10047     goto short_args;
10048
10049   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10050
10051   name = args[coding_arg_name];
10052   CHECK_SYMBOL (name);
10053   ASET (attrs, coding_attr_base_name, name);
10054
10055   val = args[coding_arg_mnemonic];
10056   if (! STRINGP (val))
10057     CHECK_CHARACTER (val);
10058   ASET (attrs, coding_attr_mnemonic, val);
10059
10060   coding_type = args[coding_arg_coding_type];
10061   CHECK_SYMBOL (coding_type);
10062   ASET (attrs, coding_attr_type, coding_type);
10063
10064   charset_list = args[coding_arg_charset_list];
10065   if (SYMBOLP (charset_list))
10066     {
10067       if (EQ (charset_list, Qiso_2022))
10068         {
10069           if (! EQ (coding_type, Qiso_2022))
10070             error ("Invalid charset-list");
10071           charset_list = Viso_2022_charset_list;
10072         }
10073       else if (EQ (charset_list, Qemacs_mule))
10074         {
10075           if (! EQ (coding_type, Qemacs_mule))
10076             error ("Invalid charset-list");
10077           charset_list = Vemacs_mule_charset_list;
10078         }
10079       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10080         {
10081           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10082             error ("Invalid charset-list");
10083           if (max_charset_id < XFASTINT (XCAR (tail)))
10084             max_charset_id = XFASTINT (XCAR (tail));
10085         }
10086     }
10087   else
10088     {
10089       charset_list = Fcopy_sequence (charset_list);
10090       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10091         {
10092           struct charset *charset;
10093
10094           val = XCAR (tail);
10095           CHECK_CHARSET_GET_CHARSET (val, charset);
10096           if (EQ (coding_type, Qiso_2022)
10097               ? CHARSET_ISO_FINAL (charset) < 0
10098               : EQ (coding_type, Qemacs_mule)
10099               ? CHARSET_EMACS_MULE_ID (charset) < 0
10100               : 0)
10101             error ("Can't handle charset `%s'",
10102                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10103
10104           XSETCAR (tail, make_number (charset->id));
10105           if (max_charset_id < charset->id)
10106             max_charset_id = charset->id;
10107         }
10108     }
10109   ASET (attrs, coding_attr_charset_list, charset_list);
10110
10111   safe_charsets = make_uninit_string (max_charset_id + 1);
10112   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10113   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10114     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10115   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10116
10117   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10118
10119   val = args[coding_arg_decode_translation_table];
10120   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10121     CHECK_SYMBOL (val);
10122   ASET (attrs, coding_attr_decode_tbl, val);
10123
10124   val = args[coding_arg_encode_translation_table];
10125   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10126     CHECK_SYMBOL (val);
10127   ASET (attrs, coding_attr_encode_tbl, val);
10128
10129   val = args[coding_arg_post_read_conversion];
10130   CHECK_SYMBOL (val);
10131   ASET (attrs, coding_attr_post_read, val);
10132
10133   val = args[coding_arg_pre_write_conversion];
10134   CHECK_SYMBOL (val);
10135   ASET (attrs, coding_attr_pre_write, val);
10136
10137   val = args[coding_arg_default_char];
10138   if (NILP (val))
10139     ASET (attrs, coding_attr_default_char, make_number (' '));
10140   else
10141     {
10142       CHECK_CHARACTER (val);
10143       ASET (attrs, coding_attr_default_char, val);
10144     }
10145
10146   val = args[coding_arg_for_unibyte];
10147   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10148
10149   val = args[coding_arg_plist];
10150   CHECK_LIST (val);
10151   ASET (attrs, coding_attr_plist, val);
10152
10153   if (EQ (coding_type, Qcharset))
10154     {
10155       /* Generate a lisp vector of 256 elements.  Each element is nil,
10156          integer, or a list of charset IDs.
10157
10158          If Nth element is nil, the byte code N is invalid in this
10159          coding system.
10160
10161          If Nth element is a number NUM, N is the first byte of a
10162          charset whose ID is NUM.
10163
10164          If Nth element is a list of charset IDs, N is the first byte
10165          of one of them.  The list is sorted by dimensions of the
10166          charsets.  A charset of smaller dimension comes first. */
10167       val = Fmake_vector (make_number (256), Qnil);
10168
10169       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10170         {
10171           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10172           int dim = CHARSET_DIMENSION (charset);
10173           int idx = (dim - 1) * 4;
10174
10175           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10176             ASET (attrs, coding_attr_ascii_compat, Qt);
10177
10178           for (i = charset->code_space[idx];
10179                i <= charset->code_space[idx + 1]; i++)
10180             {
10181               Lisp_Object tmp, tmp2;
10182               int dim2;
10183
10184               tmp = AREF (val, i);
10185               if (NILP (tmp))
10186                 tmp = XCAR (tail);
10187               else if (NUMBERP (tmp))
10188                 {
10189                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10190                   if (dim < dim2)
10191                     tmp = list2 (XCAR (tail), tmp);
10192                   else
10193                     tmp = list2 (tmp, XCAR (tail));
10194                 }
10195               else
10196                 {
10197                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10198                     {
10199                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10200                       if (dim < dim2)
10201                         break;
10202                     }
10203                   if (NILP (tmp2))
10204                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10205                   else
10206                     {
10207                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10208                       XSETCAR (tmp2, XCAR (tail));
10209                     }
10210                 }
10211               ASET (val, i, tmp);
10212             }
10213         }
10214       ASET (attrs, coding_attr_charset_valids, val);
10215       category = coding_category_charset;
10216     }
10217   else if (EQ (coding_type, Qccl))
10218     {
10219       Lisp_Object valids;
10220
10221       if (nargs < coding_arg_ccl_max)
10222         goto short_args;
10223
10224       val = args[coding_arg_ccl_decoder];
10225       CHECK_CCL_PROGRAM (val);
10226       if (VECTORP (val))
10227         val = Fcopy_sequence (val);
10228       ASET (attrs, coding_attr_ccl_decoder, val);
10229
10230       val = args[coding_arg_ccl_encoder];
10231       CHECK_CCL_PROGRAM (val);
10232       if (VECTORP (val))
10233         val = Fcopy_sequence (val);
10234       ASET (attrs, coding_attr_ccl_encoder, val);
10235
10236       val = args[coding_arg_ccl_valids];
10237       valids = Fmake_string (make_number (256), make_number (0));
10238       for (tail = val; CONSP (tail); tail = XCDR (tail))
10239         {
10240           int from, to;
10241
10242           val = XCAR (tail);
10243           if (INTEGERP (val))
10244             {
10245               if (! (0 <= XINT (val) && XINT (val) <= 255))
10246                 args_out_of_range_3 (val, make_number (0), make_number (255));
10247               from = to = XINT (val);
10248             }
10249           else
10250             {
10251               CHECK_CONS (val);
10252               CHECK_NATNUM_CAR (val);
10253               CHECK_NUMBER_CDR (val);
10254               if (XINT (XCAR (val)) > 255)
10255                 args_out_of_range_3 (XCAR (val),
10256                                      make_number (0), make_number (255));
10257               from = XINT (XCAR (val));
10258               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10259                 args_out_of_range_3 (XCDR (val),
10260                                      XCAR (val), make_number (255));
10261               to = XINT (XCDR (val));
10262             }
10263           for (i = from; i <= to; i++)
10264             SSET (valids, i, 1);
10265         }
10266       ASET (attrs, coding_attr_ccl_valids, valids);
10267
10268       category = coding_category_ccl;
10269     }
10270   else if (EQ (coding_type, Qutf_16))
10271     {
10272       Lisp_Object bom, endian;
10273
10274       ASET (attrs, coding_attr_ascii_compat, Qnil);
10275
10276       if (nargs < coding_arg_utf16_max)
10277         goto short_args;
10278
10279       bom = args[coding_arg_utf16_bom];
10280       if (! NILP (bom) && ! EQ (bom, Qt))
10281         {
10282           CHECK_CONS (bom);
10283           val = XCAR (bom);
10284           CHECK_CODING_SYSTEM (val);
10285           val = XCDR (bom);
10286           CHECK_CODING_SYSTEM (val);
10287         }
10288       ASET (attrs, coding_attr_utf_bom, bom);
10289
10290       endian = args[coding_arg_utf16_endian];
10291       CHECK_SYMBOL (endian);
10292       if (NILP (endian))
10293         endian = Qbig;
10294       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10295         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10296       ASET (attrs, coding_attr_utf_16_endian, endian);
10297
10298       category = (CONSP (bom)
10299                   ? coding_category_utf_16_auto
10300                   : NILP (bom)
10301                   ? (EQ (endian, Qbig)
10302                      ? coding_category_utf_16_be_nosig
10303                      : coding_category_utf_16_le_nosig)
10304                   : (EQ (endian, Qbig)
10305                      ? coding_category_utf_16_be
10306                      : coding_category_utf_16_le));
10307     }
10308   else if (EQ (coding_type, Qiso_2022))
10309     {
10310       Lisp_Object initial, reg_usage, request, flags;
10311
10312       if (nargs < coding_arg_iso2022_max)
10313         goto short_args;
10314
10315       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10316       CHECK_VECTOR (initial);
10317       for (i = 0; i < 4; i++)
10318         {
10319           val = AREF (initial, i);
10320           if (! NILP (val))
10321             {
10322               struct charset *charset;
10323
10324               CHECK_CHARSET_GET_CHARSET (val, charset);
10325               ASET (initial, i, make_number (CHARSET_ID (charset)));
10326               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10327                 ASET (attrs, coding_attr_ascii_compat, Qt);
10328             }
10329           else
10330             ASET (initial, i, make_number (-1));
10331         }
10332
10333       reg_usage = args[coding_arg_iso2022_reg_usage];
10334       CHECK_CONS (reg_usage);
10335       CHECK_NUMBER_CAR (reg_usage);
10336       CHECK_NUMBER_CDR (reg_usage);
10337
10338       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10339       for (tail = request; CONSP (tail); tail = XCDR (tail))
10340         {
10341           int id;
10342           Lisp_Object tmp1;
10343
10344           val = XCAR (tail);
10345           CHECK_CONS (val);
10346           tmp1 = XCAR (val);
10347           CHECK_CHARSET_GET_ID (tmp1, id);
10348           CHECK_NATNUM_CDR (val);
10349           if (XINT (XCDR (val)) >= 4)
10350             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10351           XSETCAR (val, make_number (id));
10352         }
10353
10354       flags = args[coding_arg_iso2022_flags];
10355       CHECK_NATNUM (flags);
10356       i = XINT (flags) & INT_MAX;
10357       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10358         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10359       flags = make_number (i);
10360
10361       ASET (attrs, coding_attr_iso_initial, initial);
10362       ASET (attrs, coding_attr_iso_usage, reg_usage);
10363       ASET (attrs, coding_attr_iso_request, request);
10364       ASET (attrs, coding_attr_iso_flags, flags);
10365       setup_iso_safe_charsets (attrs);
10366
10367       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10368         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10369                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10370                     ? coding_category_iso_7_else
10371                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10372                     ? coding_category_iso_7
10373                     : coding_category_iso_7_tight);
10374       else
10375         {
10376           int id = XINT (AREF (initial, 1));
10377
10378           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10379                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10380                        || id < 0)
10381                       ? coding_category_iso_8_else
10382                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10383                       ? coding_category_iso_8_1
10384                       : coding_category_iso_8_2);
10385         }
10386       if (category != coding_category_iso_8_1
10387           && category != coding_category_iso_8_2)
10388         ASET (attrs, coding_attr_ascii_compat, Qnil);
10389     }
10390   else if (EQ (coding_type, Qemacs_mule))
10391     {
10392       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10393         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10394       ASET (attrs, coding_attr_ascii_compat, Qt);
10395       category = coding_category_emacs_mule;
10396     }
10397   else if (EQ (coding_type, Qshift_jis))
10398     {
10399
10400       struct charset *charset;
10401
10402       if (XINT (Flength (charset_list)) != 3
10403           && XINT (Flength (charset_list)) != 4)
10404         error ("There should be three or four charsets");
10405
10406       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10407       if (CHARSET_DIMENSION (charset) != 1)
10408         error ("Dimension of charset %s is not one",
10409                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10410       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10411         ASET (attrs, coding_attr_ascii_compat, Qt);
10412
10413       charset_list = XCDR (charset_list);
10414       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10415       if (CHARSET_DIMENSION (charset) != 1)
10416         error ("Dimension of charset %s is not one",
10417                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10418
10419       charset_list = XCDR (charset_list);
10420       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10421       if (CHARSET_DIMENSION (charset) != 2)
10422         error ("Dimension of charset %s is not two",
10423                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10424
10425       charset_list = XCDR (charset_list);
10426       if (! NILP (charset_list))
10427         {
10428           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10429           if (CHARSET_DIMENSION (charset) != 2)
10430             error ("Dimension of charset %s is not two",
10431                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10432         }
10433
10434       category = coding_category_sjis;
10435       Vsjis_coding_system = name;
10436     }
10437   else if (EQ (coding_type, Qbig5))
10438     {
10439       struct charset *charset;
10440
10441       if (XINT (Flength (charset_list)) != 2)
10442         error ("There should be just two charsets");
10443
10444       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10445       if (CHARSET_DIMENSION (charset) != 1)
10446         error ("Dimension of charset %s is not one",
10447                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10448       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10449         ASET (attrs, coding_attr_ascii_compat, Qt);
10450
10451       charset_list = XCDR (charset_list);
10452       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10453       if (CHARSET_DIMENSION (charset) != 2)
10454         error ("Dimension of charset %s is not two",
10455                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10456
10457       category = coding_category_big5;
10458       Vbig5_coding_system = name;
10459     }
10460   else if (EQ (coding_type, Qraw_text))
10461     {
10462       category = coding_category_raw_text;
10463       ASET (attrs, coding_attr_ascii_compat, Qt);
10464     }
10465   else if (EQ (coding_type, Qutf_8))
10466     {
10467       Lisp_Object bom;
10468
10469       if (nargs < coding_arg_utf8_max)
10470         goto short_args;
10471
10472       bom = args[coding_arg_utf8_bom];
10473       if (! NILP (bom) && ! EQ (bom, Qt))
10474         {
10475           CHECK_CONS (bom);
10476           val = XCAR (bom);
10477           CHECK_CODING_SYSTEM (val);
10478           val = XCDR (bom);
10479           CHECK_CODING_SYSTEM (val);
10480         }
10481       ASET (attrs, coding_attr_utf_bom, bom);
10482       if (NILP (bom))
10483         ASET (attrs, coding_attr_ascii_compat, Qt);
10484
10485       category = (CONSP (bom) ? coding_category_utf_8_auto
10486                   : NILP (bom) ? coding_category_utf_8_nosig
10487                   : coding_category_utf_8_sig);
10488     }
10489   else if (EQ (coding_type, Qundecided))
10490     {
10491       if (nargs < coding_arg_undecided_max)
10492         goto short_args;
10493       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10494             args[coding_arg_undecided_inhibit_null_byte_detection]);
10495       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10496             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10497       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10498             args[coding_arg_undecided_prefer_utf_8]);
10499       category = coding_category_undecided;
10500     }
10501   else
10502     error ("Invalid coding system type: %s",
10503            SDATA (SYMBOL_NAME (coding_type)));
10504
10505   ASET (attrs, coding_attr_category, make_number (category));
10506   ASET (attrs, coding_attr_plist,
10507         Fcons (QCcategory,
10508                Fcons (AREF (Vcoding_category_table, category),
10509                       CODING_ATTR_PLIST (attrs))));
10510   ASET (attrs, coding_attr_plist,
10511         Fcons (QCascii_compatible_p,
10512                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10513                       CODING_ATTR_PLIST (attrs))));
10514
10515   eol_type = args[coding_arg_eol_type];
10516   if (! NILP (eol_type)
10517       && ! EQ (eol_type, Qunix)
10518       && ! EQ (eol_type, Qdos)
10519       && ! EQ (eol_type, Qmac))
10520     error ("Invalid eol-type");
10521
10522   aliases = list1 (name);
10523
10524   if (NILP (eol_type))
10525     {
10526       eol_type = make_subsidiaries (name);
10527       for (i = 0; i < 3; i++)
10528         {
10529           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10530
10531           this_name = AREF (eol_type, i);
10532           this_aliases = list1 (this_name);
10533           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10534           this_spec = make_uninit_vector (3);
10535           ASET (this_spec, 0, attrs);
10536           ASET (this_spec, 1, this_aliases);
10537           ASET (this_spec, 2, this_eol_type);
10538           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10539           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10540           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10541           if (NILP (val))
10542             Vcoding_system_alist
10543               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10544                        Vcoding_system_alist);
10545         }
10546     }
10547
10548   spec_vec = make_uninit_vector (3);
10549   ASET (spec_vec, 0, attrs);
10550   ASET (spec_vec, 1, aliases);
10551   ASET (spec_vec, 2, eol_type);
10552
10553   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10554   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10555   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10556   if (NILP (val))
10557     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10558                                   Vcoding_system_alist);
10559
10560   {
10561     int id = coding_categories[category].id;
10562
10563     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10564       setup_coding_system (name, &coding_categories[category]);
10565   }
10566
10567   return Qnil;
10568
10569  short_args:
10570   Fsignal (Qwrong_number_of_arguments,
10571            Fcons (intern ("define-coding-system-internal"),
10572                   make_number (nargs)));
10573 }
10574
10575
10576 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10577        3, 3, 0,
10578        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10579   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10580 {
10581   Lisp_Object spec, attrs;
10582
10583   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10584   attrs = AREF (spec, 0);
10585   if (EQ (prop, QCmnemonic))
10586     {
10587       if (! STRINGP (val))
10588         CHECK_CHARACTER (val);
10589       ASET (attrs, coding_attr_mnemonic, val);
10590     }
10591   else if (EQ (prop, QCdefault_char))
10592     {
10593       if (NILP (val))
10594         val = make_number (' ');
10595       else
10596         CHECK_CHARACTER (val);
10597       ASET (attrs, coding_attr_default_char, val);
10598     }
10599   else if (EQ (prop, QCdecode_translation_table))
10600     {
10601       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10602         CHECK_SYMBOL (val);
10603       ASET (attrs, coding_attr_decode_tbl, val);
10604     }
10605   else if (EQ (prop, QCencode_translation_table))
10606     {
10607       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10608         CHECK_SYMBOL (val);
10609       ASET (attrs, coding_attr_encode_tbl, val);
10610     }
10611   else if (EQ (prop, QCpost_read_conversion))
10612     {
10613       CHECK_SYMBOL (val);
10614       ASET (attrs, coding_attr_post_read, val);
10615     }
10616   else if (EQ (prop, QCpre_write_conversion))
10617     {
10618       CHECK_SYMBOL (val);
10619       ASET (attrs, coding_attr_pre_write, val);
10620     }
10621   else if (EQ (prop, QCascii_compatible_p))
10622     {
10623       ASET (attrs, coding_attr_ascii_compat, val);
10624     }
10625
10626   ASET (attrs, coding_attr_plist,
10627         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10628   return val;
10629 }
10630
10631
10632 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10633        Sdefine_coding_system_alias, 2, 2, 0,
10634        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10635   (Lisp_Object alias, Lisp_Object coding_system)
10636 {
10637   Lisp_Object spec, aliases, eol_type, val;
10638
10639   CHECK_SYMBOL (alias);
10640   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10641   aliases = AREF (spec, 1);
10642   /* ALIASES should be a list of length more than zero, and the first
10643      element is a base coding system.  Append ALIAS at the tail of the
10644      list.  */
10645   while (!NILP (XCDR (aliases)))
10646     aliases = XCDR (aliases);
10647   XSETCDR (aliases, list1 (alias));
10648
10649   eol_type = AREF (spec, 2);
10650   if (VECTORP (eol_type))
10651     {
10652       Lisp_Object subsidiaries;
10653       int i;
10654
10655       subsidiaries = make_subsidiaries (alias);
10656       for (i = 0; i < 3; i++)
10657         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10658                                      AREF (eol_type, i));
10659     }
10660
10661   Fputhash (alias, spec, Vcoding_system_hash_table);
10662   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10663   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10664   if (NILP (val))
10665     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10666                                   Vcoding_system_alist);
10667
10668   return Qnil;
10669 }
10670
10671 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10672        1, 1, 0,
10673        doc: /* Return the base of CODING-SYSTEM.
10674 Any alias or subsidiary coding system is not a base coding system.  */)
10675   (Lisp_Object coding_system)
10676 {
10677   Lisp_Object spec, attrs;
10678
10679   if (NILP (coding_system))
10680     return (Qno_conversion);
10681   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10682   attrs = AREF (spec, 0);
10683   return CODING_ATTR_BASE_NAME (attrs);
10684 }
10685
10686 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10687        1, 1, 0,
10688        doc: /* Return the property list of CODING-SYSTEM.  */)
10689   (Lisp_Object coding_system)
10690 {
10691   Lisp_Object spec, attrs;
10692
10693   if (NILP (coding_system))
10694     coding_system = Qno_conversion;
10695   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10696   attrs = AREF (spec, 0);
10697   return CODING_ATTR_PLIST (attrs);
10698 }
10699
10700
10701 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10702        1, 1, 0,
10703        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10704   (Lisp_Object coding_system)
10705 {
10706   Lisp_Object spec;
10707
10708   if (NILP (coding_system))
10709     coding_system = Qno_conversion;
10710   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10711   return AREF (spec, 1);
10712 }
10713
10714 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10715        Scoding_system_eol_type, 1, 1, 0,
10716        doc: /* Return eol-type of CODING-SYSTEM.
10717 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10718
10719 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10720 and CR respectively.
10721
10722 A vector value indicates that a format of end-of-line should be
10723 detected automatically.  Nth element of the vector is the subsidiary
10724 coding system whose eol-type is N.  */)
10725   (Lisp_Object coding_system)
10726 {
10727   Lisp_Object spec, eol_type;
10728   int n;
10729
10730   if (NILP (coding_system))
10731     coding_system = Qno_conversion;
10732   if (! CODING_SYSTEM_P (coding_system))
10733     return Qnil;
10734   spec = CODING_SYSTEM_SPEC (coding_system);
10735   eol_type = AREF (spec, 2);
10736   if (VECTORP (eol_type))
10737     return Fcopy_sequence (eol_type);
10738   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10739   return make_number (n);
10740 }
10741
10742 #endif /* emacs */
10743
10744 \f
10745 /*** 12. Postamble ***/
10746
10747 void
10748 init_coding_once (void)
10749 {
10750   int i;
10751
10752   for (i = 0; i < coding_category_max; i++)
10753     {
10754       coding_categories[i].id = -1;
10755       coding_priorities[i] = i;
10756     }
10757
10758   /* ISO2022 specific initialize routine.  */
10759   for (i = 0; i < 0x20; i++)
10760     iso_code_class[i] = ISO_control_0;
10761   for (i = 0x21; i < 0x7F; i++)
10762     iso_code_class[i] = ISO_graphic_plane_0;
10763   for (i = 0x80; i < 0xA0; i++)
10764     iso_code_class[i] = ISO_control_1;
10765   for (i = 0xA1; i < 0xFF; i++)
10766     iso_code_class[i] = ISO_graphic_plane_1;
10767   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10768   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10769   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10770   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10771   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10772   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10773   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10774   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10775   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10776
10777   for (i = 0; i < 256; i++)
10778     {
10779       emacs_mule_bytes[i] = 1;
10780     }
10781   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10782   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10783   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10784   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10785 }
10786
10787 #ifdef emacs
10788
10789 void
10790 syms_of_coding (void)
10791 {
10792   staticpro (&Vcoding_system_hash_table);
10793   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10794
10795   staticpro (&Vsjis_coding_system);
10796   Vsjis_coding_system = Qnil;
10797
10798   staticpro (&Vbig5_coding_system);
10799   Vbig5_coding_system = Qnil;
10800
10801   staticpro (&Vcode_conversion_reused_workbuf);
10802   Vcode_conversion_reused_workbuf = Qnil;
10803
10804   staticpro (&Vcode_conversion_workbuf_name);
10805   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10806
10807   reused_workbuf_in_use = 0;
10808
10809   DEFSYM (Qcharset, "charset");
10810   DEFSYM (Qtarget_idx, "target-idx");
10811   DEFSYM (Qcoding_system_history, "coding-system-history");
10812   Fset (Qcoding_system_history, Qnil);
10813
10814   /* Target FILENAME is the first argument.  */
10815   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10816   /* Target FILENAME is the third argument.  */
10817   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10818
10819   DEFSYM (Qcall_process, "call-process");
10820   /* Target PROGRAM is the first argument.  */
10821   Fput (Qcall_process, Qtarget_idx, make_number (0));
10822
10823   DEFSYM (Qcall_process_region, "call-process-region");
10824   /* Target PROGRAM is the third argument.  */
10825   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10826
10827   DEFSYM (Qstart_process, "start-process");
10828   /* Target PROGRAM is the third argument.  */
10829   Fput (Qstart_process, Qtarget_idx, make_number (2));
10830
10831   DEFSYM (Qopen_network_stream, "open-network-stream");
10832   /* Target SERVICE is the fourth argument.  */
10833   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10834
10835   DEFSYM (Qunix, "unix");
10836   DEFSYM (Qdos, "dos");
10837   DEFSYM (Qmac, "mac");
10838
10839   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10840   DEFSYM (Qundecided, "undecided");
10841   DEFSYM (Qno_conversion, "no-conversion");
10842   DEFSYM (Qraw_text, "raw-text");
10843
10844   DEFSYM (Qiso_2022, "iso-2022");
10845
10846   DEFSYM (Qutf_8, "utf-8");
10847   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10848
10849 #if defined (WINDOWSNT) || defined (CYGWIN)
10850   /* No, not utf-16-le: that one has a BOM.  */
10851   DEFSYM (Qutf_16le, "utf-16le");
10852 #endif
10853
10854   DEFSYM (Qutf_16, "utf-16");
10855   DEFSYM (Qbig, "big");
10856   DEFSYM (Qlittle, "little");
10857
10858   DEFSYM (Qshift_jis, "shift-jis");
10859   DEFSYM (Qbig5, "big5");
10860
10861   DEFSYM (Qcoding_system_p, "coding-system-p");
10862
10863   /* Error signaled when there's a problem with detecting a coding system.  */
10864   DEFSYM (Qcoding_system_error, "coding-system-error");
10865   Fput (Qcoding_system_error, Qerror_conditions,
10866         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10867   Fput (Qcoding_system_error, Qerror_message,
10868         build_pure_c_string ("Invalid coding system"));
10869
10870   DEFSYM (Qtranslation_table, "translation-table");
10871   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10872   DEFSYM (Qtranslation_table_id, "translation-table-id");
10873
10874   /* Coding system emacs-mule and raw-text are for converting only
10875      end-of-line format.  */
10876   DEFSYM (Qemacs_mule, "emacs-mule");
10877
10878   DEFSYM (QCcategory, ":category");
10879   DEFSYM (QCmnemonic, ":mnemonic");
10880   DEFSYM (QCdefault_char, ":default-char");
10881   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10882   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10883   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10884   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10885   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10886
10887   Vcoding_category_table
10888     = Fmake_vector (make_number (coding_category_max), Qnil);
10889   staticpro (&Vcoding_category_table);
10890   /* Followings are target of code detection.  */
10891   ASET (Vcoding_category_table, coding_category_iso_7,
10892         intern_c_string ("coding-category-iso-7"));
10893   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10894         intern_c_string ("coding-category-iso-7-tight"));
10895   ASET (Vcoding_category_table, coding_category_iso_8_1,
10896         intern_c_string ("coding-category-iso-8-1"));
10897   ASET (Vcoding_category_table, coding_category_iso_8_2,
10898         intern_c_string ("coding-category-iso-8-2"));
10899   ASET (Vcoding_category_table, coding_category_iso_7_else,
10900         intern_c_string ("coding-category-iso-7-else"));
10901   ASET (Vcoding_category_table, coding_category_iso_8_else,
10902         intern_c_string ("coding-category-iso-8-else"));
10903   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10904         intern_c_string ("coding-category-utf-8-auto"));
10905   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10906         intern_c_string ("coding-category-utf-8"));
10907   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10908         intern_c_string ("coding-category-utf-8-sig"));
10909   ASET (Vcoding_category_table, coding_category_utf_16_be,
10910         intern_c_string ("coding-category-utf-16-be"));
10911   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10912         intern_c_string ("coding-category-utf-16-auto"));
10913   ASET (Vcoding_category_table, coding_category_utf_16_le,
10914         intern_c_string ("coding-category-utf-16-le"));
10915   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10916         intern_c_string ("coding-category-utf-16-be-nosig"));
10917   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10918         intern_c_string ("coding-category-utf-16-le-nosig"));
10919   ASET (Vcoding_category_table, coding_category_charset,
10920         intern_c_string ("coding-category-charset"));
10921   ASET (Vcoding_category_table, coding_category_sjis,
10922         intern_c_string ("coding-category-sjis"));
10923   ASET (Vcoding_category_table, coding_category_big5,
10924         intern_c_string ("coding-category-big5"));
10925   ASET (Vcoding_category_table, coding_category_ccl,
10926         intern_c_string ("coding-category-ccl"));
10927   ASET (Vcoding_category_table, coding_category_emacs_mule,
10928         intern_c_string ("coding-category-emacs-mule"));
10929   /* Followings are NOT target of code detection.  */
10930   ASET (Vcoding_category_table, coding_category_raw_text,
10931         intern_c_string ("coding-category-raw-text"));
10932   ASET (Vcoding_category_table, coding_category_undecided,
10933         intern_c_string ("coding-category-undecided"));
10934
10935   DEFSYM (Qinsufficient_source, "insufficient-source");
10936   DEFSYM (Qinvalid_source, "invalid-source");
10937   DEFSYM (Qinterrupted, "interrupted");
10938
10939   /* If a symbol has this property, evaluate the value to define the
10940      symbol as a coding system.  */
10941   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10942
10943   defsubr (&Scoding_system_p);
10944   defsubr (&Sread_coding_system);
10945   defsubr (&Sread_non_nil_coding_system);
10946   defsubr (&Scheck_coding_system);
10947   defsubr (&Sdetect_coding_region);
10948   defsubr (&Sdetect_coding_string);
10949   defsubr (&Sfind_coding_systems_region_internal);
10950   defsubr (&Sunencodable_char_position);
10951   defsubr (&Scheck_coding_systems_region);
10952   defsubr (&Sdecode_coding_region);
10953   defsubr (&Sencode_coding_region);
10954   defsubr (&Sdecode_coding_string);
10955   defsubr (&Sencode_coding_string);
10956   defsubr (&Sdecode_sjis_char);
10957   defsubr (&Sencode_sjis_char);
10958   defsubr (&Sdecode_big5_char);
10959   defsubr (&Sencode_big5_char);
10960   defsubr (&Sset_terminal_coding_system_internal);
10961   defsubr (&Sset_safe_terminal_coding_system_internal);
10962   defsubr (&Sterminal_coding_system);
10963   defsubr (&Sset_keyboard_coding_system_internal);
10964   defsubr (&Skeyboard_coding_system);
10965   defsubr (&Sfind_operation_coding_system);
10966   defsubr (&Sset_coding_system_priority);
10967   defsubr (&Sdefine_coding_system_internal);
10968   defsubr (&Sdefine_coding_system_alias);
10969   defsubr (&Scoding_system_put);
10970   defsubr (&Scoding_system_base);
10971   defsubr (&Scoding_system_plist);
10972   defsubr (&Scoding_system_aliases);
10973   defsubr (&Scoding_system_eol_type);
10974   defsubr (&Scoding_system_priority_list);
10975
10976   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10977                doc: /* List of coding systems.
10978
10979 Do not alter the value of this variable manually.  This variable should be
10980 updated by the functions `define-coding-system' and
10981 `define-coding-system-alias'.  */);
10982   Vcoding_system_list = Qnil;
10983
10984   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10985                doc: /* Alist of coding system names.
10986 Each element is one element list of coding system name.
10987 This variable is given to `completing-read' as COLLECTION argument.
10988
10989 Do not alter the value of this variable manually.  This variable should be
10990 updated by the functions `make-coding-system' and
10991 `define-coding-system-alias'.  */);
10992   Vcoding_system_alist = Qnil;
10993
10994   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10995                doc: /* List of coding-categories (symbols) ordered by priority.
10996
10997 On detecting a coding system, Emacs tries code detection algorithms
10998 associated with each coding-category one by one in this order.  When
10999 one algorithm agrees with a byte sequence of source text, the coding
11000 system bound to the corresponding coding-category is selected.
11001
11002 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11003   {
11004     int i;
11005
11006     Vcoding_category_list = Qnil;
11007     for (i = coding_category_max - 1; i >= 0; i--)
11008       Vcoding_category_list
11009         = Fcons (AREF (Vcoding_category_table, i),
11010                  Vcoding_category_list);
11011   }
11012
11013   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11014                doc: /* Specify the coding system for read operations.
11015 It is useful to bind this variable with `let', but do not set it globally.
11016 If the value is a coding system, it is used for decoding on read operation.
11017 If not, an appropriate element is used from one of the coding system alists.
11018 There are three such tables: `file-coding-system-alist',
11019 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11020   Vcoding_system_for_read = Qnil;
11021
11022   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11023                doc: /* Specify the coding system for write operations.
11024 Programs bind this variable with `let', but you should not set it globally.
11025 If the value is a coding system, it is used for encoding of output,
11026 when writing it to a file and when sending it to a file or subprocess.
11027
11028 If this does not specify a coding system, an appropriate element
11029 is used from one of the coding system alists.
11030 There are three such tables: `file-coding-system-alist',
11031 `process-coding-system-alist', and `network-coding-system-alist'.
11032 For output to files, if the above procedure does not specify a coding system,
11033 the value of `buffer-file-coding-system' is used.  */);
11034   Vcoding_system_for_write = Qnil;
11035
11036   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11037                doc: /*
11038 Coding system used in the latest file or process I/O.  */);
11039   Vlast_coding_system_used = Qnil;
11040
11041   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11042                doc: /*
11043 Error status of the last code conversion.
11044
11045 When an error was detected in the last code conversion, this variable
11046 is set to one of the following symbols.
11047   `insufficient-source'
11048   `inconsistent-eol'
11049   `invalid-source'
11050   `interrupted'
11051   `insufficient-memory'
11052 When no error was detected, the value doesn't change.  So, to check
11053 the error status of a code conversion by this variable, you must
11054 explicitly set this variable to nil before performing code
11055 conversion.  */);
11056   Vlast_code_conversion_error = Qnil;
11057
11058   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11059                doc: /*
11060 Non-nil means always inhibit code conversion of end-of-line format.
11061 See info node `Coding Systems' and info node `Text and Binary' concerning
11062 such conversion.  */);
11063   inhibit_eol_conversion = 0;
11064
11065   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11066                doc: /*
11067 Non-nil means process buffer inherits coding system of process output.
11068 Bind it to t if the process output is to be treated as if it were a file
11069 read from some filesystem.  */);
11070   inherit_process_coding_system = 0;
11071
11072   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11073                doc: /*
11074 Alist to decide a coding system to use for a file I/O operation.
11075 The format is ((PATTERN . VAL) ...),
11076 where PATTERN is a regular expression matching a file name,
11077 VAL is a coding system, a cons of coding systems, or a function symbol.
11078 If VAL is a coding system, it is used for both decoding and encoding
11079 the file contents.
11080 If VAL is a cons of coding systems, the car part is used for decoding,
11081 and the cdr part is used for encoding.
11082 If VAL is a function symbol, the function must return a coding system
11083 or a cons of coding systems which are used as above.  The function is
11084 called with an argument that is a list of the arguments with which
11085 `find-operation-coding-system' was called.  If the function can't decide
11086 a coding system, it can return `undecided' so that the normal
11087 code-detection is performed.
11088
11089 See also the function `find-operation-coding-system'
11090 and the variable `auto-coding-alist'.  */);
11091   Vfile_coding_system_alist = Qnil;
11092
11093   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11094                doc: /*
11095 Alist to decide a coding system to use for a process I/O operation.
11096 The format is ((PATTERN . VAL) ...),
11097 where PATTERN is a regular expression matching a program name,
11098 VAL is a coding system, a cons of coding systems, or a function symbol.
11099 If VAL is a coding system, it is used for both decoding what received
11100 from the program and encoding what sent to the program.
11101 If VAL is a cons of coding systems, the car part is used for decoding,
11102 and the cdr part is used for encoding.
11103 If VAL is a function symbol, the function must return a coding system
11104 or a cons of coding systems which are used as above.
11105
11106 See also the function `find-operation-coding-system'.  */);
11107   Vprocess_coding_system_alist = Qnil;
11108
11109   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11110                doc: /*
11111 Alist to decide a coding system to use for a network I/O operation.
11112 The format is ((PATTERN . VAL) ...),
11113 where PATTERN is a regular expression matching a network service name
11114 or is a port number to connect to,
11115 VAL is a coding system, a cons of coding systems, or a function symbol.
11116 If VAL is a coding system, it is used for both decoding what received
11117 from the network stream and encoding what sent to the network stream.
11118 If VAL is a cons of coding systems, the car part is used for decoding,
11119 and the cdr part is used for encoding.
11120 If VAL is a function symbol, the function must return a coding system
11121 or a cons of coding systems which are used as above.
11122
11123 See also the function `find-operation-coding-system'.  */);
11124   Vnetwork_coding_system_alist = Qnil;
11125
11126   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11127                doc: /* Coding system to use with system messages.
11128 Also used for decoding keyboard input on X Window system, and for
11129 encoding standard output and error streams.  */);
11130   Vlocale_coding_system = Qnil;
11131
11132   /* The eol mnemonics are reset in startup.el system-dependently.  */
11133   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11134                doc: /*
11135 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11136   eol_mnemonic_unix = build_pure_c_string (":");
11137
11138   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11139                doc: /*
11140 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11141   eol_mnemonic_dos = build_pure_c_string ("\\");
11142
11143   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11144                doc: /*
11145 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11146   eol_mnemonic_mac = build_pure_c_string ("/");
11147
11148   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11149                doc: /*
11150 String displayed in mode line when end-of-line format is not yet determined.  */);
11151   eol_mnemonic_undecided = build_pure_c_string (":");
11152
11153   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11154                doc: /*
11155 Non-nil enables character translation while encoding and decoding.  */);
11156   Venable_character_translation = Qt;
11157
11158   DEFVAR_LISP ("standard-translation-table-for-decode",
11159                Vstandard_translation_table_for_decode,
11160                doc: /* Table for translating characters while decoding.  */);
11161   Vstandard_translation_table_for_decode = Qnil;
11162
11163   DEFVAR_LISP ("standard-translation-table-for-encode",
11164                Vstandard_translation_table_for_encode,
11165                doc: /* Table for translating characters while encoding.  */);
11166   Vstandard_translation_table_for_encode = Qnil;
11167
11168   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11169                doc: /* Alist of charsets vs revision numbers.
11170 While encoding, if a charset (car part of an element) is found,
11171 designate it with the escape sequence identifying revision (cdr part
11172 of the element).  */);
11173   Vcharset_revision_table = Qnil;
11174
11175   DEFVAR_LISP ("default-process-coding-system",
11176                Vdefault_process_coding_system,
11177                doc: /* Cons of coding systems used for process I/O by default.
11178 The car part is used for decoding a process output,
11179 the cdr part is used for encoding a text to be sent to a process.  */);
11180   Vdefault_process_coding_system = Qnil;
11181
11182   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11183                doc: /*
11184 Table of extra Latin codes in the range 128..159 (inclusive).
11185 This is a vector of length 256.
11186 If Nth element is non-nil, the existence of code N in a file
11187 \(or output of subprocess) doesn't prevent it to be detected as
11188 a coding system of ISO 2022 variant which has a flag
11189 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11190 or reading output of a subprocess.
11191 Only 128th through 159th elements have a meaning.  */);
11192   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11193
11194   DEFVAR_LISP ("select-safe-coding-system-function",
11195                Vselect_safe_coding_system_function,
11196                doc: /*
11197 Function to call to select safe coding system for encoding a text.
11198
11199 If set, this function is called to force a user to select a proper
11200 coding system which can encode the text in the case that a default
11201 coding system used in each operation can't encode the text.  The
11202 function should take care that the buffer is not modified while
11203 the coding system is being selected.
11204
11205 The default value is `select-safe-coding-system' (which see).  */);
11206   Vselect_safe_coding_system_function = Qnil;
11207
11208   DEFVAR_BOOL ("coding-system-require-warning",
11209                coding_system_require_warning,
11210                doc: /* Internal use only.
11211 If non-nil, on writing a file, `select-safe-coding-system-function' is
11212 called even if `coding-system-for-write' is non-nil.  The command
11213 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11214   coding_system_require_warning = 0;
11215
11216
11217   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11218                inhibit_iso_escape_detection,
11219                doc: /*
11220 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11221
11222 When Emacs reads text, it tries to detect how the text is encoded.
11223 This code detection is sensitive to escape sequences.  If Emacs sees
11224 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11225 of the ISO2022 encodings, and decodes text by the corresponding coding
11226 system (e.g. `iso-2022-7bit').
11227
11228 However, there may be a case that you want to read escape sequences in
11229 a file as is.  In such a case, you can set this variable to non-nil.
11230 Then the code detection will ignore any escape sequences, and no text is
11231 detected as encoded in some ISO-2022 encoding.  The result is that all
11232 escape sequences become visible in a buffer.
11233
11234 The default value is nil, and it is strongly recommended not to change
11235 it.  That is because many Emacs Lisp source files that contain
11236 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11237 in Emacs's distribution, and they won't be decoded correctly on
11238 reading if you suppress escape sequence detection.
11239
11240 The other way to read escape sequences in a file without decoding is
11241 to explicitly specify some coding system that doesn't use ISO-2022
11242 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11243   inhibit_iso_escape_detection = 0;
11244
11245   DEFVAR_BOOL ("inhibit-null-byte-detection",
11246                inhibit_null_byte_detection,
11247                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11248 By default, Emacs treats text containing null bytes as binary data and
11249 does not attempt to decode it.  The effect is as if you specified
11250 `no-conversion' for reading that text.
11251
11252 Set this to non-nil when a regular text happens to include null bytes.
11253 Examples are Index nodes of Info files and null-byte delimited output
11254 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11255 decode text as usual.  */);
11256   inhibit_null_byte_detection = 0;
11257
11258   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11259                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11260 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11261   disable_ascii_optimization = 0;
11262
11263   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11264                doc: /* Char table for translating self-inserting characters.
11265 This is applied to the result of input methods, not their input.
11266 See also `keyboard-translate-table'.
11267
11268 Use of this variable for character code unification was rendered
11269 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11270 internal character representation.  */);
11271   Vtranslation_table_for_input = Qnil;
11272
11273   Lisp_Object args[coding_arg_undecided_max];
11274   memclear (args, sizeof args);
11275
11276   Lisp_Object plist[] =
11277     {
11278       QCname,
11279       args[coding_arg_name] = Qno_conversion,
11280       QCmnemonic,
11281       args[coding_arg_mnemonic] = make_number ('='),
11282       intern_c_string (":coding-type"),
11283       args[coding_arg_coding_type] = Qraw_text,
11284       QCascii_compatible_p,
11285       args[coding_arg_ascii_compatible_p] = Qt,
11286       QCdefault_char,
11287       args[coding_arg_default_char] = make_number (0),
11288       intern_c_string (":for-unibyte"),
11289       args[coding_arg_for_unibyte] = Qt,
11290       intern_c_string (":docstring"),
11291       (build_pure_c_string
11292        ("Do no conversion.\n"
11293         "\n"
11294         "When you visit a file with this coding, the file is read into a\n"
11295         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11296         "character.")),
11297       intern_c_string (":eol-type"),
11298       args[coding_arg_eol_type] = Qunix,
11299     };
11300   args[coding_arg_plist] = CALLMANY (Flist, plist);
11301   Fdefine_coding_system_internal (coding_arg_max, args);
11302
11303   plist[1] = args[coding_arg_name] = Qundecided;
11304   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11305   plist[5] = args[coding_arg_coding_type] = Qundecided;
11306   /* This is already set.
11307      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11308   plist[8] = intern_c_string (":charset-list");
11309   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11310   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11311   plist[13] = build_pure_c_string ("No conversion on encoding, "
11312                                    "automatic conversion on decoding.");
11313   plist[15] = args[coding_arg_eol_type] = Qnil;
11314   args[coding_arg_plist] = CALLMANY (Flist, plist);
11315   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11316   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11317   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11318
11319   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11320
11321   for (int i = 0; i < coding_category_max; i++)
11322     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11323
11324 #if defined (DOS_NT)
11325   system_eol_type = Qdos;
11326 #else
11327   system_eol_type = Qunix;
11328 #endif
11329   staticpro (&system_eol_type);
11330 }
11331 #endif /* emacs */