src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2016 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
  135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
  193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce encoded text until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 /* Format of end-of-line decided by system.  This is Qunix on
 305    Unix and Mac, Qdos on DOS/Windows.
 306    This has an effect only for external encoding (i.e. for output to
 307    file and process), not for in-buffer or Lisp string encoding.  */
 308 static Lisp_Object system_eol_type;
 309
 310 #ifdef emacs
 311
  312 /* Coding systems are handed between Emacs Lisp programs and C internal
  313    routines by the following variable.  */
 314 /* Coding system to be used to encode text for terminal display when
 315    terminal coding system is nil.  */
 316 struct coding_system safe_terminal_coding;
 317
 318 #endif /* emacs */
 319
 320 /* Two special coding systems.  */
 321 static Lisp_Object Vsjis_coding_system;
 322 static Lisp_Object Vbig5_coding_system;
 323
 324 /* ISO2022 section */
 325
 326 #define CODING_ISO_INITIAL(coding, reg)                 \
 327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 328                      coding_attr_iso_initial),          \
 329                reg)))
 330
 331
 332 #define CODING_ISO_REQUEST(coding, charset_id)          \
 333   (((charset_id) <= (coding)->max_charset_id            \
 334     ? ((coding)->safe_charsets[charset_id] != 255       \
 335        ? (coding)->safe_charsets[charset_id]            \
 336        : -1)                                            \
 337     : -1))
 338
 339
 340 #define CODING_ISO_FLAGS(coding)        \
 341   ((coding)->spec.iso_2022.flags)
 342 #define CODING_ISO_DESIGNATION(coding, reg)     \
 343   ((coding)->spec.iso_2022.current_designation[reg])
 344 #define CODING_ISO_INVOCATION(coding, plane)    \
 345   ((coding)->spec.iso_2022.current_invocation[plane])
 346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 347   ((coding)->spec.iso_2022.single_shifting)
 348 #define CODING_ISO_BOL(coding)  \
 349   ((coding)->spec.iso_2022.bol)
 350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
 353 #define CODING_ISO_CMP_STATUS(coding)   \
 354   (&(coding)->spec.iso_2022.cmp_status)
 355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
 360 /* Control characters of ISO2022.  */
 361                         /* code */      /* function */
 362 #define ISO_CODE_SO     0x0E            /* shift-out */
 363 #define ISO_CODE_SI     0x0F            /* shift-in */
 364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 365 #define ISO_CODE_ESC    0x1B            /* escape */
 366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 369
 370 /* All code (1-byte) of ISO2022 is classified into one of the
 371    followings.  */
 372 enum iso_code_class_type
 373   {
 374     ISO_control_0,              /* Control codes in the range
 375                                    0x00..0x1F and 0x7F, except for the
 376                                    following 5 codes.  */
 377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 381     ISO_control_1,              /* Control codes in the range
 382                                    0x80..0x9F, except for the
 383                                    following 3 codes.  */
 384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 391   };
 392
 393 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 394     `iso-flags' attribute of an iso2022 coding system.  */
 395
 396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 397    instead of the correct short-form sequence (e.g. ESC $ A).  */
 398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 399
 400 /* If set, reset graphic planes and registers at end-of-line to the
 401    initial state.  */
 402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 403
 404 /* If set, reset graphic planes and registers before any control
 405    characters to the initial state.  */
 406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 407
 408 /* If set, encode by 7-bit environment.  */
 409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 410
 411 /* If set, use locking-shift function.  */
 412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 413
 414 /* If set, use single-shift function.  Overwrite
 415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 417
 418 /* If set, use designation escape sequence.  */
 419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 420
 421 /* If set, produce revision number sequence.  */
 422 #define CODING_ISO_FLAG_REVISION        0x0080
 423
 424 /* If set, produce ISO6429's direction specifying sequence.  */
 425 #define CODING_ISO_FLAG_DIRECTION       0x0100
 426
 427 /* If set, assume designation states are reset at beginning of line on
 428    output.  */
 429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 430
 431 /* If set, designation sequence should be placed at beginning of line
 432    on output.  */
 433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 434
 435 /* If set, do not encode unsafe characters on output.  */
 436 #define CODING_ISO_FLAG_SAFE            0x0800
 437
 438 /* If set, extra latin codes (128..159) are accepted as a valid code
 439    on input.  */
 440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 441
 442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 443
 444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 445
 446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 447
 448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 449
 450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 451
 452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 453
 454 /* A character to be produced on output if encoding of the original
 455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
 458 /* UTF-8 section */
 459 #define CODING_UTF_8_BOM(coding)        \
 460   ((coding)->spec.utf_8_bom)
 461
 462 /* UTF-16 section */
 463 #define CODING_UTF_16_BOM(coding)       \
 464   ((coding)->spec.utf_16.bom)
 465
 466 #define CODING_UTF_16_ENDIAN(coding)    \
 467   ((coding)->spec.utf_16.endian)
 468
 469 #define CODING_UTF_16_SURROGATE(coding) \
 470   ((coding)->spec.utf_16.surrogate)
 471
 472
 473 /* CCL section */
 474 #define CODING_CCL_DECODER(coding)      \
 475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 476 #define CODING_CCL_ENCODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 478 #define CODING_CCL_VALIDS(coding)                                          \
 479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
 481 /* Index for each coding category in `coding_categories' */
 482
 483 enum coding_category
 484   {
 485     coding_category_iso_7,
 486     coding_category_iso_7_tight,
 487     coding_category_iso_8_1,
 488     coding_category_iso_8_2,
 489     coding_category_iso_7_else,
 490     coding_category_iso_8_else,
 491     coding_category_utf_8_auto,
 492     coding_category_utf_8_nosig,
 493     coding_category_utf_8_sig,
 494     coding_category_utf_16_auto,
 495     coding_category_utf_16_be,
 496     coding_category_utf_16_le,
 497     coding_category_utf_16_be_nosig,
 498     coding_category_utf_16_le_nosig,
 499     coding_category_charset,
 500     coding_category_sjis,
 501     coding_category_big5,
 502     coding_category_ccl,
 503     coding_category_emacs_mule,
 504     /* All above are targets of code detection.  */
 505     coding_category_raw_text,
 506     coding_category_undecided,
 507     coding_category_max
 508   };
 509
 510 /* Definitions of flag bits used in detect_coding_XXXX.  */
 511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 531
 532 /* This value is returned if detect_coding_mask () find nothing other
 533    than ASCII characters.  */
 534 #define CATEGORY_MASK_ANY               \
 535   (CATEGORY_MASK_ISO_7                  \
 536    | CATEGORY_MASK_ISO_7_TIGHT          \
 537    | CATEGORY_MASK_ISO_8_1              \
 538    | CATEGORY_MASK_ISO_8_2              \
 539    | CATEGORY_MASK_ISO_7_ELSE           \
 540    | CATEGORY_MASK_ISO_8_ELSE           \
 541    | CATEGORY_MASK_UTF_8_AUTO           \
 542    | CATEGORY_MASK_UTF_8_NOSIG          \
 543    | CATEGORY_MASK_UTF_8_SIG            \
 544    | CATEGORY_MASK_UTF_16_AUTO          \
 545    | CATEGORY_MASK_UTF_16_BE            \
 546    | CATEGORY_MASK_UTF_16_LE            \
 547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 549    | CATEGORY_MASK_CHARSET              \
 550    | CATEGORY_MASK_SJIS                 \
 551    | CATEGORY_MASK_BIG5                 \
 552    | CATEGORY_MASK_CCL                  \
 553    | CATEGORY_MASK_EMACS_MULE)
 554
 555
 556 #define CATEGORY_MASK_ISO_7BIT \
 557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 558
 559 #define CATEGORY_MASK_ISO_8BIT \
 560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 561
 562 #define CATEGORY_MASK_ISO_ELSE \
 563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 564
 565 #define CATEGORY_MASK_ISO_ESCAPE        \
 566   (CATEGORY_MASK_ISO_7                  \
 567    | CATEGORY_MASK_ISO_7_TIGHT          \
 568    | CATEGORY_MASK_ISO_7_ELSE           \
 569    | CATEGORY_MASK_ISO_8_ELSE)
 570
 571 #define CATEGORY_MASK_ISO       \
 572   (  CATEGORY_MASK_ISO_7BIT     \
 573      | CATEGORY_MASK_ISO_8BIT   \
 574      | CATEGORY_MASK_ISO_ELSE)
 575
 576 #define CATEGORY_MASK_UTF_16            \
 577   (CATEGORY_MASK_UTF_16_AUTO            \
 578    | CATEGORY_MASK_UTF_16_BE            \
 579    | CATEGORY_MASK_UTF_16_LE            \
 580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 582
 583 #define CATEGORY_MASK_UTF_8     \
 584   (CATEGORY_MASK_UTF_8_AUTO     \
 585    | CATEGORY_MASK_UTF_8_NOSIG  \
 586    | CATEGORY_MASK_UTF_8_SIG)
 587
 588 /* Table of coding categories (Lisp symbols).  This variable is for
 589    internal use only.  */
 590 static Lisp_Object Vcoding_category_table;
 591
 592 /* Table of coding-categories ordered by priority.  */
 593 static enum coding_category coding_priorities[coding_category_max];
 594
 595 /* Nth element is a coding context for the coding system bound to the
 596    Nth coding category.  */
 597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return 0 < encoded_flag + var;
 614 }
 615
 616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 617   do {                                                  \
 618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 620   } while (0)
 621
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
 645
 646
/* Safely get one byte from the source text pointed by SRC which ends
   at SRC_END, set C to that byte, and increment CONSUMED_CHARS.

   If SRC has already reached SRC_END, jump to the label
   `no_more_source'; in that case, if SRC_BASE < SRC (i.e. some bytes
   were fetched since SRC_BASE was last set), first record
   CODING_RESULT_INSUFFICIENT_SRC in CODING.

   If MULTIBYTEP and the fetched byte has its high bit set:
     - a lead byte of 0xC0 or 0xC1 is combined with the following byte
       into one value (((C & 1) << 6) | next byte), and SRC advances
       past both bytes;
     - otherwise the whole multibyte character at SRC is read with
       string_char, C is set to the negative value of the character
       code, and CODING_RESULT_INVALID_SRC is recorded in CODING.

   The caller should declare and set these variables appropriately in
   advance:
        src, src_base, src_end, multibytep, consumed_chars, coding
   and must provide the label `no_more_source'.  */

#define ONE_MORE_BYTE(c)                                \
  do {                                                  \
    if (src == src_end)                                 \
      {                                                 \
        if (src_base < src)                             \
          record_conversion_result                      \
            (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
        goto no_more_source;                            \
      }                                                 \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
 679
/* Safely get two bytes from the source text pointed by SRC which ends
   at SRC_END, and set C1 and C2 to those bytes while skipping the
   heading multibyte characters.  If there are not enough bytes in the
   source, it jumps to 'no_more_source'.

   If MULTIBYTEP, a lead byte of 0xC0 or 0xC1 is combined with the
   following byte into one value for either C1 or C2.  Any other
   multibyte character found while fetching C1 is skipped as a whole
   (using BYTES_BY_CHAR_HEAD) and the fetch is retried.  If such a
   character is found for C2, C2 is set to -1 and SRC is left pointing
   just after its lead byte.
   NOTE(review): an earlier version of this comment said C2 is set to
   the negative value of the character code; the code stores -1.

   Unlike ONE_MORE_BYTE, this macro neither counts consumed characters
   nor records a conversion result.  The caller should declare and set
   these variables appropriately in advance:
        src, src_end, multibytep
   It is intended that this macro is used in detect_coding_utf_16.  */

#define TWO_MORE_BYTES(c1, c2)                          \
  do {                                                  \
    do {                                                \
      if (src == src_end)                               \
        goto no_more_source;                            \
      c1 = *src++;                                      \
      if (multibytep && (c1 & 0x80))                    \
        {                                               \
          if ((c1 & 0xFE) == 0xC0)                      \
            c1 = ((c1 & 1) << 6) | *src++;              \
          else                                          \
            {                                           \
              src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
              c1 = -1;                                  \
            }                                           \
        }                                               \
    } while (c1 < 0);                                   \
    if (src == src_end)                                 \
      goto no_more_source;                              \
    c2 = *src++;                                        \
    if (multibytep && (c2 & 0x80))                      \
      {                                                 \
        if ((c2 & 0xFE) == 0xC0)                        \
          c2 = ((c2 & 1) << 6) | *src++;                \
        else                                            \
          c2 = -1;                                      \
      }                                                 \
  } while (0)
 718
 719
 720 /* Store a byte C in the place pointed by DST and increment DST to the
 721    next free point, and increment PRODUCED_CHARS.  The caller should
 722    assure that C is 0..127, and declare and set the variable `dst'
 723    appropriately in advance.
 724 */
 725
 726
 727 #define EMIT_ONE_ASCII_BYTE(c)  \
 728   do {                          \
 729     produced_chars++;           \
 730     *dst++ = (c);               \
 731   } while (0)
 732
 733
 734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 735
 736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 737   do {                                  \
 738     produced_chars += 2;                \
 739     *dst++ = (c1), *dst++ = (c2);       \
 740   } while (0)
 741
 742
 743 /* Store a byte C in the place pointed by DST and increment DST to the
 744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 745    store in an appropriate multibyte form.  The caller should
 746    declare and set the variables `dst' and `multibytep' appropriately
 747    in advance.  */
 748
 749 #define EMIT_ONE_BYTE(c)                \
 750   do {                                  \
 751     produced_chars++;                   \
 752     if (multibytep)                     \
 753       {                                 \
 754         unsigned ch = (c);              \
 755         if (ch >= 0x80)                 \
 756           ch = BYTE8_TO_CHAR (ch);      \
 757         CHAR_STRING_ADVANCE (ch, dst);  \
 758       }                                 \
 759     else                                \
 760       *dst++ = (c);                     \
 761   } while (0)
 762
 763
 764 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 765
 766 #define EMIT_TWO_BYTES(c1, c2)          \
 767   do {                                  \
 768     produced_chars += 2;                \
 769     if (multibytep)                     \
 770       {                                 \
 771         unsigned ch;                    \
 772                                         \
 773         ch = (c1);                      \
 774         if (ch >= 0x80)                 \
 775           ch = BYTE8_TO_CHAR (ch);      \
 776         CHAR_STRING_ADVANCE (ch, dst);  \
 777         ch = (c2);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781       }                                 \
 782     else                                \
 783       {                                 \
 784         *dst++ = (c1);                  \
 785         *dst++ = (c2);                  \
 786       }                                 \
 787   } while (0)
 788
 789
 790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 791   do {                                  \
 792     EMIT_ONE_BYTE (c1);                 \
 793     EMIT_TWO_BYTES (c2, c3);            \
 794   } while (0)
 795
 796
 797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 798   do {                                          \
 799     EMIT_TWO_BYTES (c1, c2);                    \
 800     EMIT_TWO_BYTES (c3, c4);                    \
 801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
/* If there are not at least BYTES bytes of room left at dst, allocate
   more memory for coding->destination and update dst and dst_end.  We
   don't have to take care of coding->source, which may be relocated:
   that is handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
/* Store the multibyte form of the character C at P, and advance P to
   the end of that multibyte form.  This is now a plain alias of
   CHAR_STRING_ADVANCE: it used to differ by never calling
   MAYBE_UNIFY_CHAR, but CHAR_STRING_ADVANCE no longer calls
   MAYBE_UNIFY_CHAR either.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
/* Return the character code of the character whose multibyte form is
   at P, and advance P to the end of that multibyte form.  This is now
   a plain alias of STRING_CHAR_ADVANCE: it used to differ by never
   calling MAYBE_UNIFY_CHAR, but STRING_CHAR_ADVANCE no longer calls
   MAYBE_UNIFY_CHAR either.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* An annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
   The format of the following elements depends on ANNOTATION_MASK.
1071
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data, counted in charbuf
   elements.  ADD_COMPOSITION_DATA writes the longest header in this
   section: the three elements stored by ADD_ANNOTATION_DATA plus
   NBYTES and METHOD.  */
#define MAX_ANNOTATION_LENGTH 5
1094
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and mark the conversion as annotated.
   The caller must have a `coding' pointer in scope.

   The definition deliberately does not end in a semicolon: the caller
   supplies it, so the macro behaves as one statement even when it is
   the body of an `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1102
/* Store a composition annotation at BUF and advance BUF past it: the
   annotation header with length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK and NCHARS, followed by NBYTES and
   METHOD.  Every argument is parenthesized so that BUF may be any
   lvalue expression (for instance `*pp').  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1109
1110
/* Store a charset annotation at BUF and advance BUF past it: the
   annotation header with length 4, mask CODING_ANNOTATE_CHARSET_MASK
   and NCHARS, followed by the charset ID.  Every argument is
   parenthesized so that BUF may be any lvalue expression (for
   instance `*pp').  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Bitmasks for coding->eol_seen.  The non-zero values are distinct
   bits, so several of them can be OR-ed together as different line
   endings are encountered.  */

#define EOL_SEEN_NONE   0       /* No line ending recorded.  */
#define EOL_SEEN_LF     1       /* A '\n' not consumed as part of CR LF.  */
#define EOL_SEEN_CR     2       /* A '\r' not followed by '\n'.  */
#define EOL_SEEN_CRLF   4       /* A '\r' immediately followed by '\n'.  */
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
/* Byte classification for UTF-8.  Each predicate tests the fixed high
   bits of the byte C.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx, i.e. a trailing (continuation) octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three bytes of the UTF-8 byte order mark (signature), in the
   order they appear in the source.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
1147 /* Unlike the other detect_coding_XXX, this function counts the number
1148    of characters and checks the EOL format.  */
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   ptrdiff_t nchars = coding->head_ascii;
1160   int eol_seen = coding->eol_seen;
1161
1162   detect_info->checked |= CATEGORY_MASK_UTF_8;
1163   /* A coding system of this category is always ASCII compatible.  */
1164   src += nchars;
1165
1166   if (src == coding->source     /* BOM should be at the head.  */
1167       && src + 3 < src_end      /* BOM is 3-byte long.  */
1168       && src[0] == UTF_8_BOM_1
1169       && src[1] == UTF_8_BOM_2
1170       && src[2] == UTF_8_BOM_3)
1171     {
1172       bom_found = 1;
1173       src += 3;
1174       nchars++;
1175     }
1176
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4;
1180
1181       src_base = src;
1182       ONE_MORE_BYTE (c);
1183       if (c < 0 || UTF_8_1_OCTET_P (c))
1184         {
1185           nchars++;
1186           if (c == '\r')
1187             {
1188               if (src < src_end && *src == '\n')
1189                 {
1190                   eol_seen |= EOL_SEEN_CRLF;
1191                   src++;
1192                   nchars++;
1193                 }
1194               else
1195                 eol_seen |= EOL_SEEN_CR;
1196             }
1197           else if (c == '\n')
1198             eol_seen |= EOL_SEEN_LF;
1199           continue;
1200         }
1201       ONE_MORE_BYTE (c1);
1202       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1203         break;
1204       if (UTF_8_2_OCTET_LEADING_P (c))
1205         {
1206           nchars++;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c2);
1210       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1211         break;
1212       if (UTF_8_3_OCTET_LEADING_P (c))
1213         {
1214           nchars++;
1215           continue;
1216         }
1217       ONE_MORE_BYTE (c3);
1218       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1219         break;
1220       if (UTF_8_4_OCTET_LEADING_P (c))
1221         {
1222           nchars++;
1223           continue;
1224         }
1225       ONE_MORE_BYTE (c4);
1226       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227         break;
1228       if (UTF_8_5_OCTET_LEADING_P (c))
1229         {
1230           nchars++;
1231           continue;
1232         }
1233       break;
1234     }
1235   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1236   return 0;
1237
1238  no_more_source:
1239   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1240     {
1241       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1242       return 0;
1243     }
1244   if (bom_found)
1245     {
1246       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1247       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1248     }
1249   else
1250     {
1251       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1252       if (nchars < src_end - coding->source)
1253         /* The found characters are less than source bytes, which
1254            means that we found a valid non-ASCII characters.  */
1255         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1256     }
1257   coding->detected_utf8_bytes = src_base - coding->source;
1258   coding->detected_utf8_chars = nchars;
1259   return 1;
1260 }
1261
1262
1263 static void
1264 decode_coding_utf_8 (struct coding_system *coding)
1265 {
1266   const unsigned char *src = coding->source + coding->consumed;
1267   const unsigned char *src_end = coding->source + coding->src_bytes;
1268   const unsigned char *src_base;
1269   int *charbuf = coding->charbuf + coding->charbuf_used;
1270   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1271   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1272   bool multibytep = coding->src_multibyte;
1273   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1274   bool eol_dos
1275     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1276   int byte_after_cr = -1;
1277
1278   if (bom != utf_without_bom)
1279     {
1280       int c1, c2, c3;
1281
1282       src_base = src;
1283       ONE_MORE_BYTE (c1);
1284       if (! UTF_8_3_OCTET_LEADING_P (c1))
1285         src = src_base;
1286       else
1287         {
1288           ONE_MORE_BYTE (c2);
1289           if (! UTF_8_EXTRA_OCTET_P (c2))
1290             src = src_base;
1291           else
1292             {
1293               ONE_MORE_BYTE (c3);
1294               if (! UTF_8_EXTRA_OCTET_P (c3))
1295                 src = src_base;
1296               else
1297                 {
1298                   if ((c1 != UTF_8_BOM_1)
1299                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1300                     src = src_base;
1301                   else
1302                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1303                 }
1304             }
1305         }
1306     }
1307   CODING_UTF_8_BOM (coding) = utf_without_bom;
1308
1309   while (1)
1310     {
1311       int c, c1, c2, c3, c4, c5;
1312
1313       src_base = src;
1314       consumed_chars_base = consumed_chars;
1315
1316       if (charbuf >= charbuf_end)
1317         {
1318           if (byte_after_cr >= 0)
1319             src_base--;
1320           break;
1321         }
1322
1323       /* In the simple case, rapidly handle ordinary characters */
1324       if (multibytep && ! eol_dos
1325           && charbuf < charbuf_end - 6 && src < src_end - 6)
1326         {
1327           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1328             {
1329               c1 = *src;
1330               if (c1 & 0x80)
1331                 break;
1332               src++;
1333               consumed_chars++;
1334               *charbuf++ = c1;
1335
1336               c1 = *src;
1337               if (c1 & 0x80)
1338                 break;
1339               src++;
1340               consumed_chars++;
1341               *charbuf++ = c1;
1342
1343               c1 = *src;
1344               if (c1 & 0x80)
1345                 break;
1346               src++;
1347               consumed_chars++;
1348               *charbuf++ = c1;
1349
1350               c1 = *src;
1351               if (c1 & 0x80)
1352                 break;
1353               src++;
1354               consumed_chars++;
1355               *charbuf++ = c1;
1356             }
1357           /* If we handled at least one character, restart the main loop.  */
1358           if (src != src_base)
1359             continue;
1360         }
1361
1362       if (byte_after_cr >= 0)
1363         c1 = byte_after_cr, byte_after_cr = -1;
1364       else
1365         ONE_MORE_BYTE (c1);
1366       if (c1 < 0)
1367         {
1368           c = - c1;
1369         }
1370       else if (UTF_8_1_OCTET_P (c1))
1371         {
1372           if (eol_dos && c1 == '\r')
1373             ONE_MORE_BYTE (byte_after_cr);
1374           c = c1;
1375         }
1376       else
1377         {
1378           ONE_MORE_BYTE (c2);
1379           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1380             goto invalid_code;
1381           if (UTF_8_2_OCTET_LEADING_P (c1))
1382             {
1383               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1384               /* Reject overlong sequences here and below.  Encoders
1385                  producing them are incorrect, they can be misleading,
1386                  and they mess up read/write invariance.  */
1387               if (c < 128)
1388                 goto invalid_code;
1389             }
1390           else
1391             {
1392               ONE_MORE_BYTE (c3);
1393               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1394                 goto invalid_code;
1395               if (UTF_8_3_OCTET_LEADING_P (c1))
1396                 {
1397                   c = (((c1 & 0xF) << 12)
1398                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1399                   if (c < 0x800
1400                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1401                     goto invalid_code;
1402                 }
1403               else
1404                 {
1405                   ONE_MORE_BYTE (c4);
1406                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1407                     goto invalid_code;
1408                   if (UTF_8_4_OCTET_LEADING_P (c1))
1409                     {
1410                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1411                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1412                     if (c < 0x10000)
1413                       goto invalid_code;
1414                     }
1415                   else
1416                     {
1417                       ONE_MORE_BYTE (c5);
1418                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1419                         goto invalid_code;
1420                       if (UTF_8_5_OCTET_LEADING_P (c1))
1421                         {
1422                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1423                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1424                                | (c5 & 0x3F));
1425                           if ((c > MAX_CHAR) || (c < 0x200000))
1426                             goto invalid_code;
1427                         }
1428                       else
1429                         goto invalid_code;
1430                     }
1431                 }
1432             }
1433         }
1434
1435       *charbuf++ = c;
1436       continue;
1437
1438     invalid_code:
1439       src = src_base;
1440       consumed_chars = consumed_chars_base;
1441       ONE_MORE_BYTE (c);
1442       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1443     }
1444
1445  no_more_source:
1446   coding->consumed_char += consumed_chars_base;
1447   coding->consumed = src_base - coding->source;
1448   coding->charbuf_used = charbuf - coding->charbuf;
1449 }
1450
1451
1452 static bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
1515 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1516    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1517
/* True if the 16-bit value VAL lies in 0xD800..0xDBFF (a high
   surrogate): its top six bits equal those of 0xD800.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the 16-bit value VAL lies in 0xDC00..0xDFFF (a low
   surrogate): its top six bits equal those of 0xDC00.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1523
1524
/* Check whether the source of CODING looks like one of the UTF-16
   based coding systems and record the verdict in DETECT_INFO.

   If the first two bytes are a byte order mark, the matching
   with-BOM category and the auto category are marked as found, and
   the other-endian and no-signature categories are rejected; the
   function then returns 1.  Without a BOM, the with-BOM and auto
   categories are rejected, the no-signature categories are judged by
   a byte-dispersion heuristic over the rest of the source, and the
   function returns 0.

   Control reaches the no_more_source label only from code not visible
   in this function, presumably the TWO_MORE_BYTES macro when the
   source runs out; in that case the function returns 1.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* When this is the last block, an odd src_chars count rules out
     every UTF-16 category.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Bytes FF FE: a little-endian byte order mark.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Bytes FE FF: a big-endian byte order mark.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      /* A negative C2 means the second byte is unusable here;
         reject every UTF-16 category.  */
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of the bytes at even offsets (E) and
         at odd offsets (O).  If both are high, we assume binary
         data.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No BOM was found, so only the no-signature categories can
         still apply.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Keep scanning until every UTF-16 category has been rejected,
         a negative C2 is returned, or the source runs out.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              /* A byte value not seen before at an even offset.  Once
                 128 distinct values have been seen there, reject
                 big-endian without signature.  */
              e[c1] = 1;
              e_num++;
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              /* Likewise for odd offsets and little-endian without
                 signature.  */
              o[c2] = 1;
              o_num++;
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1607
1608 static void
1609 decode_coding_utf_16 (struct coding_system *coding)
1610 {
1611   const unsigned char *src = coding->source + coding->consumed;
1612   const unsigned char *src_end = coding->source + coding->src_bytes;
1613   const unsigned char *src_base;
1614   int *charbuf = coding->charbuf + coding->charbuf_used;
1615   /* We may produces at most 3 chars in one loop.  */
1616   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1617   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1618   bool multibytep = coding->src_multibyte;
1619   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1620   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1621   int surrogate = CODING_UTF_16_SURROGATE (coding);
1622   bool eol_dos
1623     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1624   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1625
1626   if (bom == utf_with_bom)
1627     {
1628       int c, c1, c2;
1629
1630       src_base = src;
1631       ONE_MORE_BYTE (c1);
1632       ONE_MORE_BYTE (c2);
1633       c = (c1 << 8) | c2;
1634
1635       if (endian == utf_16_big_endian
1636           ? c != 0xFEFF : c != 0xFFFE)
1637         {
1638           /* The first two bytes are not BOM.  Treat them as bytes
1639              for a normal character.  */
1640           src = src_base;
1641         }
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644   else if (bom == utf_detect_bom)
1645     {
1646       /* We have already tried to detect BOM and failed in
1647          detect_coding.  */
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650
1651   while (1)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;
1656       consumed_chars_base = consumed_chars;
1657
1658       if (charbuf >= charbuf_end)
1659         {
1660           if (byte_after_cr1 >= 0)
1661             src_base -= 2;
1662           break;
1663         }
1664
1665       if (byte_after_cr1 >= 0)
1666         c1 = byte_after_cr1, byte_after_cr1 = -1;
1667       else
1668         ONE_MORE_BYTE (c1);
1669       if (c1 < 0)
1670         {
1671           *charbuf++ = -c1;
1672           continue;
1673         }
1674       if (byte_after_cr2 >= 0)
1675         c2 = byte_after_cr2, byte_after_cr2 = -1;
1676       else
1677         ONE_MORE_BYTE (c2);
1678       if (c2 < 0)
1679         {
1680           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1681           *charbuf++ = -c2;
1682           continue;
1683         }
1684       c = (endian == utf_16_big_endian
1685            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1686
1687       if (surrogate)
1688         {
1689           if (! UTF_16_LOW_SURROGATE_P (c))
1690             {
1691               if (endian == utf_16_big_endian)
1692                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1693               else
1694                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1695               *charbuf++ = c1;
1696               *charbuf++ = c2;
1697               if (UTF_16_HIGH_SURROGATE_P (c))
1698                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1699               else
1700                 *charbuf++ = c;
1701             }
1702           else
1703             {
1704               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1705               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1706               *charbuf++ = 0x10000 + c;
1707             }
1708         }
1709       else
1710         {
1711           if (UTF_16_HIGH_SURROGATE_P (c))
1712             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1713           else
1714             {
1715               if (eol_dos && c == '\r')
1716                 {
1717                   ONE_MORE_BYTE (byte_after_cr1);
1718                   ONE_MORE_BYTE (byte_after_cr2);
1719                 }
1720               *charbuf++ = c;
1721             }
1722         }
1723     }
1724
1725  no_more_source:
1726   coding->consumed_char += consumed_chars_base;
1727   coding->consumed = src_base - coding->source;
1728   coding->charbuf_used = charbuf - coding->charbuf;
1729 }
1730
1731 static bool
1732 encode_coding_utf_16 (struct coding_system *coding)
1733 {
1734   bool multibytep = coding->dst_multibyte;
1735   int *charbuf = coding->charbuf;
1736   int *charbuf_end = charbuf + coding->charbuf_used;
1737   unsigned char *dst = coding->destination + coding->produced;
1738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1739   int safe_room = 8;
1740   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1741   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1742   ptrdiff_t produced_chars = 0;
1743   int c;
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xFF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in data sent to a
1825    process).  However, text may exist externally in this format
1826    (e.g. after being encoded by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    First, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit codes.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of composition method (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded by two-byte of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form in one of these special formats:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
/* Byte-length table for emacs-mule sequences, indexed by a leading
   byte.  The decoder in this file expects an entry of 1 to 4 for any
   byte it dispatches on (it aborts otherwise), and the detector uses
   `entry - 1' as the number of trailing bytes to expect.  No
   initializer is given here; presumably the table is filled in
   elsewhere -- TODO confirm.  */
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Return true if a text is encoded in 'emacs-mule'.

        The scan starts after the first CODING->head_ascii bytes of
        CODING->source.  Every call sets CATEGORY_MASK_EMACS_MULE in
        DETECT_INFO->checked.

        Return 0, with the category added to DETECT_INFO->rejected, when
        the scan meets ESC, SI or SO; a composition starter 0x80 whose
        following run (including the byte that ends it) spans at most 4
        source bytes; a byte >= 0x80 whose emacs_mule_bytes entry does
        not match the number of following bytes >= 0xA0; or, in the last
        block, the end of the source in the middle of a sequence.

        Return 1 when the source runs out without such a violation.
        CATEGORY_MASK_EMACS_MULE is added to DETECT_INFO->found only if
        at least one composition or multibyte sequence was seen.  */
1867
1868 static bool
1869 detect_coding_emacs_mule (struct coding_system *coding,
1870                           struct coding_detection_info *detect_info)
1871 {
1872   const unsigned char *src = coding->source, *src_base;
1873   const unsigned char *src_end = coding->source + coding->src_bytes;
1874   bool multibytep = coding->src_multibyte;
1875   ptrdiff_t consumed_chars = 0;
1876   int c;
       /* Stays 0 until a composition or a multibyte sequence has been
          accepted.  */
1877   int found = 0;
1878
1879   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1880   /* A coding system of this category is always ASCII compatible.  */
1881   src += coding->head_ascii;
1882
1883   while (1)
1884     {
           /* Remember where this sequence starts; the end-of-source
              code compares SRC_BASE with SRC to see whether a sequence
              was cut short.  */
1885       src_base = src;
1886       ONE_MORE_BYTE (c);
           /* Negative values are skipped without affecting the result.  */
1887       if (c < 0)
1888         continue;
1889       if (c == 0x80)
1890         {
1891           /* Perhaps the start of composite character.  We simply skip
1892              it because analyzing it is too heavy for detecting.  But,
1893              at least, we check that the composite character
1894              constitutes of more than 4 bytes.  */
1895           const unsigned char *src_start;
1896
1897         repeat:
1898           src_start = src;
1899           do
1900             {
1901               ONE_MORE_BYTE (c);
1902             }
1903           while (c >= 0xA0);
1904
               /* This `break' leaves the outer `while (1)' and so
                  rejects the category.  */
1905           if (src - src_start <= 4)
1906             break;
1907           found = CATEGORY_MASK_EMACS_MULE;
               /* The byte that ended the run may itself start another
                  composition.  */
1908           if (c == 0x80)
1909             goto repeat;
1910         }
1911
1912       if (c < 0x80)
1913         {
1914           if (c < 0x20
1915               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1916             break;
1917         }
1918       else
1919         {
1920           int more_bytes = emacs_mule_bytes[c] - 1;
1921
               /* Each trailing byte must be >= 0xA0.  */
1922           while (more_bytes > 0)
1923             {
1924               ONE_MORE_BYTE (c);
1925               if (c < 0xA0)
1926                 {
1927                   src--;        /* Unread the last byte.  */
1928                   break;
1929                 }
1930               more_bytes--;
1931             }
               /* Nonzero here means a trailing byte was missing, or the
                  table entry for the leading byte was 0.  */
1932           if (more_bytes != 0)
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935         }
1936     }
       /* Reached only through a `break' above: the text is not
          emacs-mule.  */
1937   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938   return 0;
1939
       /* No statement in this function jumps here; presumably
          ONE_MORE_BYTE does when the source is exhausted -- TODO
          confirm.  SRC_BASE < SRC means part of a sequence had already
          been read.  */
1940  no_more_source:
1941   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1942     {
1943       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1944       return 0;
1945     }
1946   detect_info->found |= found;
1947   return 1;
1948 }
1949
1950
1951 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1952    character.  If CMP_STATUS indicates that we must expect MSEQ or
1953    RULE described above, decode it and return the negative value of
1954    the decoded character or rule.  If an invalid byte is found, return
1955    -1.  If SRC is too short, return -2.  */
1956
1957 static int
1958 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1959                  int *nbytes, int *nchars, int *id,
1960                  struct composition_status *cmp_status)
1961 {
1962   const unsigned char *src_end = coding->source + coding->src_bytes;
1963   const unsigned char *src_base = src;
1964   bool multibytep = coding->src_multibyte;
1965   int charset_ID;
1966   unsigned code;
1967   int c;
1968   ptrdiff_t consumed_chars = 0;
1969   bool mseq_found = 0;
1970
1971   ONE_MORE_BYTE (c);
1972   if (c < 0)
1973     {
1974       c = -c;
1975       charset_ID = emacs_mule_charset[0];
1976     }
1977   else
1978     {
1979       if (c >= 0xA0)
1980         {
1981           if (cmp_status->state != COMPOSING_NO
1982               && cmp_status->old_form)
1983             {
1984               if (cmp_status->state == COMPOSING_CHAR)
1985                 {
1986                   if (c == 0xA0)
1987                     {
1988                       ONE_MORE_BYTE (c);
1989                       c -= 0x80;
1990                       if (c < 0)
1991                         goto invalid_code;
1992                     }
1993                   else
1994                     c -= 0x20;
1995                   mseq_found = 1;
1996                 }
1997               else
1998                 {
1999                   *nbytes = src - src_base;
2000                   *nchars = consumed_chars;
2001                   return -c;
2002                 }
2003             }
2004           else
2005             goto invalid_code;
2006         }
2007
2008       switch (emacs_mule_bytes[c])
2009         {
2010         case 2:
2011           if ((charset_ID = emacs_mule_charset[c]) < 0)
2012             goto invalid_code;
2013           ONE_MORE_BYTE (c);
2014           if (c < 0xA0)
2015             goto invalid_code;
2016           code = c & 0x7F;
2017           break;
2018
2019         case 3:
2020           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2021               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2022             {
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2025                 goto invalid_code;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code = c & 0x7F;
2030             }
2031           else
2032             {
2033               if ((charset_ID = emacs_mule_charset[c]) < 0)
2034                 goto invalid_code;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code = (c & 0x7F) << 8;
2039               ONE_MORE_BYTE (c);
2040               if (c < 0xA0)
2041                 goto invalid_code;
2042               code |= c & 0x7F;
2043             }
2044           break;
2045
2046         case 4:
2047           ONE_MORE_BYTE (c);
2048           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2049             goto invalid_code;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code = (c & 0x7F) << 8;
2054           ONE_MORE_BYTE (c);
2055           if (c < 0xA0)
2056             goto invalid_code;
2057           code |= c & 0x7F;
2058           break;
2059
2060         case 1:
2061           code = c;
2062           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2063           break;
2064
2065         default:
2066           emacs_abort ();
2067         }
2068       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2069                           CHARSET_FROM_ID (charset_ID), code, c);
2070       if (c < 0)
2071         goto invalid_code;
2072     }
2073   *nbytes = src - src_base;
2074   *nchars = consumed_chars;
2075   if (id)
2076     *id = charset_ID;
2077   return (mseq_found ? -c : c);
2078
2079  no_more_source:
2080   return -2;
2081
2082  invalid_code:
2083   return -1;
2084 }
2085
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2088
2089 /* Handle these composition sequences ('|': the end of header elements,
2090    BYTES and CHARS >= 0xA0):
2091
2092    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2093    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2094    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2095
2096    and these old forms:
2097
2098    (4) relative composition: 0x80 | MSEQ ... MSEQ
2099    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2100
2101    When the starter 0x80 and the following header elements are found,
2102    this annotation header is produced.
2103
2104         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2105
2106    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108
2109    Then, upon reading the following elements, these codes are produced
2110    until the composition end is found:
2111
2112    (1) CHAR ... CHAR
2113    (2) ALT ... ALT CHAR ... CHAR
2114    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2115    (4) CHAR ... CHAR
2116    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2117
2118    When the composition end is found, LENGTH and NCHARS in the
2119    annotation header are updated as below:
2120
2121    (1) LENGTH: unchanged, NCHARS: unchanged
2122    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2125    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2126
2127    If an error is found while composing, the annotation header is
2128    changed to the original composition header (plus filler -1s) as
2129    below:
2130
2131    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2132    (5)          [ 0x80 0xFF -1 -1 -1 ]
2133
2134    and the sequence [ -2 DECODED-RULE ] is changed to the original
2135    byte sequence as below:
2136         o the original byte sequence is B: [ B -1 ]
2137         o the original byte sequence is B1 B2: [ B1 B2 ]
2138
2139    Most of the routines are implemented by macros because many
2140    variables and labels in the caller decode_coding_emacs_mule must be
2141    accessible, and they are usually called just once (thus doesn't
2142    increase the size of compiled object).  */
2143
/* Decode the Emacs 20 style composition rule held in C and store the
   result in RULE.  C is first reduced by 0xA0 (and is left modified);
   the result must lie in 0..80, otherwise control jumps to the
   caller's `invalid_code' label.  The quotient and remainder of the
   division by 9 give the two reference points, where the value 4 is
   replaced by 10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int gref, nref;                                             \
                                                                \
    c -= 0xA0;                                                  \
    if (! (c >= 0 && c < 81))                                   \
      goto invalid_code;                                        \
    gref = (c / 9 == 4) ? 10 : c / 9;                           \
    nref = (c % 9 == 4) ? 10 : c % 9;                           \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);                \
  } while (0)
2160
2161
/* Decode an Emacs 21 style composition rule and store it in RULE.
   The rule occupies two bytes: the one already in C and the next one
   at SRC, which is read into C.  Each byte minus 0x20 must lie in
   0..80, otherwise control jumps to the caller's `invalid_code'
   label.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)          \
  do {                                                          \
    int gref = c - 0x20;                                        \
    int nref;                                                   \
                                                                \
    if (! (gref >= 0 && gref < 81))                             \
      goto invalid_code;                                        \
    ONE_MORE_BYTE (c);                                          \
    nref = c - 0x20;                                            \
    if (! (nref >= 0 && nref < 81))                             \
      goto invalid_code;                                        \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);                \
  } while (0)
2179
2180
/* Start of an Emacs 21 style composition.  On entry C holds the
   METHOD byte (0xF2 plus the composition method).  The next two
   bytes at SRC are BYTES (0xA0 plus the byte length of the
   composition data) and CHARS (0xA0 plus the number of composed
   characters).  Any value out of range sends control to the caller's
   `invalid_code' label.  On success CMP_STATUS is set up for the new
   composition and an annotation header is added to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    /* A relative composition must have exactly 4 bytes.  */            \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (nchars > 0 && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->method = method;                                        \
    cmp_status->old_form = 0;                                           \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2212
2213
/* Start of an Emacs 20 style (old form) relative composition: reset
   CMP_STATUS for it and add an annotation header with zero counts to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2225
2226
/* Start of an Emacs 20 style (old form) rule-base composition: reset
   CMP_STATUS for it and add an annotation header with zero counts to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2238
2239
/* Read the byte that follows a composition starter into C and begin
   the composition it announces:

     0xF2 + a composition method   Emacs 21 style header
     0xFF                          Emacs 20 style rule-base composition
     0xA0..0xBF                    Emacs 20 style relative composition;
                                   the byte is already the first
                                   component, so SRC is moved back to
                                   read it again

   Anything else sends control to the caller's `invalid_code' label.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                           \
  do {                                                                  \
    const unsigned char *current_src = src;                             \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)                  \
      DECODE_EMACS_MULE_21_COMPOSITION ();                              \
    else if (c == 0xFF)                                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();                     \
    else if (c >= 0xA0 && c < 0xC0)                                     \
      {                                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();                   \
        /* Re-read C as a composition component.                        \
           NOTE(review): only SRC is rewound here; if ONE_MORE_BYTE     \
           also advances a character counter, that counter is not       \
           rewound -- confirm this is intended.  */                     \
        src = current_src;                                              \
      }                                                                 \
    else                                                                \
      goto invalid_code;                                                \
  } while (0)
2263
/* Finish the current composition normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, store the component count in header slot 2.  For a new
   form other than relative, set slot 0 to slot 2 minus
   CMP_STATUS->length.  In every case leave the composing state.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *header = charbuf - cmp_status->length;                 \
                                                                \
    if (cmp_status->old_form)                                   \
      header[2] = cmp_status->nchars;                           \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      header[0] = header[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2274
2275
2276 static int
2277 emacs_mule_finish_composition (int *charbuf,
2278                                struct composition_status *cmp_status)
2279 {
2280   int idx = - cmp_status->length;
2281   int new_chars;
2282
2283   if (cmp_status->old_form && cmp_status->nchars > 0)
2284     {
2285       charbuf[idx + 2] = cmp_status->nchars;
2286       new_chars = 0;
2287       if (cmp_status->method == COMPOSITION_WITH_RULE
2288           && cmp_status->state == COMPOSING_CHAR)
2289         {
2290           /* The last rule was invalid.  */
2291           int rule = charbuf[-1] + 0xA0;
2292
2293           charbuf[-2] = BYTE8_TO_CHAR (rule);
2294           charbuf[-1] = -1;
2295           new_chars = 1;
2296         }
2297     }
2298   else
2299     {
2300       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2301
2302       if (cmp_status->method == COMPOSITION_WITH_RULE)
2303         {
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2305           charbuf[idx++] = -3;
2306           charbuf[idx++] = 0;
2307           new_chars = 1;
2308         }
2309       else
2310         {
2311           int nchars = charbuf[idx + 1] + 0xA0;
2312           int nbytes = charbuf[idx + 2] + 0xA0;
2313
2314           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2317           charbuf[idx++] = -1;
2318           new_chars = 4;
2319         }
2320     }
2321   cmp_status->state = COMPOSING_NO;
2322   return new_chars;
2323 }
2324
/* If a composition is in progress, end it with
   emacs_mule_finish_composition and add the value it returns to
   CHAR_OFFSET.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      {                                                                   \
        int restored = emacs_mule_finish_composition (charbuf,            \
                                                      cmp_status);        \
                                                                          \
        char_offset += restored;                                          \
      }                                                                   \
  } while (0)
2330
2331
2332 static void
2333 decode_coding_emacs_mule (struct coding_system *coding)
2334 {
2335   const unsigned char *src = coding->source + coding->consumed;
2336   const unsigned char *src_end = coding->source + coding->src_bytes;
2337   const unsigned char *src_base;
2338   int *charbuf = coding->charbuf + coding->charbuf_used;
2339   /* We may produce two annotations (charset and composition) in one
2340      loop and one more charset annotation at the end.  */
2341   int *charbuf_end
2342     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2343       /* We can produce up to 2 characters in a loop.  */
2344       - 1;
2345   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2346   bool multibytep = coding->src_multibyte;
2347   ptrdiff_t char_offset = coding->produced_char;
2348   ptrdiff_t last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   bool eol_dos
2351     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   if (cmp_status->state != COMPOSING_NO)
2356     {
2357       int i;
2358
2359       if (charbuf_end - charbuf < cmp_status->length)
2360         emacs_abort ();
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c, id IF_LINT (= 0);
2369
2370       src_base = src;
2371       consumed_chars_base = consumed_chars;
2372
2373       if (charbuf >= charbuf_end)
2374         {
2375           if (byte_after_cr >= 0)
2376             src_base--;
2377           break;
2378         }
2379
2380       if (byte_after_cr >= 0)
2381         c = byte_after_cr, byte_after_cr = -1;
2382       else
2383         ONE_MORE_BYTE (c);
2384
2385       if (c < 0 || c == 0x80)
2386         {
2387           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2388           if (c < 0)
2389             {
2390               *charbuf++ = -c;
2391               char_offset++;
2392             }
2393           else
2394             DECODE_EMACS_MULE_COMPOSITION_START ();
2395           continue;
2396         }
2397
2398       if (c < 0x80)
2399         {
2400           if (eol_dos && c == '\r')
2401             ONE_MORE_BYTE (byte_after_cr);
2402           id = charset_ascii;
2403           if (cmp_status->state != COMPOSING_NO)
2404             {
2405               if (cmp_status->old_form)
2406                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2407               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2408                 cmp_status->ncomps--;
2409             }
2410         }
2411       else
2412         {
2413           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2414           /* emacs_mule_char can load a charset map from a file, which
2415              allocates a large structure and might cause buffer text
2416              to be relocated as result.  Thus, we need to remember the
2417              original pointer to buffer text, and fix up all related
2418              pointers after the call.  */
2419           const unsigned char *orig = coding->source;
2420           ptrdiff_t offset;
2421
2422           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2423                                cmp_status);
2424           offset = coding->source - orig;
2425           if (offset)
2426             {
2427               src += offset;
2428               src_base += offset;
2429               src_end += offset;
2430             }
2431           if (c < 0)
2432             {
2433               if (c == -1)
2434                 goto invalid_code;
2435               if (c == -2)
2436                 break;
2437             }
2438           src = src_base + nbytes;
2439           consumed_chars = consumed_chars_base + nchars;
2440           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2441             cmp_status->ncomps -= nchars;
2442         }
2443
2444       /* Now if C >= 0, we found a normally encoded character, if C <
2445          0, we found an old-style composition component character or
2446          rule.  */
2447
2448       if (cmp_status->state == COMPOSING_NO)
2449         {
2450           if (last_id != id)
2451             {
2452               if (last_id != charset_ascii)
2453                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2454                                   last_id);
2455               last_id = id;
2456               last_offset = char_offset;
2457             }
2458           *charbuf++ = c;
2459           char_offset++;
2460         }
2461       else if (cmp_status->state == COMPOSING_CHAR)
2462         {
2463           if (cmp_status->old_form)
2464             {
2465               if (c >= 0)
2466                 {
2467                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2468                   *charbuf++ = c;
2469                   char_offset++;
2470                 }
2471               else
2472                 {
2473                   *charbuf++ = -c;
2474                   cmp_status->nchars++;
2475                   cmp_status->length++;
2476                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2477                     EMACS_MULE_COMPOSITION_END ();
2478                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2479                     cmp_status->state = COMPOSING_RULE;
2480                 }
2481             }
2482           else
2483             {
2484               *charbuf++ = c;
2485               cmp_status->length++;
2486               cmp_status->nchars--;
2487               if (cmp_status->nchars == 0)
2488                 EMACS_MULE_COMPOSITION_END ();
2489             }
2490         }
2491       else if (cmp_status->state == COMPOSING_RULE)
2492         {
2493           int rule;
2494
2495           if (c >= 0)
2496             {
2497               EMACS_MULE_COMPOSITION_END ();
2498               *charbuf++ = c;
2499               char_offset++;
2500             }
2501           else
2502             {
2503               c = -c;
2504               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2505               if (rule < 0)
2506                 goto invalid_code;
2507               *charbuf++ = -2;
2508               *charbuf++ = rule;
2509               cmp_status->length += 2;
2510               cmp_status->state = COMPOSING_CHAR;
2511             }
2512         }
2513       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2514         {
2515           *charbuf++ = c;
2516           cmp_status->length++;
2517           if (cmp_status->ncomps == 0)
2518             cmp_status->state = COMPOSING_CHAR;
2519           else if (cmp_status->ncomps > 0)
2520             {
2521               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2522                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2523             }
2524           else
2525             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2526         }
2527       else                      /* COMPOSING_COMPONENT_RULE */
2528         {
2529           int rule;
2530
2531           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2532           if (rule < 0)
2533             goto invalid_code;
2534           *charbuf++ = -2;
2535           *charbuf++ = rule;
2536           cmp_status->length += 2;
2537           cmp_status->ncomps--;
2538           if (cmp_status->ncomps > 0)
2539             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2540           else
2541             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2542         }
2543       continue;
2544
2545     invalid_code:
2546       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2547       src = src_base;
2548       consumed_chars = consumed_chars_base;
2549       ONE_MORE_BYTE (c);
2550       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2551       char_offset++;
2552     }
2553
2554  no_more_source:
2555   if (cmp_status->state != COMPOSING_NO)
2556     {
2557       if (coding->mode & CODING_MODE_LAST_BLOCK)
2558         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2559       else
2560         {
2561           int i;
2562
2563           charbuf -= cmp_status->length;
2564           for (i = 0; i < cmp_status->length; i++)
2565             cmp_status->carryover[i] = charbuf[i];
2566         }
2567     }
2568   if (last_id != charset_ascii)
2569     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2570   coding->consumed_char += consumed_chars_base;
2571   coding->consumed = src_base - coding->source;
2572   coding->charbuf_used = charbuf - coding->charbuf;
2573 }
2574
2575
/* Store in CODES (an array of at least two bytes) the leading code(s)
   for the charset whose emacs-mule id is ID.

   An id below 0xA0 is its own single leading code, and CODES[1] is
   set to 0 to tell the caller there is no second one.  Any other id
   goes into CODES[1], preceded in CODES[0] by a code selected by the
   range the id falls in:

     0xA0..0xDF -> 0x9A
     0xE0..0xEF -> 0x9B
     0xF0..0xF4 -> 0x9C
     0xF5..     -> 0x9D

   ID is evaluated exactly once.  The expansion deliberately carries
   no trailing semicolon, so that `EMACS_MULE_LEADING_CODES (id, codes);'
   is a single statement and can be used in an unbraced if/else.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    const int leading_id_ = (id);                       \
                                                        \
    if (leading_id_ < 0xA0)                             \
      (codes)[0] = leading_id_, (codes)[1] = 0;         \
    else if (leading_id_ < 0xE0)                        \
      (codes)[0] = 0x9A, (codes)[1] = leading_id_;      \
    else if (leading_id_ < 0xF0)                        \
      (codes)[0] = 0x9B, (codes)[1] = leading_id_;      \
    else if (leading_id_ < 0xF5)                        \
      (codes)[0] = 0x9C, (codes)[1] = leading_id_;      \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = leading_id_;      \
  } while (0)
2589
2590
/* Encode the characters held in CODING->charbuf (CODING->charbuf_used
   elements) in the emacs-mule format, writing the bytes to
   CODING->destination starting at offset CODING->produced.

   A negative element of the character buffer introduces an
   annotation rather than a character; only the charset annotation
   has an effect here (it selects a preferred charset for the
   characters that follow).

   On return CODING->produced_char and CODING->produced have been
   updated and CODING_RESULT_SUCCESS has been recorded.  The return
   value is always 0.  */
static bool
encode_coding_emacs_mule (struct coding_system *coding)
{
  /* MULTIBYTEP, DST_END and PRODUCED_CHARS are not referenced
     directly below; presumably the ASSURE_DESTINATION and EMIT_*
     macros (defined elsewhere in this file) use them by name.  */
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Amount of destination space requested before each element is
     processed.  */
  int safe_room = 8;
  ptrdiff_t produced_chars = 0;
  Lisp_Object attrs, charset_list;
  int c;
  /* Charset id taken from the most recent charset annotation, or -1
     if there is none or it is not in CHARSET_LIST.  */
  int preferred_charset_id = -1;

  /* Always encode against Vemacs_mule_charset_list, and keep the
     coding system's attribute vector in step with that.  */
  CODING_GET_INFO (coding, attrs, charset_list);
  if (! EQ (charset_list, Vemacs_mule_charset_list))
    {
      charset_list = Vemacs_mule_charset_list;
      ASET (attrs, coding_attr_charset_list, charset_list);
    }

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  CHARBUF now points at the element
             following the negative header C, and that element tells
             the annotation kind.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              /* NOTE(review): the annotation is skipped below by
                 advancing -C - 1 elements from here; confirm against
                 the layout written by ADD_CHARSET_DATA that index 3
                 is still inside the annotation.  */
              preferred_charset_id = charbuf[3];
              /* Ignore a preferred charset this coding system does
                 not list.  */
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              emacs_abort ();
            }
          /* Skip the rest of the annotation; the header element has
             already been consumed.  */
          charbuf += -c - 1;
          continue;
        }

      if (ASCII_CHAR_P (c))
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw-byte character is written back as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;
          unsigned code;
          int dimension;
          int emacs_mule_id;
          unsigned char leading_codes[2];

          /* Find a charset for C and C's code in it, trying the
             preferred charset first when there is one.  */
          if (preferred_charset_id >= 0)
            {
              bool result;

              charset = CHARSET_FROM_ID (preferred_charset_id);
              CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
              if (result)
                code = ENCODE_CHAR (charset, c);
              else
                CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                     &code, charset);
            }
          else
            CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                 &code, charset);
          if (! charset)
            {
              /* No charset was found for C: substitute the coding
                 system's default character.  */
              c = coding->default_char;
              if (ASCII_CHAR_P (c))
                {
                  EMIT_ONE_ASCII_BYTE (c);
                  continue;
                }
              /* NOTE(review): CHARSET is used below without being
                 tested again; confirm this lookup cannot leave it
                 null for the default character.  */
              CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                   &code, charset);
            }
          dimension = CHARSET_DIMENSION (charset);
          emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
          /* Emit one or two leading codes (the second one is absent
             when LEADING_CODES[1] is 0), then the code itself as one
             or two bytes with the high bit set in each.  */
          EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
          EMIT_ONE_BYTE (leading_codes[0]);
          if (leading_codes[1])
            EMIT_ONE_BYTE (leading_codes[1]);
          if (dimension == 1)
            EMIT_ONE_BYTE (code | 0x80);
          else
            {
              code |= 0x8080;
              EMIT_ONE_BYTE (code >> 8);
              EMIT_ONE_BYTE (code & 0xFF);
            }
        }
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
2701
2702 \f
2703 /*** 7. ISO2022 handlers ***/
2704
2705 /* The following note describes the coding system ISO2022 briefly.
2706    Since the intention of this note is to help understand the
2707    functions in this file, some parts are NOT ACCURATE or are OVERLY
2708    SIMPLIFIED.  For thorough understanding, please refer to the
2709    original document of ISO2022.  This is equivalent to the standard
2710    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2711
2712    ISO2022 provides many mechanisms to encode several character sets
2713    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2714    is encoded using bytes less than 128.  This may make the encoded
2715    text a little bit longer, but the text passes more easily through
2716    several types of gateway, some of which strip off the MSB (Most
2717    Significant Bit).
2718
2719    There are two kinds of character sets: control character sets and
2720    graphic character sets.  The former contain control characters such
2721    as `newline' and `escape' to provide control functions (control
2722    functions are also provided by escape sequences).  The latter
2723    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2724    two control character sets and many graphic character sets.
2725
2726    Graphic character sets are classified into one of the following
2727    four classes, according to the number of bytes (DIMENSION) and
2728    number of characters in one dimension (CHARS) of the set:
2729    - DIMENSION1_CHARS94
2730    - DIMENSION1_CHARS96
2731    - DIMENSION2_CHARS94
2732    - DIMENSION2_CHARS96
2733
2734    In addition, each character set is assigned an identification tag,
2735    unique for each set, called the "final character" (denoted as <F>
2736    hereafter).  The <F> of each character set is decided by ECMA(*)
2737    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2738    (0x30..0x3F are for private use only).
2739
2740    Note (*): ECMA = European Computer Manufacturers Association
2741
2742    Here are examples of graphic character sets [NAME(<F>)]:
2743         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2744         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2745         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2746         o DIMENSION2_CHARS96 -- none for the moment
2747
2748    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2749         C0 [0x00..0x1F] -- control character plane 0
2750         GL [0x20..0x7F] -- graphic character plane 0
2751         C1 [0x80..0x9F] -- control character plane 1
2752         GR [0xA0..0xFF] -- graphic character plane 1
2753
2754    A control character set is directly designated and invoked to C0 or
2755    C1 by an escape sequence.  The most common case is that:
2756    - ISO646's  control character set is designated/invoked to C0, and
2757    - ISO6429's control character set is designated/invoked to C1,
2758    and usually these designations/invocations are omitted in encoded
2759    text.  In a 7-bit environment, only C0 can be used, and a control
2760    character for C1 is encoded by an appropriate escape sequence to
2761    fit into the environment.  All control characters for C1 are
2762    defined to have corresponding escape sequences.
2763
2764    A graphic character set is at first designated to one of four
2765    graphic registers (G0 through G3), then these graphic registers are
2766    invoked to GL or GR.  These designations and invocations can be
2767    done independently.  The most common case is that G0 is invoked to
2768    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2769    these invocations and designations are omitted in encoded text.
2770    In a 7-bit environment, only GL can be used.
2771
2772    When a graphic character set of CHARS94 is invoked to GL, codes
2773    0x20 and 0x7F of the GL area work as control characters SPACE and
2774    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2775    be used.
2776
2777    There are two ways of invocation: locking-shift and single-shift.
2778    With locking-shift, the invocation lasts until the next different
2779    invocation, whereas with single-shift, the invocation affects the
2780    following character only and doesn't affect the locking-shift
2781    state.  Invocations are done by the following control characters or
2782    escape sequences:
2783
2784    ----------------------------------------------------------------------
2785    abbrev  function                  cntrl escape seq   description
2786    ----------------------------------------------------------------------
2787    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2788    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2789    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2790    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2791    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2792    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
   LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2794    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2795    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2796    ----------------------------------------------------------------------
2797    (*) These are not used by any known coding system.
2798
2799    Control characters for these functions are defined by macros
2800    ISO_CODE_XXX in `coding.h'.
2801
2802    Designations are done by the following escape sequences:
2803    ----------------------------------------------------------------------
2804    escape sequence      description
2805    ----------------------------------------------------------------------
2806    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2807    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2808    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2809    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2810    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2811    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2812    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2813    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2814    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2815    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2816    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2817    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2818    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2819    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2820    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2821    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2822    ----------------------------------------------------------------------
2823
2824    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2825    of dimension 1, chars 94, and final character <F>, etc...
2826
2827    Note (*): Although these designations are not allowed in ISO2022,
2828    Emacs accepts them on decoding, and produces them on encoding
2829    CHARS96 character sets in a coding system which is characterized as
2830    7-bit environment, non-locking-shift, and non-single-shift.
2831
2832    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2833    '(' must be omitted.  We refer to this as "short-form" hereafter.
2834
2835    Now you may notice that there are a lot of ways of encoding the
2836    same multilingual text in ISO2022.  Actually, there exist many
   coding systems such as Compound Text (used in X11's inter-client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2839    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2840    localized platforms), and all of these are variants of ISO2022.
2841
2842    In addition to the above, Emacs handles two more kinds of escape
2843    sequences: ISO6429's direction specification and Emacs' private
2844    sequence for specifying character composition.
2845
2846    ISO6429's direction specification takes the following form:
2847         o CSI ']'      -- end of the current direction
2848         o CSI '0' ']'  -- end of the current direction
2849         o CSI '1' ']'  -- start of left-to-right text
2850         o CSI '2' ']'  -- start of right-to-left text
2851    The control character CSI (0x9B: control sequence introducer) is
2852    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2853
2854    Character composition specification takes the following form:
2855         o ESC '0' -- start relative composition
2856         o ESC '1' -- end composition
2857         o ESC '2' -- start rule-base composition (*)
2858         o ESC '3' -- start relative composition with alternate chars  (**)
2859         o ESC '4' -- start rule-base composition with alternate chars  (**)
2860   Since these are not standard escape sequences of any ISO standard,
2861   the use of them with these meanings is restricted to Emacs only.
2862
2863   (*) This form is used only in Emacs 20.7 and older versions,
2864   but newer versions can safely decode it.
2865   (**) This form is used only in Emacs 21.1 and newer versions,
2866   and older versions can't decode it.
2867
2868   Here's a list of example usages of these composition escape
2869   sequences (categorized by `enum composition_method').
2870
2871   COMPOSITION_RELATIVE:
2872         ESC 0 CHAR [ CHAR ] ESC 1
2873   COMPOSITION_WITH_RULE:
2874         ESC 2 CHAR [ RULE CHAR ] ESC 1
2875   COMPOSITION_WITH_ALTCHARS:
2876         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2877   COMPOSITION_WITH_RULE_ALTCHARS:
2878         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2879
/* Table of 256 `enum iso_code_class_type' values, i.e. one entry per
   possible byte value.  It is not initialized here; presumably it is
   filled in by setup code elsewhere in this file.  */
static enum iso_code_class_type iso_code_class[256];
2881
/* Return true if the charset whose id is ID is safe for CODING, that
   is, if ID lies within 0..CODING->max_charset_id and its entry in
   CODING->safe_charsets is not 255.

   A negative ID (such as a failed charset-table lookup) is never
   safe; testing for it here keeps such a value from indexing before
   the start of the table.  ID is evaluated more than once, so it
   must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)              \
  ((id) >= 0                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[id] != 255)
2885
2886 static void
2887 setup_iso_safe_charsets (Lisp_Object attrs)
2888 {
2889   Lisp_Object charset_list, safe_charsets;
2890   Lisp_Object request;
2891   Lisp_Object reg_usage;
2892   Lisp_Object tail;
2893   EMACS_INT reg94, reg96;
2894   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2895   int max_charset_id;
2896
2897   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2898   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2899       && ! EQ (charset_list, Viso_2022_charset_list))
2900     {
2901       charset_list = Viso_2022_charset_list;
2902       ASET (attrs, coding_attr_charset_list, charset_list);
2903       ASET (attrs, coding_attr_safe_charsets, Qnil);
2904     }
2905
2906   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2907     return;
2908
2909   max_charset_id = 0;
2910   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2911     {
2912       int id = XINT (XCAR (tail));
2913       if (max_charset_id < id)
2914         max_charset_id = id;
2915     }
2916
2917   safe_charsets = make_uninit_string (max_charset_id + 1);
2918   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2919   request = AREF (attrs, coding_attr_iso_request);
2920   reg_usage = AREF (attrs, coding_attr_iso_usage);
2921   reg94 = XINT (XCAR (reg_usage));
2922   reg96 = XINT (XCDR (reg_usage));
2923
2924   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2925     {
2926       Lisp_Object id;
2927       Lisp_Object reg;
2928       struct charset *charset;
2929
2930       id = XCAR (tail);
2931       charset = CHARSET_FROM_ID (XINT (id));
2932       reg = Fcdr (Fassq (id, request));
2933       if (! NILP (reg))
2934         SSET (safe_charsets, XINT (id), XINT (reg));
2935       else if (charset->iso_chars_96)
2936         {
2937           if (reg96 < 4)
2938             SSET (safe_charsets, XINT (id), reg96);
2939         }
2940       else
2941         {
2942           if (reg94 < 4)
2943             SSET (safe_charsets, XINT (id), reg94);
2944         }
2945     }
2946   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2947 }
2948
2949
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in one of ISO-2022 based coding
   systems.

   The source bytes of CODING are scanned while two bit masks are
   accumulated: FOUND (categories for which positive evidence was
   seen) and REJECTED (categories that were ruled out).
   CATEGORY_MASK_ISO is always added to DETECT_INFO->checked.

   If every ISO category gets rejected during the scan, add
   CATEGORY_MASK_ISO to DETECT_INFO->rejected and return false.
   Otherwise the function finishes at the `no_more_source' label,
   merges REJECTED and the non-rejected part of FOUND into
   DETECT_INFO, and returns true.  Nothing in this function jumps to
   that label explicitly; presumably ONE_MORE_BYTE (defined elsewhere
   in this file) does so when the source is exhausted.  */

static bool
detect_coding_iso_2022 (struct coding_system *coding,
                        struct coding_detection_info *detect_info)
{
  /* SRC_END, MULTIBYTEP and CONSUMED_CHARS are not referenced
     directly below; presumably ONE_MORE_BYTE uses them by name.  */
  const unsigned char *src = coding->source, *src_base = src;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  /* True right after a single-shift (SS2/SS3) has been seen.  */
  bool single_shifting = 0;
  int id;
  int c, c1;
  ptrdiff_t consumed_chars = 0;
  int i;
  int rejected = 0;
  int found = 0;
  /* Number of characters counted since a composition-start escape
     sequence, or -1 when not inside a composition.  */
  int composition_count = -1;

  detect_info->checked |= CATEGORY_MASK_ISO;

  /* Cache, in each ISO coding category that is in use, the safe
     charsets table that SAFE_CHARSET_P consults below.  */
  for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
    {
      struct coding_system *this = &(coding_categories[i]);
      Lisp_Object attrs, val;

      if (this->id < 0)
        continue;
      attrs = CODING_ID_ATTRS (this->id);
      if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
        setup_iso_safe_charsets (attrs);
      val = CODING_ATTR_SAFE_CHARSETS (attrs);
      this->max_charset_id = SCHARS (val) - 1;
      this->safe_charsets = SDATA (val);
    }

  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Scan until every ISO category has been rejected.  */
  while (rejected != CATEGORY_MASK_ISO)
    {
      src_base = src;
      ONE_MORE_BYTE (c);
      switch (c)
        {
        case ISO_CODE_ESC:
          if (inhibit_iso_escape_detection)
            break;
          single_shifting = 0;
          ONE_MORE_BYTE (c);
          if (c == 'N' || c == 'O')
            {
              /* ESC <Fe> for SS2 or SS3.  */
              single_shifting = 1;
              rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
            }
          else if (c == '1')
            {
              /* End of composition.  */
              if (composition_count < 0
                  || composition_count > MAX_COMPOSITION_COMPONENTS)
                /* Invalid: no composition was started, or it has too
                   many components.  */
                break;
              composition_count = -1;
              found |= CATEGORY_MASK_ISO;
            }
          else if (c >= '0' && c <= '4')
            {
              /* ESC <Fp> for start of composition ('1', the end of
                 composition, was handled above).  Start counting its
                 characters.  */
              composition_count = 0;
            }
          else
            {
              /* Anything else must be a designation sequence.  Every
                 valid form leaves the designated charset in ID; every
                 invalid form leaves the switch.  */
              if (c >= '(' && c <= '/')
                {
                  /* Designation sequence for a charset of dimension 1.  */
                  ONE_MORE_BYTE (c1);
                  if (c1 < ' ' || c1 >= 0x80
                      || (id = iso_charset_table[0][c >= ','][c1]) < 0)
                    {
                      /* Invalid designation sequence.  Just ignore.  */
                      if (c1 >= 0x80)
                        rejected |= (CATEGORY_MASK_ISO_7BIT
                                     | CATEGORY_MASK_ISO_7_ELSE);
                      break;
                    }
                }
              else if (c == '$')
                {
                  /* Designation sequence for a charset of dimension 2.  */
                  ONE_MORE_BYTE (c);
                  if (c >= '@' && c <= 'B')
                    /* Designation for JISX0208.1978, GB2312, or JISX0208.
                       NOTE(review): unlike the two other table lookups
                       in this function, this one is not checked for a
                       negative result before ID is used; confirm these
                       three entries can never be negative.  */
                    id = iso_charset_table[1][0][c];
                  else if (c >= '(' && c <= '/')
                    {
                      ONE_MORE_BYTE (c1);
                      if (c1 < ' ' || c1 >= 0x80
                          || (id = iso_charset_table[1][c >= ','][c1]) < 0)
                        {
                          /* Invalid designation sequence.  Just ignore.  */
                          if (c1 >= 0x80)
                            rejected |= (CATEGORY_MASK_ISO_7BIT
                                         | CATEGORY_MASK_ISO_7_ELSE);
                          break;
                        }
                    }
                  else
                    {
                      /* Invalid designation sequence.  Just ignore it.  */
                      if (c >= 0x80)
                        rejected |= (CATEGORY_MASK_ISO_7BIT
                                     | CATEGORY_MASK_ISO_7_ELSE);
                      break;
                    }
                }
              else
                {
                  /* Invalid escape sequence.  Just ignore it.  */
                  if (c >= 0x80)
                    rejected |= (CATEGORY_MASK_ISO_7BIT
                                 | CATEGORY_MASK_ISO_7_ELSE);
                  break;
                }

              /* We found a valid designation sequence for CHARSET.
                 Each of the categories below is found or rejected
                 according to whether the charset is safe for it.  */
              rejected |= CATEGORY_MASK_ISO_8BIT;
              if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
                                  id))
                found |= CATEGORY_MASK_ISO_7;
              else
                rejected |= CATEGORY_MASK_ISO_7;
              if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
                                  id))
                found |= CATEGORY_MASK_ISO_7_TIGHT;
              else
                rejected |= CATEGORY_MASK_ISO_7_TIGHT;
              if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
                                  id))
                found |= CATEGORY_MASK_ISO_7_ELSE;
              else
                rejected |= CATEGORY_MASK_ISO_7_ELSE;
              if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
                                  id))
                found |= CATEGORY_MASK_ISO_8_ELSE;
              else
                rejected |= CATEGORY_MASK_ISO_8_ELSE;
            }
          break;

        case ISO_CODE_SO:
        case ISO_CODE_SI:
          /* Locking shift out/in.  */
          if (inhibit_iso_escape_detection)
            break;
          single_shifting = 0;
          rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
          break;

        case ISO_CODE_CSI:
          /* Control sequence introducer.  */
          single_shifting = 0;
          rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
          found |= CATEGORY_MASK_ISO_8_ELSE;
          goto check_extra_latin;

        case ISO_CODE_SS2:
        case ISO_CODE_SS3:
          /* Single shift.  It counts as evidence for iso-8-1 and
             iso-8-2 only if the respective category has the
             single-shift flag; if neither has it, the byte is checked
             as an extra Latin code instead.  */
          if (inhibit_iso_escape_detection)
            break;
          single_shifting = 0;
          rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
          if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
              & CODING_ISO_FLAG_SINGLE_SHIFT)
            {
              found |= CATEGORY_MASK_ISO_8_1;
              single_shifting = 1;
            }
          if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
              & CODING_ISO_FLAG_SINGLE_SHIFT)
            {
              found |= CATEGORY_MASK_ISO_8_2;
              single_shifting = 1;
            }
          if (single_shifting)
            break;
          goto check_extra_latin;

        default:
          /* A negative C is skipped without affecting the state;
             presumably ONE_MORE_BYTE yields it for input that is not
             a plain byte.  */
          if (c < 0)
            continue;
          if (c < 0x80)
            {
              /* A 7-bit byte; inside a composition it counts as one
                 character.  */
              if (composition_count >= 0)
                composition_count++;
              single_shifting = 0;
              break;
            }
          rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
          if (c >= 0xA0)
            {
              found |= CATEGORY_MASK_ISO_8_1;
              /* Check the length of succeeding codes of the range
                 0xA0..0xFF.  If the byte length is even, we include
                 CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
                 only when we are not single shifting.  */
              if (! single_shifting
                  && ! (rejected & CATEGORY_MASK_ISO_8_2))
                {
                  ptrdiff_t len = 1;
                  while (src < src_end)
                    {
                      src_base = src;
                      ONE_MORE_BYTE (c);
                      if (c < 0xA0)
                        {
                          /* Put back the byte that ended the run.  */
                          src = src_base;
                          break;
                        }
                      len++;
                    }

                  /* An odd-length run rejects iso-8-2 only if it did
                     not reach the end of the source.  */
                  if (len & 1 && src < src_end)
                    {
                      rejected |= CATEGORY_MASK_ISO_8_2;
                      if (composition_count >= 0)
                        composition_count += len;
                    }
                  else
                    {
                      found |= CATEGORY_MASK_ISO_8_2;
                      if (composition_count >= 0)
                        composition_count += len / 2;
                    }
                }
              break;
            }
          /* Reached by falling through for a byte in 0x80..0x9F, and
             by the gotos in the CSI and SS2/SS3 cases above.  Unless
             Vlatin_extra_code_table is a vector with a non-nil entry
             for C, every ISO category is rejected.  */
        check_extra_latin:
          if (! VECTORP (Vlatin_extra_code_table)
              || NILP (AREF (Vlatin_extra_code_table, c)))
            {
              rejected = CATEGORY_MASK_ISO;
              break;
            }
          if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
              & CODING_ISO_FLAG_LATIN_EXTRA)
            found |= CATEGORY_MASK_ISO_8_1;
          else
            rejected |= CATEGORY_MASK_ISO_8_1;
          rejected |= CATEGORY_MASK_ISO_8_2;
          break;
        }
    }
  /* The loop ended because every ISO category was rejected.  */
  detect_info->rejected |= CATEGORY_MASK_ISO;
  return 0;

 no_more_source:
  /* Report what was ruled out, and what was found and not ruled
     out.  */
  detect_info->rejected |= rejected;
  detect_info->found |= (found & ~rejected);
  return 1;
}
3214
3215
/* Record in CODING that the charset selected by DIM, CHARS_96 and the
   final byte FINAL is designated to graphic register REG.

   The designation of REG becomes -2 and CHARS_96 is set to -1 when
   FINAL is outside '0'..127, when ISO_CHARSET_TABLE yields a negative
   id for (DIM, CHARS_96, FINAL), or when SAFE_CHARSET_P rejects that
   id for CODING.  CHARS_96 is also set to -1 when REG previously held
   such a -2 designation and the charset designated now is ASCII.  A
   CHARS_96 of -1 tells the caller that the escape sequence should be
   kept.

   The arguments are evaluated more than once, CHARS_96 must be
   assignable, and a variable `coding' must be in scope.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int new_id, old_id;                                                 \
                                                                        \
    if (! (final >= '0' && final < 128                                  \
           && (new_id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0  \
           && SAFE_CHARSET_P (coding, new_id)))                         \
      {                                                                 \
        /* Unusable designation: remember that fact in REG.  */         \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        old_id = CODING_ISO_DESIGNATION (coding, reg);                  \
        /* Optionally substitute ASCII for JISX0201-Roman and           \
           JISX0208 for JISX0208-1978, as the coding flags request.  */ \
        if (new_id == charset_jisx0201_roman)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              new_id = charset_ascii;                                   \
          }                                                             \
        else if (new_id == charset_jisx0208_1978)                       \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              new_id = charset_jisx0208;                                \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = new_id;                  \
        /* ASCII designated over an earlier invalid designation of      \
           REG: this designation sequence has to be kept too.  */       \
        if (old_id == -2 && new_id == charset_ascii)                    \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
3248
3249
/* Handle these composition sequences (ALT: alternate char):
3251
3252    (1) relative composition: ESC 0 CHAR ... ESC 1
3253    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3254    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3255    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3256
3257    When the start sequence (ESC 0/2/3/4) is found, this annotation
3258    header is produced.
3259
3260         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3261
3262    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3263    produced until the end sequence (ESC 1) is found:
3264
3265    (1) CHAR ... CHAR
3266    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3267    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3268    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3269
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3272
3273    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3274    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3276    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3277
3278    If an error is found while composing, the annotation header is
3279    changed to:
3280
        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3282
3283    and the sequence [ -2 DECODED-RULE ] is changed to the original
3284    byte sequence as below:
3285         o the original byte sequence is B: [ B -1 ]
3286         o the original byte sequence is B1 B2: [ B1 B2 ]
3287    and the sequence [ -1 -1 ] is changed to the original byte
3288    sequence:
3289         [ ESC '0' ]
3290 */
3291
/* Decode the composition rule that starts with the byte held in C1,
   reading one more source byte when the two-byte format is used, and
   leave the encoded rule in RULE.

     o C1 - 32 in 0..80: old format (before ver.21), a single byte.
       The value is split into quotient and remainder by 9, a 4 in
       either part is replaced by 10, and the pair is packed with
       COMPOSITION_ENCODE_RULE.
     o C1 - 32 >= 81: new format (after ver.21), two bytes.  The pair
       (C1 - 32 - 81, next byte - 32) is packed with
       COMPOSITION_ENCODE_RULE and 0x100 is added so that the result
       can be told apart from an old-format rule.

   Control jumps to invalid_code when C1 is below 32 or when
   COMPOSITION_ENCODE_RULE_VALID rejects the two-byte pair.  RULE must
   be assignable; `c1' and everything ONE_MORE_BYTE relies on must be
   in scope.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)                                                     \
      {                                                                 \
        /* New format (after ver.21): a second byte follows.  */        \
        int second_byte;                                                \
                                                                        \
        ONE_MORE_BYTE (second_byte);                                    \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81,                 \
                                             second_byte - 32))         \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second_byte - 32);   \
        rule += 0x100;  /* Distinguish it from the old format.  */      \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Old format (before ver.21): one byte holds both parts.  */   \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
3320
/* Turn the decoded composition rule RULE back into the byte(s) it was
   read from, store them in charbuf[idx] and charbuf[idx + 1], and add
   the number of restored bytes to new_chars.  This undoes
   DECODE_COMPOSITION_RULE:

     o old format (RULE < 0x100): one byte B, stored as [ B -1 ]
     o new format (RULE >= 0x100): two bytes B1 B2, stored as [ B1 B2 ]

   RULE is evaluated exactly once, before charbuf is written, so it
   may be any expression, including one that reads charbuf[idx + 1].
   Variables `charbuf', `idx' and `new_chars' must be in scope.  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    /* Capture the argument once: this keeps operator precedence        \
       inside RULE from leaking into the arithmetic below and avoids    \
       re-reading a slot that is about to be overwritten.  */           \
    const int encoded_rule_ = (rule);                                   \
    int gref = (encoded_rule_ % 0x100) / 12;                            \
    int nref = (encoded_rule_ % 0x100) % 12;                            \
                                                                        \
    if (encoded_rule_ < 0x100)          /* old format */                \
      {                                                                 \
        /* DECODE_COMPOSITION_RULE replaced 4 by 10; undo that.  */     \
        if (gref == 10) gref = 4;                                       \
        if (nref == 10) nref = 4;                                       \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
    else                                /* new format */                \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
  } while (0)
3340
3341 /* Finish the current composition as invalid.  */
3342
3343 static int
3344 finish_composition (int *charbuf, struct composition_status *cmp_status)
3345 {
3346   int idx = - cmp_status->length;
3347   int new_chars;
3348
3349   /* Recover the original ESC sequence */
3350   charbuf[idx++] = ISO_CODE_ESC;
3351   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3352                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3353                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3354                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3355                     : '4');
3356   charbuf[idx++] = -2;
3357   charbuf[idx++] = 0;
3358   charbuf[idx++] = -1;
3359   new_chars = cmp_status->nchars;
3360   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3361     for (; idx < 0; idx++)
3362       {
3363         int elt = charbuf[idx];
3364
3365         if (elt == -2)
3366           {
3367             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3368             idx++;
3369           }
3370         else if (elt == -1)
3371           {
3372             charbuf[idx++] = ISO_CODE_ESC;
3373             charbuf[idx] = '0';
3374             new_chars += 2;
3375           }
3376       }
3377   cmp_status->state = COMPOSING_NO;
3378   return new_chars;
3379 }
3380
/* When a composition is in progress (cmp_status->state is not
   COMPOSING_NO), finish it via finish_composition and add the value
   that function returns to char_offset.  Otherwise do nothing.
   Variables `cmp_status', `charbuf' and `char_offset' must be in
   scope.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;            /* Leaves this do-while only.  */       \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3387
/* Handle a composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

     ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
     ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
     ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
     ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 met while the alternate chars of an ESC 3 composition, or
   of an ESC 4 composition that is waiting for a rule, are being read
   does not start anything new: it ends the alternate chars.  The pair
   [ -1 -1 ] is appended to charbuf and the state becomes
   COMPOSING_CHAR.

   Every other case starts a composition.  A composition still in
   progress is first finished by MAYBE_FINISH_COMPOSITION; then the
   method and state are chosen from C1, ADD_COMPOSITION_DATA appends
   the annotation header (with zeros for its two middle arguments),
   and the bookkeeping in cmp_status is reset.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    bool altchars_end                                                      \
      = (c1 == '0'                                                         \
         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                \
              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)          \
             || (cmp_status->state == COMPOSING_COMPONENT_RULE             \
                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))); \
                                                                           \
    if (altchars_end)                                                      \
      {                                                                    \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if (c1 == '0')                                                     \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if (c1 == '2')                                                \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if (c1 == '3')                                                \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        /* ESC 3 and ESC 4 begin with the alternate chars.  */             \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3428
3429
/* Handle the composition end sequence ESC 1.

   The sequence is accepted only when at least one composed char has
   been stored and the state fits the method: for
   COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR, for
   every other method it must be COMPOSING_CHAR.  Otherwise the
   composition is finished as invalid and control jumps to
   invalid_code.

   On success the annotation header, found cmp_status->length elements
   before charbuf, is completed: its first element (-LENGTH) is
   lowered by the space taken by the alternate chars (ncomps + 2 for
   COMPOSITION_WITH_ALTCHARS, ncomps * 3 for
   COMPOSITION_WITH_RULE_ALTCHARS), its third element receives the
   number of composed chars, char_offset advances by that number, and
   the state returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *cmp_header;                                                    \
                                                                        \
    if (! (cmp_status->nchars != 0                                      \
           && ((cmp_status->state == COMPOSING_CHAR)                    \
               != (cmp_status->method == COMPOSITION_WITH_RULE))))      \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    cmp_header = charbuf - cmp_status->length;                          \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      cmp_header[0] -= cmp_status->ncomps + 2;                          \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      cmp_header[0] -= cmp_status->ncomps * 3;                          \
    cmp_header[2] = cmp_status->nchars;                                 \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3449
/* Append the pair [ -2 RULE ] to charbuf, count both elements in
   cmp_status->length, and step cmp_status->state back by one.
   NOTE(review): the decrement presumably moves from a rule state to
   the matching char state; that depends on the order of the state
   constants, which are defined elsewhere -- confirm.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3459
/* Append C, a composed char or a component (alternate) char, to
   charbuf and update cmp_status: length grows by one, and either
   nchars (state is COMPOSING_CHAR) or ncomps (any other state) is
   incremented.  When a rule has to follow the char -- method
   COMPOSITION_WITH_RULE, or COMPOSITION_WITH_RULE_ALTCHARS while in
   state COMPOSING_COMPONENT_CHAR -- the state is advanced by one.
   NOTE(review): the increment presumably selects the matching rule
   state; that depends on the order of the state constants, which are
   defined elsewhere -- confirm.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE                     \
        || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
            && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
      ++cmp_status->state;                                              \
  } while (0)
3476
3477
3478 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3479
3480 static void
3481 decode_coding_iso_2022 (struct coding_system *coding)
3482 {
3483   const unsigned char *src = coding->source + coding->consumed;
3484   const unsigned char *src_end = coding->source + coding->src_bytes;
3485   const unsigned char *src_base;
3486   int *charbuf = coding->charbuf + coding->charbuf_used;
3487   /* We may produce two annotations (charset and composition) in one
3488      loop and one more charset annotation at the end.  */
3489   int *charbuf_end
3490     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3491   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3492   bool multibytep = coding->src_multibyte;
3493   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3494   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3495   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3496   int charset_id_2, charset_id_3;
3497   struct charset *charset;
3498   int c;
3499   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3500   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3501   ptrdiff_t char_offset = coding->produced_char;
3502   ptrdiff_t last_offset = char_offset;
3503   int last_id = charset_ascii;
3504   bool eol_dos
3505     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3506   int byte_after_cr = -1;
3507   int i;
3508
3509   setup_iso_safe_charsets (attrs);
3510   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3511
3512   if (cmp_status->state != COMPOSING_NO)
3513     {
3514       if (charbuf_end - charbuf < cmp_status->length)
3515         emacs_abort ();
3516       for (i = 0; i < cmp_status->length; i++)
3517         *charbuf++ = cmp_status->carryover[i];
3518       coding->annotated = 1;
3519     }
3520
3521   while (1)
3522     {
3523       int c1, c2, c3;
3524
3525       src_base = src;
3526       consumed_chars_base = consumed_chars;
3527
3528       if (charbuf >= charbuf_end)
3529         {
3530           if (byte_after_cr >= 0)
3531             src_base--;
3532           break;
3533         }
3534
3535       if (byte_after_cr >= 0)
3536         c1 = byte_after_cr, byte_after_cr = -1;
3537       else
3538         ONE_MORE_BYTE (c1);
3539       if (c1 < 0)
3540         goto invalid_code;
3541
3542       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3543         {
3544           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3545           char_offset++;
3546           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3547           continue;
3548         }
3549
3550       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3551         {
3552           if (c1 == ISO_CODE_ESC)
3553             {
3554               if (src + 1 >= src_end)
3555                 goto no_more_source;
3556               *charbuf++ = ISO_CODE_ESC;
3557               char_offset++;
3558               if (src[0] == '%' && src[1] == '@')
3559                 {
3560                   src += 2;
3561                   consumed_chars += 2;
3562                   char_offset += 2;
3563                   /* We are sure charbuf can contain two more chars. */
3564                   *charbuf++ = '%';
3565                   *charbuf++ = '@';
3566                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3567                 }
3568             }
3569           else
3570             {
3571               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3572               char_offset++;
3573             }
3574           continue;
3575         }
3576
3577       if ((cmp_status->state == COMPOSING_RULE
3578            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3579           && c1 != ISO_CODE_ESC)
3580         {
3581           int rule;
3582
3583           DECODE_COMPOSITION_RULE (rule);
3584           STORE_COMPOSITION_RULE (rule);
3585           continue;
3586         }
3587
3588       /* We produce at most one character.  */
3589       switch (iso_code_class [c1])
3590         {
3591         case ISO_0x20_or_0x7F:
3592           if (charset_id_0 < 0
3593               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3594             /* This is SPACE or DEL.  */
3595             charset = CHARSET_FROM_ID (charset_ascii);
3596           else
3597             charset = CHARSET_FROM_ID (charset_id_0);
3598           break;
3599
3600         case ISO_graphic_plane_0:
3601           if (charset_id_0 < 0)
3602             charset = CHARSET_FROM_ID (charset_ascii);
3603           else
3604             charset = CHARSET_FROM_ID (charset_id_0);
3605           break;
3606
3607         case ISO_0xA0_or_0xFF:
3608           if (charset_id_1 < 0
3609               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3610               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3611             goto invalid_code;
3612           /* This is a graphic character, we fall down ... */
3613
3614         case ISO_graphic_plane_1:
3615           if (charset_id_1 < 0)
3616             goto invalid_code;
3617           charset = CHARSET_FROM_ID (charset_id_1);
3618           break;
3619
3620         case ISO_control_0:
3621           if (eol_dos && c1 == '\r')
3622             ONE_MORE_BYTE (byte_after_cr);
3623           MAYBE_FINISH_COMPOSITION ();
3624           charset = CHARSET_FROM_ID (charset_ascii);
3625           break;
3626
3627         case ISO_control_1:
3628           goto invalid_code;
3629
3630         case ISO_shift_out:
3631           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3632               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3633             goto invalid_code;
3634           CODING_ISO_INVOCATION (coding, 0) = 1;
3635           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3636           continue;
3637
3638         case ISO_shift_in:
3639           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3640             goto invalid_code;
3641           CODING_ISO_INVOCATION (coding, 0) = 0;
3642           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3643           continue;
3644
3645         case ISO_single_shift_2_7:
3646           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3647             goto invalid_code;
3648         case ISO_single_shift_2:
3649           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3650             goto invalid_code;
3651           /* SS2 is handled as an escape sequence of ESC 'N' */
3652           c1 = 'N';
3653           goto label_escape_sequence;
3654
3655         case ISO_single_shift_3:
3656           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3657             goto invalid_code;
3658           /* SS2 is handled as an escape sequence of ESC 'O' */
3659           c1 = 'O';
3660           goto label_escape_sequence;
3661
3662         case ISO_control_sequence_introducer:
3663           /* CSI is handled as an escape sequence of ESC '[' ...  */
3664           c1 = '[';
3665           goto label_escape_sequence;
3666
3667         case ISO_escape:
3668           ONE_MORE_BYTE (c1);
3669         label_escape_sequence:
3670           /* Escape sequences handled here are invocation,
3671              designation, direction specification, and character
3672              composition specification.  */
3673           switch (c1)
3674             {
3675             case '&':           /* revision of following character set */
3676               ONE_MORE_BYTE (c1);
3677               if (!(c1 >= '@' && c1 <= '~'))
3678                 goto invalid_code;
3679               ONE_MORE_BYTE (c1);
3680               if (c1 != ISO_CODE_ESC)
3681                 goto invalid_code;
3682               ONE_MORE_BYTE (c1);
3683               goto label_escape_sequence;
3684
3685             case '$':           /* designation of 2-byte character set */
3686               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3687                 goto invalid_code;
3688               {
3689                 int reg, chars96;
3690
3691                 ONE_MORE_BYTE (c1);
3692                 if (c1 >= '@' && c1 <= 'B')
3693                   {     /* designation of JISX0208.1978, GB2312.1980,
3694                            or JISX0208.1980 */
3695                     reg = 0, chars96 = 0;
3696                   }
3697                 else if (c1 >= 0x28 && c1 <= 0x2B)
3698                   { /* designation of DIMENSION2_CHARS94 character set */
3699                     reg = c1 - 0x28, chars96 = 0;
3700                     ONE_MORE_BYTE (c1);
3701                   }
3702                 else if (c1 >= 0x2C && c1 <= 0x2F)
3703                   { /* designation of DIMENSION2_CHARS96 character set */
3704                     reg = c1 - 0x2C, chars96 = 1;
3705                     ONE_MORE_BYTE (c1);
3706                   }
3707                 else
3708                   goto invalid_code;
3709                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3710                 /* We must update these variables now.  */
3711                 if (reg == 0)
3712                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3713                 else if (reg == 1)
3714                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3715                 if (chars96 < 0)
3716                   goto invalid_code;
3717               }
3718               continue;
3719
3720             case 'n':           /* invocation of locking-shift-2 */
3721               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3722                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3723                 goto invalid_code;
3724               CODING_ISO_INVOCATION (coding, 0) = 2;
3725               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3726               continue;
3727
3728             case 'o':           /* invocation of locking-shift-3 */
3729               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3730                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3731                 goto invalid_code;
3732               CODING_ISO_INVOCATION (coding, 0) = 3;
3733               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3734               continue;
3735
3736             case 'N':           /* invocation of single-shift-2 */
3737               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3738                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3739                 goto invalid_code;
3740               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3741               if (charset_id_2 < 0)
3742                 charset = CHARSET_FROM_ID (charset_ascii);
3743               else
3744                 charset = CHARSET_FROM_ID (charset_id_2);
3745               ONE_MORE_BYTE (c1);
3746               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3747                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3748                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3749                           ? c1 >= 0x80 : c1 < 0x80)))
3750                 goto invalid_code;
3751               break;
3752
3753             case 'O':           /* invocation of single-shift-3 */
3754               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3755                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3756                 goto invalid_code;
3757               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3758               if (charset_id_3 < 0)
3759                 charset = CHARSET_FROM_ID (charset_ascii);
3760               else
3761                 charset = CHARSET_FROM_ID (charset_id_3);
3762               ONE_MORE_BYTE (c1);
3763               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3764                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3765                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3766                           ? c1 >= 0x80 : c1 < 0x80)))
3767                 goto invalid_code;
3768               break;
3769
3770             case '0': case '2': case '3': case '4': /* start composition */
3771               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3772                 goto invalid_code;
3773               if (last_id != charset_ascii)
3774                 {
3775                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3776                   last_id = charset_ascii;
3777                   last_offset = char_offset;
3778                 }
3779               DECODE_COMPOSITION_START (c1);
3780               continue;
3781
3782             case '1':           /* end composition */
3783               if (cmp_status->state == COMPOSING_NO)
3784                 goto invalid_code;
3785               DECODE_COMPOSITION_END ();
3786               continue;
3787
3788             case '[':           /* specification of direction */
3789               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3790                 goto invalid_code;
3791               /* For the moment, nested direction is not supported.
3792                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3793                  left-to-right, and nonzero means right-to-left.  */
3794               ONE_MORE_BYTE (c1);
3795               switch (c1)
3796                 {
3797                 case ']':       /* end of the current direction */
3798                   coding->mode &= ~CODING_MODE_DIRECTION;
3799
3800                 case '0':       /* end of the current direction */
3801                 case '1':       /* start of left-to-right direction */
3802                   ONE_MORE_BYTE (c1);
3803                   if (c1 == ']')
3804                     coding->mode &= ~CODING_MODE_DIRECTION;
3805                   else
3806                     goto invalid_code;
3807                   break;
3808
3809                 case '2':       /* start of right-to-left direction */
3810                   ONE_MORE_BYTE (c1);
3811                   if (c1 == ']')
3812                     coding->mode |= CODING_MODE_DIRECTION;
3813                   else
3814                     goto invalid_code;
3815                   break;
3816
3817                 default:
3818                   goto invalid_code;
3819                 }
3820               continue;
3821
3822             case '%':
3823               ONE_MORE_BYTE (c1);
3824               if (c1 == '/')
3825                 {
3826                   /* CTEXT extended segment:
3827                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3828                      We keep these bytes as is for the moment.
3829                      They may be decoded by post-read-conversion.  */
3830                   int dim, M, L;
3831                   int size;
3832
3833                   ONE_MORE_BYTE (dim);
3834                   if (dim < '0' || dim > '4')
3835                     goto invalid_code;
3836                   ONE_MORE_BYTE (M);
3837                   if (M < 128)
3838                     goto invalid_code;
3839                   ONE_MORE_BYTE (L);
3840                   if (L < 128)
3841                     goto invalid_code;
3842                   size = ((M - 128) * 128) + (L - 128);
3843                   if (charbuf + 6 > charbuf_end)
3844                     goto break_loop;
3845                   *charbuf++ = ISO_CODE_ESC;
3846                   *charbuf++ = '%';
3847                   *charbuf++ = '/';
3848                   *charbuf++ = dim;
3849                   *charbuf++ = BYTE8_TO_CHAR (M);
3850                   *charbuf++ = BYTE8_TO_CHAR (L);
3851                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3852                 }
3853               else if (c1 == 'G')
3854                 {
3855                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3856                      ESC % G --UTF-8-BYTES-- ESC % @
3857                      We keep these bytes as is for the moment.
3858                      They may be decoded by post-read-conversion.  */
3859                   if (charbuf + 3 > charbuf_end)
3860                     goto break_loop;
3861                   *charbuf++ = ISO_CODE_ESC;
3862                   *charbuf++ = '%';
3863                   *charbuf++ = 'G';
3864                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3865                 }
3866               else
3867                 goto invalid_code;
3868               continue;
3869               break;
3870
3871             default:
3872               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3873                 goto invalid_code;
3874               {
3875                 int reg, chars96;
3876
3877                 if (c1 >= 0x28 && c1 <= 0x2B)
3878                   { /* designation of DIMENSION1_CHARS94 character set */
3879                     reg = c1 - 0x28, chars96 = 0;
3880                     ONE_MORE_BYTE (c1);
3881                   }
3882                 else if (c1 >= 0x2C && c1 <= 0x2F)
3883                   { /* designation of DIMENSION1_CHARS96 character set */
3884                     reg = c1 - 0x2C, chars96 = 1;
3885                     ONE_MORE_BYTE (c1);
3886                   }
3887                 else
3888                   goto invalid_code;
3889                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3890                 /* We must update these variables now.  */
3891                 if (reg == 0)
3892                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3893                 else if (reg == 1)
3894                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3895                 if (chars96 < 0)
3896                   goto invalid_code;
3897               }
3898               continue;
3899             }
3900           break;
3901
3902         default:
3903           emacs_abort ();
3904         }
3905
3906       if (cmp_status->state == COMPOSING_NO
3907           && charset->id != charset_ascii
3908           && last_id != charset->id)
3909         {
3910           if (last_id != charset_ascii)
3911             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3912           last_id = charset->id;
3913           last_offset = char_offset;
3914         }
3915
3916       /* Now we know CHARSET and 1st position code C1 of a character.
3917          Produce a decoded character while getting 2nd and 3rd
3918          position codes C2, C3 if necessary.  */
3919       if (CHARSET_DIMENSION (charset) > 1)
3920         {
3921           ONE_MORE_BYTE (c2);
3922           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3923               || ((c1 & 0x80) != (c2 & 0x80)))
3924             /* C2 is not in a valid range.  */
3925             goto invalid_code;
3926           if (CHARSET_DIMENSION (charset) == 2)
3927             c1 = (c1 << 8) | c2;
3928           else
3929             {
3930               ONE_MORE_BYTE (c3);
3931               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3932                   || ((c1 & 0x80) != (c3 & 0x80)))
3933                 /* C3 is not in a valid range.  */
3934                 goto invalid_code;
3935               c1 = (c1 << 16) | (c2 << 8) | c2;
3936             }
3937         }
3938       c1 &= 0x7F7F7F;
3939       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3940       if (c < 0)
3941         {
3942           MAYBE_FINISH_COMPOSITION ();
3943           for (; src_base < src; src_base++, char_offset++)
3944             {
3945               if (ASCII_CHAR_P (*src_base))
3946                 *charbuf++ = *src_base;
3947               else
3948                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3949             }
3950         }
3951       else if (cmp_status->state == COMPOSING_NO)
3952         {
3953           *charbuf++ = c;
3954           char_offset++;
3955         }
3956       else if ((cmp_status->state == COMPOSING_CHAR
3957                 ? cmp_status->nchars
3958                 : cmp_status->ncomps)
3959                >= MAX_COMPOSITION_COMPONENTS)
3960         {
3961           /* Too long composition.  */
3962           MAYBE_FINISH_COMPOSITION ();
3963           *charbuf++ = c;
3964           char_offset++;
3965         }
3966       else
3967         STORE_COMPOSITION_CHAR (c);
3968       continue;
3969
3970     invalid_code:
3971       MAYBE_FINISH_COMPOSITION ();
3972       src = src_base;
3973       consumed_chars = consumed_chars_base;
3974       ONE_MORE_BYTE (c);
3975       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3976       char_offset++;
      /* Reset the invocation and designation status to the safest
         one; i.e. designate ASCII to the graphic register 0, and
         invoke that register to the graphic plane 0.  This typically
         helps the case that a designation sequence for ASCII "ESC (
         B" is somehow broken (e.g. broken by a newline).  */
3982       CODING_ISO_INVOCATION (coding, 0) = 0;
3983       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3984       charset_id_0 = charset_ascii;
3985       continue;
3986
3987     break_loop:
3988       break;
3989     }
3990
3991  no_more_source:
3992   if (cmp_status->state != COMPOSING_NO)
3993     {
3994       if (coding->mode & CODING_MODE_LAST_BLOCK)
3995         MAYBE_FINISH_COMPOSITION ();
3996       else
3997         {
3998           charbuf -= cmp_status->length;
3999           for (i = 0; i < cmp_status->length; i++)
4000             cmp_status->carryover[i] = charbuf[i];
4001         }
4002     }
4003   else if (last_id != charset_ascii)
4004     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4005   coding->consumed_char += consumed_chars_base;
4006   coding->consumed = src_base - coding->source;
4007   coding->charbuf_used = charbuf - coding->charbuf;
4008 }
4009
4010
4011 /* ISO2022 encoding stuff.  */
4012
4013 /*
4014    It is not enough to say just "ISO2022" on encoding, we have to
4015    specify more details.  In Emacs, each coding system of ISO2022
4016    variant has the following specifications:
4017         1. Initial designation to G0 thru G3.
4018         2. Allows short-form designation?
4019         3. ASCII should be designated to G0 before control characters?
4020         4. ASCII should be designated to G0 at end of line?
4021         5. 7-bit environment or 8-bit environment?
4022         6. Use locking-shift?
4023         7. Use Single-shift?
4024    And the following two are only for Japanese:
4025         8. Use ASCII in place of JIS0201-1976-Roman?
4026         9. Use JISX0208-1983 in place of JISX0208-1978?
4027    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4028    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4029    details.
4030 */
4031
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG (advancing DST), then record the new designation in
   CODING.  The sequence has this shape:

     [ESC '&' <'@' + revision>]   only if CODING has
                                  CODING_ISO_FLAG_REVISION set and
                                  the revision of CHARSET is >= 0
     ESC ['$'] [<intermediate>] <final-char>

   '$' is emitted for a charset whose dimension is not 1.
   <intermediate> comes from "()*+" (94-char sets) or ",-./" (96-char
   sets), indexed by REG.  For a multi-dimension 94-char set the
   intermediate byte is left out (short form) when REG is 0,
   <final-char> is '@', 'A' or 'B', and CODING does not have
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *desig_inter_94 = "()*+";                                \
    const char *desig_inter_96 = ",-./";                                \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_rev = -1;                                                 \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      desig_rev = CHARSET_ISO_REVISION (charset);                       \
                                                                        \
    /* Optional revision announcement.  */                              \
    if (desig_rev >= 0)                                                 \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_rev);                                \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (desig_inter_96[reg]);                    \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || desig_final < '@' || desig_final > 'B')             \
          EMIT_ONE_ASCII_BYTE (desig_inter_94[reg]);                    \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int desig_inter = (CHARSET_ISO_CHARS_96 (charset)               \
                           ? desig_inter_96[reg]                        \
                           : desig_inter_94[reg]);                      \
        EMIT_ONE_ASCII_BYTE (desig_inter);                              \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4079
4080
4081 /* The following two macros produce codes (control character or escape
4082    sequence) for ISO2022 single-shift functions (single-shift-2 and
4083    single-shift-3).  */
4084
/* Emit single-shift-2: the byte ISO_CODE_SS2, or ESC 'N' when
   CODING_ISO_FLAG_SEVEN_BITS is set in the flags of CODING.  Then
   mark CODING as being in the middle of a single shift.  Refers to
   the variable CODING of the expansion site.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4093
4094
/* Emit single-shift-3: the byte ISO_CODE_SS3, or ESC 'O' when
   CODING_ISO_FLAG_SEVEN_BITS is set in the flags of CODING.  Then
   mark CODING as being in the middle of a single shift.  Refers to
   the variable CODING of the expansion site.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4103
4104
4105 /* The following four macros produce codes (control character or
4106    escape sequence) for ISO2022 locking-shift functions (shift-in,
4107    shift-out, locking-shift-2, and locking-shift-3).  */
4108
/* Shift-in: record that graphic register 0 is invoked to graphic
   plane 0 of CODING, and emit the byte ISO_CODE_SI.  Refers to the
   variable CODING of the expansion site.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4114
4115
/* Shift-out: record that graphic register 1 is invoked to graphic
   plane 0 of CODING, and emit the byte ISO_CODE_SO.  Refers to the
   variable CODING of the expansion site.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4121
4122
/* Locking-shift-2: record that graphic register 2 is invoked to
   graphic plane 0 of CODING, and emit ESC 'n'.  Refers to the
   variable CODING of the expansion site.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4128
4129
/* Locking-shift-3 (LS3): emit ESC 'o' and record that graphic
   register 3 is invoked to graphic plane 0 of CODING.  The final
   byte must be 'o'; ESC 'n' is locking-shift-2 (see
   ENCODE_LOCKING_SHIFT_2), and emitting it here would make the
   output disagree with the invocation recorded below.  Refers to
   the variable CODING of the expansion site.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4135
4136
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset whose id is charset_jisx0201_roman.  C1 may be an
   arbitrary expression; it is parenthesized at every use, as in
   ENCODE_ISO_CHARACTER_DIMENSION2.

   The macro refers to the variables CODING, DST and PRODUCED_CHARS
   of the expansion site.  Its body is a loop: if a single shift is
   pending, or CHARSET is invoked to graphic plane 0 or 1, one byte
   is emitted and the loop ends; otherwise
   encode_invocation_designation is called and the tests are
   repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The shift code was already emitted; the high bit of the      \
           character byte depends only on 7-bit vs. 8-bit mode.  */     \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: high bit clear.  */              \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: high bit set.  */                \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4179
4180
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is the one whose id is
   charset_jisx0208, it is replaced by the charset whose id is
   charset_jisx0208_1978.  The variables CODING, DST and
   PRODUCED_CHARS of the expansion site are used.

   The body loops until one of the emitting cases applies.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    /* Case 1: a single shift is pending.  */                           \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 2: CHARSET is invoked to graphic plane 0.  */               \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 3: CHARSET is invoked to graphic plane 1.  */               \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Otherwise CHARSET is not invoked to any graphic plane yet:       \
       invoke it (designating it to a graphic register first if         \
       needed), then go around again to produce the character.  */      \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4223
4224
/* Encode character C as a member of CHARSET.  CODING_ENCODE_CHAR is
   handed C and CHARSET and a local to receive the code (presumably
   the code point of C in CHARSET -- confirm against its definition).
   A charset of dimension 1 passes the code on as a single
   position-code; any other charset passes the code shifted right by
   8 bits and the low 8 bits as the two position-codes.  Uses the
   variables CODING, DST and DST_END of the expansion site.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned iso_code;                                                     \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), iso_code);   \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), iso_code >> 8,           \
                                       iso_code & 0xFF);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);               \
  } while (0)
4235
4236
4237 /* Produce designation and invocation codes at a place pointed by DST
4238    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4239    Return new DST.  */
4240
4241 static unsigned char *
4242 encode_invocation_designation (struct charset *charset,
4243                                struct coding_system *coding,
4244                                unsigned char *dst, ptrdiff_t *p_nchars)
4245 {
4246   bool multibytep = coding->dst_multibyte;
4247   ptrdiff_t produced_chars = *p_nchars;
4248   int reg;                      /* graphic register number */
4249   int id = CHARSET_ID (charset);
4250
4251   /* At first, check designations.  */
4252   for (reg = 0; reg < 4; reg++)
4253     if (id == CODING_ISO_DESIGNATION (coding, reg))
4254       break;
4255
4256   if (reg >= 4)
4257     {
4258       /* CHARSET is not yet designated to any graphic registers.  */
4259       /* At first check the requested designation.  */
4260       reg = CODING_ISO_REQUEST (coding, id);
4261       if (reg < 0)
4262         /* Since CHARSET requests no special designation, designate it
4263            to graphic register 0.  */
4264         reg = 0;
4265
4266       ENCODE_DESIGNATION (charset, reg, coding);
4267     }
4268
4269   if (CODING_ISO_INVOCATION (coding, 0) != reg
4270       && CODING_ISO_INVOCATION (coding, 1) != reg)
4271     {
4272       /* Since the graphic register REG is not invoked to any graphic
4273          planes, invoke it to graphic plane 0.  */
4274       switch (reg)
4275         {
4276         case 0:                 /* graphic register 0 */
4277           ENCODE_SHIFT_IN;
4278           break;
4279
4280         case 1:                 /* graphic register 1 */
4281           ENCODE_SHIFT_OUT;
4282           break;
4283
4284         case 2:                 /* graphic register 2 */
4285           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4286             ENCODE_SINGLE_SHIFT_2;
4287           else
4288             ENCODE_LOCKING_SHIFT_2;
4289           break;
4290
4291         case 3:                 /* graphic register 3 */
4292           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4293             ENCODE_SINGLE_SHIFT_3;
4294           else
4295             ENCODE_LOCKING_SHIFT_3;
4296           break;
4297
4298         default:
4299           break;
4300         }
4301     }
4302
4303   *p_nchars = produced_chars;
4304   return dst;
4305 }
4306
4307
/* Produce the codes that bring the graphic planes and registers of
   CODING back to their initial state: a shift-in if graphic plane 0
   does not have register 0 invoked, then, for each of the four
   graphic registers that has an initial charset (a non-negative
   CODING_ISO_INITIAL) different from its current designation, a
   designation of that initial charset.  Refers to the variable
   CODING of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        int reset_id = CODING_ISO_INITIAL (coding, reset_reg);          \
        struct charset *reset_charset;                                  \
                                                                        \
        if (reset_id < 0                                                \
            || CODING_ISO_DESIGNATION (coding, reset_reg) == reset_id)  \
          continue;                                                     \
        reset_charset = CHARSET_FROM_ID (reset_id);                     \
        ENCODE_DESIGNATION (reset_charset, reset_reg, coding);          \
      }                                                                 \
  } while (0)
4326
4327
4328 /* Produce designation sequences of charsets in the line started from
4329    CHARBUF to a place pointed by DST, and return the number of
4330    produced bytes.  DST should not directly point a buffer text area
4331    which may be relocated by char_charset call.
4332
4333    If the current block ends before any end-of-line, we may fail to
4334    find all the necessary designations.  */
4335
4336 static ptrdiff_t
4337 encode_designation_at_bol (struct coding_system *coding,
4338                            int *charbuf, int *charbuf_end,
4339                            unsigned char *dst)
4340 {
4341   unsigned char *orig = dst;
4342   struct charset *charset;
4343   /* Table of charsets to be designated to each graphic register.  */
4344   int r[4];
4345   int c, found = 0, reg;
4346   ptrdiff_t produced_chars = 0;
4347   bool multibytep = coding->dst_multibyte;
4348   Lisp_Object attrs;
4349   Lisp_Object charset_list;
4350
4351   attrs = CODING_ID_ATTRS (coding->id);
4352   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4353   if (EQ (charset_list, Qiso_2022))
4354     charset_list = Viso_2022_charset_list;
4355
4356   for (reg = 0; reg < 4; reg++)
4357     r[reg] = -1;
4358
4359   while (charbuf < charbuf_end && found < 4)
4360     {
4361       int id;
4362
4363       c = *charbuf++;
4364       if (c == '\n')
4365         break;
4366       charset = char_charset (c, charset_list, NULL);
4367       id = CHARSET_ID (charset);
4368       reg = CODING_ISO_REQUEST (coding, id);
4369       if (reg >= 0 && r[reg] < 0)
4370         {
4371           found++;
4372           r[reg] = id;
4373         }
4374     }
4375
4376   if (found)
4377     {
4378       for (reg = 0; reg < 4; reg++)
4379         if (r[reg] >= 0
4380             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4381           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4382     }
4383
4384   return dst - orig;
4385 }
4386
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the elements of CODING->charbuf (CODING->charbuf_used of
   them) as an ISO 2022 byte sequence written into CODING->destination
   starting at offset CODING->produced.  On exit CODING->produced and
   CODING->produced_char are updated, the conversion result is
   recorded as CODING_RESULT_SUCCESS, and the "designate at beginning
   of line" state is saved with CODING_ISO_BOL for the next call.
   Always returns 0.  */

static bool
encode_coding_iso_2022 (struct coding_system *coding)
{
  /* Not referenced directly below; presumably read by the EMIT_*
     macros -- TODO confirm.  */
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Amount of room requested through ASSURE_DESTINATION before each
     character (and before the final reset) is encoded.  */
  int safe_room = 16;
  /* True if designation sequences must be produced before the next
     character, i.e. we are at the beginning of a line and the coding
     system designates at beginning of line.  */
  bool bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  ptrdiff_t produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  bool ascii_compatible;
  int c;
  /* Id of the charset requested by the latest charset annotation, or
     -1 if there is none (or it is not in CHARSET_LIST).  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* An EOL type that is a vector is treated like Qunix here.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  /* ASCII characters can be emitted as is only if the coding system
     is marked ASCII compatible and uses neither designation nor
     locking shift.  */
  ascii_compatible
    = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
       && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
                                          | CODING_ISO_FLAG_LOCKING_SHIFT)));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          /* We have to produce designation sequences if any now.  */
          /* The sequences are produced into a local buffer first and
             copied to DST afterwards, because DST may have to be
             adjusted (see below).
             NOTE(review): encode_designation_at_bol can emit up to
             four designation sequences; confirm that 16 bytes is
             always enough for them.  */
          unsigned char desig_buf[16];
          ptrdiff_t nbytes;
          ptrdiff_t offset;

          charset_map_loaded = 0;
          nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
                                              desig_buf);
          /* If a charset map was loaded meanwhile, ask
             coding_change_destination for the offset by which the
             destination moved and shift DST and DST_END by it.  */
          if (charset_map_loaded
              && (offset = coding_change_destination (coding)))
            {
              dst += offset;
              dst_end += offset;
            }
          memcpy (dst, desig_buf, nbytes);
          dst += nbytes;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += nbytes;
          bol_designation = 0;
          ASSURE_DESTINATION (safe_room);
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  */
          /* A negative element starts an annotation occupying -C
             elements of CHARBUF in total; the element after it tells
             the kind of annotation.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              /* Honor the requested charset only if it is a member of
                 CHARSET_LIST.  */
              preferred_charset_id = charbuf[2];
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              emacs_abort ();
            }
          /* Skip the rest of the annotation (its first element was
             already consumed above).  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* Control character.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              /* End of line: optionally reset the planes and
                 registers, optionally forget the current designations
                 without emitting anything, and arrange for
                 designation at the beginning of the next line.  */
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              bol_designation = ((CODING_ISO_FLAGS (coding)
                                  & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
                                 != 0);
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              /* ASCII may have to be designated or invoked first.  */
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw byte is emitted as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Choose the charset to encode C with: the preferred one
             if CODING_CHAR_CHARSET_P accepts it for C, otherwise
             whatever CODING_CHAR_CHARSET picks from CHARSET_LIST.  */
          if (preferred_charset_id >= 0)
            {
              bool result;

              charset = CHARSET_FROM_ID (preferred_charset_id);
              CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
              if (! result)
                CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                     NULL, charset);
            }
          else
            CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                 NULL, charset);
          if (!charset)
            {
              /* No charset was found for C.  Substitute
                 CODING_INHIBIT_CHARACTER_SUBSTITUTION (as an ASCII
                 character) in safe-encoding mode, the default
                 character of CODING otherwise.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  CODING_CHAR_CHARSET (coding, dst, dst_end, c,
                                       charset_list, NULL, charset);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the planes and registers if
     the coding system resets them at end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4560
4561 \f
4562 /*** 8. SJIS and BIG5 handlers ***/
4563
4564 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4565    quite widely.  So, for the moment, Emacs supports them in the bare
4566    C code.  But, in the future, they may be supported only by CCL.  */
4567
4568 /* SJIS is a coding system encoding three character sets: ASCII, right
4569    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4570    as is.  A character of charset katakana-jisx0201 is encoded by
4571    "position-code + 0x80".  A character of charset japanese-jisx0208
4572    is encoded in two bytes, but the two position-codes are divided and
4573    shifted so that they fit in the ranges below.
4574
4575    --- CODE RANGE of SJIS ---
4576    (character set)      (range)
4577    ASCII                0x00 .. 0x7F
4578    KATAKANA-JISX0201    0xA0 .. 0xDF
4579    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4580             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4581    -------------------------------
4582
4583 */
4584
4585 /* BIG5 is a coding system encoding two character sets: ASCII and
4586    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4587    character set and is encoded in two bytes.
4588
4589    --- CODE RANGE of BIG5 ---
4590    (character set)      (range)
4591    ASCII                0x00 .. 0x7F
4592    Big5 (1st byte)      0xA1 .. 0xFE
4593         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4594    --------------------------
4595
4596   */
4597
4598 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4599    Return true if a text is encoded in SJIS.

        CATEGORY_MASK_SJIS is always added to DETECT_INFO->checked.  The
        scan skips the first CODING->head_ascii bytes and then accepts
        bytes below 0x80, single bytes 0xA0..0xDF, and two-byte sequences
        whose lead byte is 0x81..0x9F or 0xE0..MAX (MAX is 0xFC when the
        charset list has more than three elements, otherwise 0xEF) and
        whose trail byte is 0x40..0xFC except 0x7F.  Any other byte adds
        CATEGORY_MASK_SJIS to DETECT_INFO->rejected and makes the
        function return false.

        When the source runs out, the category is rejected (and false is
        returned) only if a sequence was left incomplete and this is the
        last block; otherwise CATEGORY_MASK_SJIS is added to
        DETECT_INFO->found if any non-ASCII sequence was accepted, and
        true is returned.  */
4600
4601 static bool
4602 detect_coding_sjis (struct coding_system *coding,
4603                     struct coding_detection_info *detect_info)
4604 {
4605   const unsigned char *src = coding->source, *src_base;
4606   const unsigned char *src_end = coding->source + coding->src_bytes;
4607   bool multibytep = coding->src_multibyte;
4608   ptrdiff_t consumed_chars = 0;
4609   int found = 0;
4610   int c;
4611   Lisp_Object attrs, charset_list;
4612   int max_first_byte_of_2_byte_code;
4613
4614   CODING_GET_INFO (coding, attrs, charset_list);
       /* A charset list longer than three elements widens the range of
          accepted lead bytes from 0xE0..0xEF to 0xE0..0xFC.  */
4615   max_first_byte_of_2_byte_code
4616     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4617
4618   detect_info->checked |= CATEGORY_MASK_SJIS;
4619   /* A coding system of this category is always ASCII compatible.  */
4620   src += coding->head_ascii;
4621
4622   while (1)
4623     {
           /* SRC_BASE marks the start of the sequence being examined; it
              is compared with SRC at `no_more_source' to notice a
              truncated sequence.  */
4624       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it appears
              to jump to `no_more_source' when SRC reaches SRC_END --
              confirm.  */
4625       ONE_MORE_BYTE (c);
4626       if (c < 0x80)
4627         continue;
4628       if ((c >= 0x81 && c <= 0x9F)
4629           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4630         {
               /* Lead byte of a two-byte code: validate the trail byte.  */
4631           ONE_MORE_BYTE (c);
4632           if (c < 0x40 || c == 0x7F || c > 0xFC)
4633             break;
4634           found = CATEGORY_MASK_SJIS;
4635         }
4636       else if (c >= 0xA0 && c < 0xE0)
4637         found = CATEGORY_MASK_SJIS;
4638       else
4639         break;
4640     }
       /* Reached only via `break': an invalid byte was seen.  */
4641   detect_info->rejected |= CATEGORY_MASK_SJIS;
4642   return 0;
4643
4644  no_more_source:
       /* SRC_BASE < SRC means the last sequence was only partially read;
          that is an error only when no more input will follow.  */
4645   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4646     {
4647       detect_info->rejected |= CATEGORY_MASK_SJIS;
4648       return 0;
4649     }
4650   detect_info->found |= found;
4651   return 1;
4652 }
4653
4654 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4655    Return true if a text is encoded in BIG5.

        CATEGORY_MASK_BIG5 is always added to DETECT_INFO->checked.  The
        scan skips the first CODING->head_ascii bytes and then accepts
        bytes below 0x80 and two-byte sequences whose lead byte is 0xA1
        or above and whose trail byte is neither below 0x40 nor in
        0x7F..0xA0.  A byte in 0x80..0xA0 in lead position adds
        CATEGORY_MASK_BIG5 to DETECT_INFO->rejected and makes the
        function return false.

        When the source runs out, the category is rejected (and false is
        returned) only if a sequence was left incomplete and this is the
        last block; otherwise CATEGORY_MASK_BIG5 is added to
        DETECT_INFO->found if any two-byte sequence was accepted, and
        true is returned.  */
4656
4657 static bool
4658 detect_coding_big5 (struct coding_system *coding,
4659                     struct coding_detection_info *detect_info)
4660 {
4661   const unsigned char *src = coding->source, *src_base;
4662   const unsigned char *src_end = coding->source + coding->src_bytes;
4663   bool multibytep = coding->src_multibyte;
4664   ptrdiff_t consumed_chars = 0;
4665   int found = 0;
4666   int c;
4667
4668   detect_info->checked |= CATEGORY_MASK_BIG5;
4669   /* A coding system of this category is always ASCII compatible.  */
4670   src += coding->head_ascii;
4671
4672   while (1)
4673     {
           /* SRC_BASE marks the start of the sequence being examined; it
              is compared with SRC at `no_more_source' to notice a
              truncated sequence.  */
4674       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it appears
              to jump to `no_more_source' when SRC reaches SRC_END --
              confirm.  */
4675       ONE_MORE_BYTE (c);
4676       if (c < 0x80)
4677         continue;
4678       if (c >= 0xA1)
4679         {
4680           ONE_MORE_BYTE (c);
               /* NOTE(review): unlike the other failure paths, an invalid
                  trail byte returns false WITHOUT adding
                  CATEGORY_MASK_BIG5 to DETECT_INFO->rejected (the SJIS
                  detector uses `break' here).  Confirm whether that is
                  intended.  */
4681           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4682             return 0;
4683           found = CATEGORY_MASK_BIG5;
4684         }
4685       else
4686         break;
4687     }
       /* Reached only via `break': a byte in 0x80..0xA0 was seen where a
          lead byte was expected.  */
4688   detect_info->rejected |= CATEGORY_MASK_BIG5;
4689   return 0;
4690
4691  no_more_source:
       /* SRC_BASE < SRC means the last sequence was only partially read;
          that is an error only when no more input will follow.  */
4692   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4693     {
4694       detect_info->rejected |= CATEGORY_MASK_BIG5;
4695       return 0;
4696     }
4697   detect_info->found |= found;
4698   return 1;
4699 }
4700
4701 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SJIS bytes from CODING->source, starting at offset
        CODING->consumed, into CODING->charbuf, starting at index
        CODING->charbuf_used.  The charset list of CODING supplies, in
        order, the charsets used for bytes below 0x80, for single bytes
        0xA1..0xDF, for two-byte codes whose lead byte is at most 0xEF,
        and (optionally) for two-byte codes whose lead byte is
        0xF0..0xFC.  A byte that starts no valid sequence is stored as
        BYTE8_TO_CHAR of that byte.  Charset annotations are added with
        ADD_CHARSET_DATA for runs of non-ASCII charsets.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used reflect the last completely decoded
        character.  */
4702
4703 static void
4704 decode_coding_sjis (struct coding_system *coding)
4705 {
4706   const unsigned char *src = coding->source + coding->consumed;
4707   const unsigned char *src_end = coding->source + coding->src_bytes;
4708   const unsigned char *src_base;
4709   int *charbuf = coding->charbuf + coding->charbuf_used;
4710   /* We may produce one charset annotation in one loop and one more at
4711      the end.  */
4712   int *charbuf_end
4713     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4714   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4715   bool multibytep = coding->src_multibyte;
4716   struct charset *charset_roman, *charset_kanji, *charset_kana;
4717   struct charset *charset_kanji2;
4718   Lisp_Object attrs, charset_list, val;
4719   ptrdiff_t char_offset = coding->produced_char;
4720   ptrdiff_t last_offset = char_offset;
4721   int last_id = charset_ascii;
4722   bool eol_dos
4723     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte pre-read after a CR when EOL_DOS, or -1 if there is none.  */
4724   int byte_after_cr = -1;
4725
4726   CODING_GET_INFO (coding, attrs, charset_list);
4727
       /* The charset list is ordered: roman, kana, kanji, and an optional
          second kanji charset (NULL when the list has only three
          elements).  */
4728   val = charset_list;
4729   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4730   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4731   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4732   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4733
4734   while (1)
4735     {
4736       int c, c1;
4737       struct charset *charset;
4738
           /* Remember where this character starts so that an invalid or
              truncated sequence can be rewound to this point.  */
4739       src_base = src;
4740       consumed_chars_base = consumed_chars;
4741
4742       if (charbuf >= charbuf_end)
4743         {
               /* A byte pre-read after CR has not been decoded yet; back
                  up so that it is not reported as consumed.  */
4744           if (byte_after_cr >= 0)
4745             src_base--;
4746           break;
4747         }
4748
4749       if (byte_after_cr >= 0)
4750         c = byte_after_cr, byte_after_cr = -1;
4751       else
4752         ONE_MORE_BYTE (c);
           /* NOTE(review): a negative C presumably comes from
              ONE_MORE_BYTE for a multibyte source -- confirm against the
              macro definition.  */
4753       if (c < 0)
4754         goto invalid_code;
4755       if (c < 0x80)
4756         {
               /* Pre-read the byte that follows CR; it is decoded on the
                  next iteration.  NOTE(review): presumably this keeps a CR
                  at the end of the available source unconsumed until the
                  following byte arrives -- confirm.  */
4757           if (eol_dos && c == '\r')
4758             ONE_MORE_BYTE (byte_after_cr);
4759           charset = charset_roman;
4760         }
4761       else if (c == 0x80 || c == 0xA0)
4762         goto invalid_code;
4763       else if (c >= 0xA1 && c <= 0xDF)
4764         {
4765           /* SJIS -> JISX0201-Kana */
4766           c &= 0x7F;
4767           charset = charset_kana;
4768         }
4769       else if (c <= 0xEF)
4770         {
4771           /* SJIS -> JISX0208 */
4772           ONE_MORE_BYTE (c1);
4773           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4774             goto invalid_code;
4775           c = (c << 8) | c1;
4776           SJIS_TO_JIS (c);
4777           charset = charset_kanji;
4778         }
4779       else if (c <= 0xFC && charset_kanji2)
4780         {
4781           /* SJIS -> JISX0213-2 */
4782           ONE_MORE_BYTE (c1);
4783           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4784             goto invalid_code;
4785           c = (c << 8) | c1;
4786           SJIS_TO_JIS2 (c);
4787           charset = charset_kanji2;
4788         }
4789       else
4790         goto invalid_code;
           /* The charset changed to a different non-ASCII one: record the
              previous non-ASCII run (if any) and start counting a new
              run from the current character offset.  */
4791       if (charset->id != charset_ascii
4792           && last_id != charset->id)
4793         {
4794           if (last_id != charset_ascii)
4795             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4796           last_id = charset->id;
4797           last_offset = char_offset;
4798         }
4799       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4800       *charbuf++ = c;
4801       char_offset++;
4802       continue;
4803
4804     invalid_code:
           /* Rewind to the start of the offending sequence and pass only
              its first byte through as a raw byte.  */
4805       src = src_base;
4806       consumed_chars = consumed_chars_base;
4807       ONE_MORE_BYTE (c);
4808       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4809       char_offset++;
4810     }
4811
4812  no_more_source:
       /* Flush the annotation for a non-ASCII run that is still open, then
          report progress up to the start of the last incomplete
          character (SRC_BASE).  */
4813   if (last_id != charset_ascii)
4814     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4815   coding->consumed_char += consumed_chars_base;
4816   coding->consumed = src_base - coding->source;
4817   coding->charbuf_used = charbuf - coding->charbuf;
4818 }
4819
     /* Decode BIG5 bytes from CODING->source, starting at offset
        CODING->consumed, into CODING->charbuf, starting at index
        CODING->charbuf_used.  The first element of the charset list of
        CODING is used for bytes below 0x80 and the second for two-byte
        codes (lead byte 0xA1..0xFE, trail byte 0x40..0x7E or
        0xA1..0xFE).  A byte that starts no valid sequence is stored as
        BYTE8_TO_CHAR of that byte.  Charset annotations are added with
        ADD_CHARSET_DATA for runs of non-ASCII charsets.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used reflect the last completely decoded
        character.  */
4820 static void
4821 decode_coding_big5 (struct coding_system *coding)
4822 {
4823   const unsigned char *src = coding->source + coding->consumed;
4824   const unsigned char *src_end = coding->source + coding->src_bytes;
4825   const unsigned char *src_base;
4826   int *charbuf = coding->charbuf + coding->charbuf_used;
4827   /* We may produce one charset annotation in one loop and one more at
4828      the end.  */
4829   int *charbuf_end
4830     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4831   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4832   bool multibytep = coding->src_multibyte;
4833   struct charset *charset_roman, *charset_big5;
4834   Lisp_Object attrs, charset_list, val;
4835   ptrdiff_t char_offset = coding->produced_char;
4836   ptrdiff_t last_offset = char_offset;
4837   int last_id = charset_ascii;
4838   bool eol_dos
4839     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte pre-read after a CR when EOL_DOS, or -1 if there is none.  */
4840   int byte_after_cr = -1;
4841
4842   CODING_GET_INFO (coding, attrs, charset_list);
       /* The charset list is ordered: roman first, then Big5.  */
4843   val = charset_list;
4844   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4845   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4846
4847   while (1)
4848     {
4849       int c, c1;
4850       struct charset *charset;
4851
           /* Remember where this character starts so that an invalid or
              truncated sequence can be rewound to this point.  */
4852       src_base = src;
4853       consumed_chars_base = consumed_chars;
4854
4855       if (charbuf >= charbuf_end)
4856         {
               /* A byte pre-read after CR has not been decoded yet; back
                  up so that it is not reported as consumed.  */
4857           if (byte_after_cr >= 0)
4858             src_base--;
4859           break;
4860         }
4861
4862       if (byte_after_cr >= 0)
4863         c = byte_after_cr, byte_after_cr = -1;
4864       else
4865         ONE_MORE_BYTE (c);
4866
           /* NOTE(review): a negative C presumably comes from
              ONE_MORE_BYTE for a multibyte source -- confirm against the
              macro definition.  */
4867       if (c < 0)
4868         goto invalid_code;
4869       if (c < 0x80)
4870         {
               /* Pre-read the byte that follows CR; it is decoded on the
                  next iteration.  NOTE(review): presumably this keeps a CR
                  at the end of the available source unconsumed until the
                  following byte arrives -- confirm.  */
4871           if (eol_dos && c == '\r')
4872             ONE_MORE_BYTE (byte_after_cr);
4873           charset = charset_roman;
4874         }
4875       else
4876         {
4877           /* BIG5 -> Big5 */
4878           if (c < 0xA1 || c > 0xFE)
4879             goto invalid_code;
4880           ONE_MORE_BYTE (c1);
4881           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4882             goto invalid_code;
4883           c = c << 8 | c1;
4884           charset = charset_big5;
4885         }
           /* The charset changed to a different non-ASCII one: record the
              previous non-ASCII run (if any) and start counting a new
              run from the current character offset.  */
4886       if (charset->id != charset_ascii
4887           && last_id != charset->id)
4888         {
4889           if (last_id != charset_ascii)
4890             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4891           last_id = charset->id;
4892           last_offset = char_offset;
4893         }
4894       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4895       *charbuf++ = c;
4896       char_offset++;
4897       continue;
4898
4899     invalid_code:
           /* Rewind to the start of the offending sequence and pass only
              its first byte through as a raw byte.  */
4900       src = src_base;
4901       consumed_chars = consumed_chars_base;
4902       ONE_MORE_BYTE (c);
4903       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4904       char_offset++;
4905     }
4906
4907  no_more_source:
       /* Flush the annotation for a non-ASCII run that is still open, then
          report progress up to the start of the last incomplete
          character (SRC_BASE).  */
4908   if (last_id != charset_ascii)
4909     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4910   coding->consumed_char += consumed_chars_base;
4911   coding->consumed = src_base - coding->source;
4912   coding->charbuf_used = charbuf - coding->charbuf;
4913 }
4914
4915 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4916    This function can encode charsets `ascii', `katakana-jisx0201',
4917    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4918    are sure that all these charsets are registered as official charset
4919    (i.e. do not have extended leading-codes).  Characters of other
4920    charsets are produced without any encoding.

        Encode the characters in CODING->charbuf (CODING->charbuf_used of
        them) as SJIS bytes appended to CODING->destination at offset
        CODING->produced.  The second, third and optional fourth elements
        of the charset list are taken as the kana, kanji and second kanji
        charsets.  A character found in none of the listed charsets is
        replaced by CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding
        mode, and by CODING->default_char otherwise.

        After the loop the function records CODING_RESULT_SUCCESS,
        updates CODING->produced_char and CODING->produced, and returns
        0.  NOTE(review): ASSURE_DESTINATION is defined elsewhere and may
        leave the function early when the destination is full --
        confirm.  */
4921
4922 static bool
4923 encode_coding_sjis (struct coding_system *coding)
4924 {
4925   bool multibytep = coding->dst_multibyte;
4926   int *charbuf = coding->charbuf;
4927   int *charbuf_end = charbuf + coding->charbuf_used;
4928   unsigned char *dst = coding->destination + coding->produced;
4929   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4930   int safe_room = 4;
4931   ptrdiff_t produced_chars = 0;
4932   Lisp_Object attrs, charset_list, val;
4933   bool ascii_compatible;
4934   struct charset *charset_kanji, *charset_kana;
4935   struct charset *charset_kanji2;
4936   int c;
4937
4938   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list; the rest are kana, kanji,
          and an optional second kanji charset.  */
4939   val = XCDR (charset_list);
4940   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4941   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4942   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4943
4944   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4945
4946   while (charbuf < charbuf_end)
4947     {
4948       ASSURE_DESTINATION (safe_room);
4949       c = *charbuf++;
4950       /* Now encode the character C.  */
4951       if (ASCII_CHAR_P (c) && ascii_compatible)
4952         EMIT_ONE_ASCII_BYTE (c);
4953       else if (CHAR_BYTE8_P (c))
4954         {
               /* A raw 8-bit byte is written back unchanged.  */
4955           c = CHAR_TO_BYTE8 (c);
4956           EMIT_ONE_BYTE (c);
4957         }
4958       else
4959         {
4960           unsigned code;
4961           struct charset *charset;
               /* Look C up in the charsets of CHARSET_LIST; CHARSET is
                  null when none of them contains it.  */
4962           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4963                                &code, charset);
4964
4965           if (!charset)
4966             {
4967               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4968                 {
4969                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4970                   charset = CHARSET_FROM_ID (charset_ascii);
4971                 }
4972               else
4973                 {
4974                   c = coding->default_char;
4975                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4976                                        charset_list, &code, charset);
4977                 }
4978             }
4979           if (code == CHARSET_INVALID_CODE (charset))
4980             emacs_abort ();
4981           if (charset == charset_kanji)
4982             {
4983               int c1, c2;
4984               JIS_TO_SJIS (code);
4985               c1 = code >> 8, c2 = code & 0xFF;
4986               EMIT_TWO_BYTES (c1, c2);
4987             }
4988           else if (charset == charset_kana)
4989             EMIT_ONE_BYTE (code | 0x80);
4990           else if (charset_kanji2 && charset == charset_kanji2)
4991             {
4992               int c1, c2;
4993
                   /* Only codes whose high byte is one of the values
                      tested below are converted with JIS_TO_SJIS2; any
                      other code is emitted as a single 7-bit byte.  */
4994               c1 = code >> 8;
4995               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4996                   || c1 == 0x28
4997                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4998                 {
4999                   JIS_TO_SJIS2 (code);
5000                   c1 = code >> 8, c2 = code & 0xFF;
5001                   EMIT_TWO_BYTES (c1, c2);
5002                 }
5003               else
5004                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5005             }
5006           else
                 /* Any other charset: emit the low seven bits of CODE.  */
5007             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5008         }
5009     }
5010   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5011   coding->produced_char += produced_chars;
5012   coding->produced = dst - coding->destination;
5013   return 0;
5014 }
5015
     /* Encode the characters in CODING->charbuf (CODING->charbuf_used of
        them) as BIG5 bytes appended to CODING->destination at offset
        CODING->produced.  The second element of the charset list is
        taken as the Big5 charset.  A character found in none of the
        listed charsets is replaced by
        CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode, and
        by CODING->default_char otherwise.

        After the loop the function records CODING_RESULT_SUCCESS,
        updates CODING->produced_char and CODING->produced, and returns
        0.  NOTE(review): ASSURE_DESTINATION is defined elsewhere and may
        leave the function early when the destination is full --
        confirm.  */
5016 static bool
5017 encode_coding_big5 (struct coding_system *coding)
5018 {
5019   bool multibytep = coding->dst_multibyte;
5020   int *charbuf = coding->charbuf;
5021   int *charbuf_end = charbuf + coding->charbuf_used;
5022   unsigned char *dst = coding->destination + coding->produced;
5023   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
5024   int safe_room = 4;
5025   ptrdiff_t produced_chars = 0;
5026   Lisp_Object attrs, charset_list, val;
5027   bool ascii_compatible;
5028   struct charset *charset_big5;
5029   int c;
5030
5031   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list; the second one is Big5.  */
5032   val = XCDR (charset_list);
5033   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5034   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5035
5036   while (charbuf < charbuf_end)
5037     {
5038       ASSURE_DESTINATION (safe_room);
5039       c = *charbuf++;
5040       /* Now encode the character C.  */
5041       if (ASCII_CHAR_P (c) && ascii_compatible)
5042         EMIT_ONE_ASCII_BYTE (c);
5043       else if (CHAR_BYTE8_P (c))
5044         {
               /* A raw 8-bit byte is written back unchanged.  */
5045           c = CHAR_TO_BYTE8 (c);
5046           EMIT_ONE_BYTE (c);
5047         }
5048       else
5049         {
5050           unsigned code;
5051           struct charset *charset;
               /* Look C up in the charsets of CHARSET_LIST; CHARSET is
                  null when none of them contains it.  */
5052           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5053                                &code, charset);
5054
5055           if (! charset)
5056             {
5057               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5058                 {
5059                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5060                   charset = CHARSET_FROM_ID (charset_ascii);
5061                 }
5062               else
5063                 {
5064                   c = coding->default_char;
5065                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5066                                        charset_list, &code, charset);
5067                 }
5068             }
5069           if (code == CHARSET_INVALID_CODE (charset))
5070             emacs_abort ();
5071           if (charset == charset_big5)
5072             {
5073               int c1, c2;
5074
                   /* The Big5 code is emitted as is, high byte first.  */
5075               c1 = code >> 8, c2 = code & 0xFF;
5076               EMIT_TWO_BYTES (c1, c2);
5077             }
5078           else
                 /* Any other charset: emit the low seven bits of CODE.  */
5079             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5080         }
5081     }
5082   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5083   coding->produced_char += produced_chars;
5084   coding->produced = dst - coding->destination;
5085   return 0;
5086 }
5087
5088 \f
5089 /*** 9. CCL handlers ***/
5090
5091 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5092    Return true if a text is encoded in a coding system of which
5093    encoder/decoder are written in CCL program.

        CATEGORY_MASK_CCL is always added to DETECT_INFO->checked.  The
        bytes of the source of the CODING passed in are tested against
        the table of valid bytes of the coding system registered for
        coding_category_ccl.  A byte whose table entry is zero (or a
        negative value from ONE_MORE_BYTE) adds CATEGORY_MASK_CCL to
        DETECT_INFO->rejected and makes the function return false.  If
        the whole source passes, CATEGORY_MASK_CCL is added to
        DETECT_INFO->found when at least one byte had a table entry
        greater than 1, and true is returned.  */
5094
5095 static bool
5096 detect_coding_ccl (struct coding_system *coding,
5097                    struct coding_detection_info *detect_info)
5098 {
5099   const unsigned char *src = coding->source, *src_base;
5100   const unsigned char *src_end = coding->source + coding->src_bytes;
5101   bool multibytep = coding->src_multibyte;
5102   ptrdiff_t consumed_chars = 0;
5103   int found = 0;
5104   unsigned char *valids;
5105   ptrdiff_t head_ascii = coding->head_ascii;
5106   Lisp_Object attrs;
5107
5108   detect_info->checked |= CATEGORY_MASK_CCL;
5109
       /* From here on CODING refers to the coding system of the CCL
          category, not to the argument; the source pointers and
          HEAD_ASCII were taken from the argument above.  */
5110   coding = &coding_categories[coding_category_ccl];
5111   valids = CODING_CCL_VALIDS (coding);
5112   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII bytes may be skipped only for an
          ASCII-compatible coding system.  */
5113   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5114     src += head_ascii;
5115
5116   while (1)
5117     {
5118       int c;
5119
5120       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it appears
              to jump to `no_more_source' when SRC reaches SRC_END --
              confirm.  */
5121       ONE_MORE_BYTE (c);
5122       if (c < 0 || ! valids[c])
5123         break;
5124       if ((valids[c] > 1))
5125         found = CATEGORY_MASK_CCL;
5126     }
       /* Reached only via `break': an invalid byte was seen.  */
5127   detect_info->rejected |= CATEGORY_MASK_CCL;
5128   return 0;
5129
5130  no_more_source:
5131   detect_info->found |= found;
5132   return 1;
5133 }
5134
     /* Decode the source of CODING, starting at offset CODING->consumed,
        by running the CCL program CODING->spec.ccl->ccl on it, storing
        the result in CODING->charbuf starting at index
        CODING->charbuf_used.  The source is fed to ccl_driver in chunks
        of at most 1024 elements (characters for a multibyte source,
        bytes otherwise).  The final status of the CCL program is
        translated into a conversion result, and CODING->consumed_char,
        CODING->consumed and CODING->charbuf_used are updated.  */
5135 static void
5136 decode_coding_ccl (struct coding_system *coding)
5137 {
5138   const unsigned char *src = coding->source + coding->consumed;
5139   const unsigned char *src_end = coding->source + coding->src_bytes;
5140   int *charbuf = coding->charbuf + coding->charbuf_used;
5141   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5142   ptrdiff_t consumed_chars = 0;
5143   bool multibytep = coding->src_multibyte;
5144   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5145   int source_charbuf[1024];
       /* Byte offset from SRC of each element of SOURCE_CHARBUF, plus one
          extra slot for the offset just past the last element.  */
5146   int source_byteidx[1025];
5147   Lisp_Object attrs, charset_list;
5148
5149   CODING_GET_INFO (coding, attrs, charset_list);
5150
5151   while (1)
5152     {
5153       const unsigned char *p = src;
5154       ptrdiff_t offset;
5155       int i = 0;
5156
           /* Fill SOURCE_CHARBUF with the next chunk of input.  */
5157       if (multibytep)
5158         {
5159           while (i < 1024 && p < src_end)
5160             {
5161               source_byteidx[i] = p - src;
5162               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5163             }
5164           source_byteidx[i] = p - src;
5165         }
5166       else
5167         while (i < 1024 && p < src_end)
5168           source_charbuf[i++] = *p++;
5169
           /* Tell the CCL program when this chunk reaches the end of the
              last block of input.  */
5170       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5171         ccl->last_block = true;
5172       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5173       charset_map_loaded = 0;
5174       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5175                   charset_list);
           /* If a charset map was loaded and the source moved, shift the
              source pointers by the reported offset.  */
5176       if (charset_map_loaded
5177           && (offset = coding_change_source (coding)))
5178         {
5179           p += offset;
5180           src += offset;
5181           src_end += offset;
5182         }
5183       charbuf += ccl->produced;
           /* ccl->consumed counts elements of SOURCE_CHARBUF; for a
              multibyte source convert it back to a byte offset.  */
5184       if (multibytep)
5185         src += source_byteidx[ccl->consumed];
5186       else
5187         src += ccl->consumed;
5188       consumed_chars += ccl->consumed;
           /* Continue only while the program merely ran out of source and
              more source remains.  */
5189       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5190         break;
5191     }
5192
5193   switch (ccl->status)
5194     {
5195     case CCL_STAT_SUSPEND_BY_SRC:
5196       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5197       break;
5198     case CCL_STAT_SUSPEND_BY_DST:
5199       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5200       break;
5201     case CCL_STAT_QUIT:
5202     case CCL_STAT_INVALID_CMD:
5203       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5204       break;
5205     default:
5206       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5207       break;
5208     }
5209   coding->consumed_char += consumed_chars;
5210   coding->consumed = src - coding->source;
5211   coding->charbuf_used = charbuf - coding->charbuf;
5212 }
5213
     /* Encode the characters in CODING->charbuf (CODING->charbuf_used of
        them) by running the CCL program CODING->spec.ccl->ccl on them,
        appending the low 8 bits of each produced element to
        CODING->destination at offset CODING->produced.  ccl_driver is
        called repeatedly with a 1024-element output buffer until all of
        CHARBUF is consumed or the program quits or hits an invalid
        command.  The final status of the CCL program is translated into
        a conversion result, CODING->produced_char and CODING->produced
        are updated, and 0 is returned.  NOTE(review): ASSURE_DESTINATION
        is defined elsewhere and may leave the function early when the
        destination is full -- confirm.  */
5214 static bool
5215 encode_coding_ccl (struct coding_system *coding)
5216 {
5217   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5218   bool multibytep = coding->dst_multibyte;
5219   int *charbuf = coding->charbuf;
5220   int *charbuf_end = charbuf + coding->charbuf_used;
5221   unsigned char *dst = coding->destination + coding->produced;
5222   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5223   int destination_charbuf[1024];
5224   ptrdiff_t produced_chars = 0;
5225   int i;
5226   Lisp_Object attrs, charset_list;
5227
5228   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters have been
          consumed and this is the last block.  */
5229   if (coding->consumed_char == coding->src_chars
5230       && coding->mode & CODING_MODE_LAST_BLOCK)
5231     ccl->last_block = true;
5232
5233   do
5234     {
5235       ptrdiff_t offset;
5236
5237       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5238       charset_map_loaded = 0;
5239       ccl_driver (ccl, charbuf, destination_charbuf,
5240                   charbuf_end - charbuf, 1024, charset_list);
           /* If a charset map was loaded and the destination moved, shift
              DST by the reported offset.  */
5241       if (charset_map_loaded
5242           && (offset = coding_change_destination (coding)))
5243         dst += offset;
5244       if (multibytep)
5245         {
               /* NOTE(review): twice the produced count is requested,
                  presumably because EMIT_ONE_BYTE can write up to two
                  bytes per element into a multibyte destination --
                  confirm against the macro.  */
5246           ASSURE_DESTINATION (ccl->produced * 2);
5247           for (i = 0; i < ccl->produced; i++)
5248             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5249         }
5250       else
5251         {
5252           ASSURE_DESTINATION (ccl->produced);
5253           for (i = 0; i < ccl->produced; i++)
5254             *dst++ = destination_charbuf[i] & 0xFF;
5255           produced_chars += ccl->produced;
5256         }
5257       charbuf += ccl->consumed;
5258       if (ccl->status == CCL_STAT_QUIT
5259           || ccl->status == CCL_STAT_INVALID_CMD)
5260         break;
5261     }
5262   while (charbuf < charbuf_end);
5263
5264   switch (ccl->status)
5265     {
5266     case CCL_STAT_SUSPEND_BY_SRC:
5267       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5268       break;
5269     case CCL_STAT_SUSPEND_BY_DST:
5270       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5271       break;
5272     case CCL_STAT_QUIT:
5273     case CCL_STAT_INVALID_CMD:
5274       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5275       break;
5276     default:
5277       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5278       break;
5279     }
5280
5281   coding->produced_char += produced_chars;
5282   coding->produced = dst - coding->destination;
5283   return 0;
5284 }
5285
5286 \f
5287 /*** 10, 11. no-conversion handlers ***/
5288
5289 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Raw text needs no conversion: set CODING->chars_at_source and
        report the whole source as consumed.  When DOS-style EOL
        conversion is in effect and the source ends with '\r', that last
        byte is left unconsumed and CODING_RESULT_INSUFFICIENT_SRC is
        recorded; otherwise CODING_RESULT_SUCCESS is recorded.  An empty
        source is reported as a success without examining any byte.  */
5290
5291 static void
5292 decode_coding_raw_text (struct coding_system *coding)
5293 {
5294   bool eol_dos
5295     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5296
5297   coding->chars_at_source = 1;
5298   coding->consumed_char = coding->src_chars;
5299   coding->consumed = coding->src_bytes;
       /* Check for an empty source first: with no bytes at all,
          source[src_bytes - 1] would read before the start of the
          source.  */
5300   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5301     {
           /* Hold back the trailing CR.  NOTE(review): presumably so it
              can be paired with a '\n' that arrives with later input --
              confirm against the caller.  */
5302       coding->consumed_char--;
5303       coding->consumed--;
5304       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5305     }
5306   else
5307     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5308 }
5309
     /* Copy the elements of CODING->charbuf (CODING->charbuf_used of
        them) to CODING->destination at offset CODING->produced without
        charset conversion.  Four cases are handled, depending on whether
        the destination and the source are multibyte:

        - multibyte destination, multibyte source: ASCII characters and
          raw 8-bit bytes are emitted as single bytes; any other
          character is emitted byte by byte from its CHAR_STRING_ADVANCE
          representation;
        - multibyte destination, unibyte source: each element is emitted
          with EMIT_ONE_BYTE;
        - unibyte destination, multibyte source: ASCII characters and raw
          8-bit bytes are stored as single bytes; any other character is
          stored with CHAR_STRING_ADVANCE;
        - unibyte destination, unibyte source: elements are copied one
          per byte.

        For a unibyte destination the number of produced characters is
        the number of bytes written.  After copying, the function records
        CODING_RESULT_SUCCESS, updates CODING->produced_char and
        CODING->produced, and returns 0.  NOTE(review):
        ASSURE_DESTINATION is defined elsewhere and may leave the
        function early when the destination is full -- confirm.  */
5310 static bool
5311 encode_coding_raw_text (struct coding_system *coding)
5312 {
5313   bool multibytep = coding->dst_multibyte;
5314   int *charbuf = coding->charbuf;
5315   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5316   unsigned char *dst = coding->destination + coding->produced;
5317   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5318   ptrdiff_t produced_chars = 0;
5319   int c;
5320
5321   if (multibytep)
5322     {
5323       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5324
5325       if (coding->src_multibyte)
5326         while (charbuf < charbuf_end)
5327           {
5328             ASSURE_DESTINATION (safe_room);
5329             c = *charbuf++;
5330             if (ASCII_CHAR_P (c))
5331               EMIT_ONE_ASCII_BYTE (c);
5332             else if (CHAR_BYTE8_P (c))
5333               {
5334                 c = CHAR_TO_BYTE8 (c);
5335                 EMIT_ONE_BYTE (c);
5336               }
5337             else
5338               {
                     /* Expand C into STR, then emit each of its bytes
                        (P0 walks from STR up to the end pointer P1).  */
5339                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5340
5341                 CHAR_STRING_ADVANCE (c, p1);
5342                 do
5343                   {
5344                     EMIT_ONE_BYTE (*p0);
5345                     p0++;
5346                   }
5347                 while (p0 < p1);
5348               }
5349           }
5350       else
5351         while (charbuf < charbuf_end)
5352           {
5353             ASSURE_DESTINATION (safe_room);
5354             c = *charbuf++;
5355             EMIT_ONE_BYTE (c);
5356           }
5357     }
5358   else
5359     {
5360       if (coding->src_multibyte)
5361         {
5362           int safe_room = MAX_MULTIBYTE_LENGTH;
5363
5364           while (charbuf < charbuf_end)
5365             {
5366               ASSURE_DESTINATION (safe_room);
5367               c = *charbuf++;
5368               if (ASCII_CHAR_P (c))
5369                 *dst++ = c;
5370               else if (CHAR_BYTE8_P (c))
5371                 *dst++ = CHAR_TO_BYTE8 (c);
5372               else
5373                 CHAR_STRING_ADVANCE (c, dst);
5374             }
5375         }
5376       else
5377         {
               /* Room for the whole remaining buffer is requested at
                  once; the loop still stops at DST_END.  */
5378           ASSURE_DESTINATION (charbuf_end - charbuf);
5379           while (charbuf < charbuf_end && dst < dst_end)
5380             *dst++ = *charbuf++;
5381         }
           /* In a unibyte destination every byte written counts as one
              produced character.  */
5382       produced_chars = dst - (coding->destination + coding->produced);
5383     }
5384   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5385   coding->produced_char += produced_chars;
5386   coding->produced = dst - coding->destination;
5387   return 0;
5388 }
5389
5390 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5391    Return true if a text is encoded in a charset-based coding system.

        CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.
        The bytes of the source of the CODING passed in are tested
        against the `charset valids' vector of the coding system
        registered for coding_category_charset.  That vector is indexed
        by the first byte of a sequence: a nil entry means the byte is
        invalid, an integer entry is a single charset ID, and any other
        entry is treated as a list of charset IDs.  For a charset of
        dimension greater than 1, each following byte must lie in the
        corresponding range of the charset's code space.

        An invalid byte, or a source that ends in the middle of a
        multi-byte sequence, adds CATEGORY_MASK_CHARSET to
        DETECT_INFO->rejected and makes the function return false.  If
        the whole source passes, CATEGORY_MASK_CHARSET is added to
        DETECT_INFO->found when at least one byte >= 0x80 was accepted as
        a first byte, and true is returned.  */
5392
5393 static bool
5394 detect_coding_charset (struct coding_system *coding,
5395                        struct coding_detection_info *detect_info)
5396 {
5397   const unsigned char *src = coding->source, *src_base;
5398   const unsigned char *src_end = coding->source + coding->src_bytes;
5399   bool multibytep = coding->src_multibyte;
5400   ptrdiff_t consumed_chars = 0;
5401   Lisp_Object attrs, valids, name;
5402   int found = 0;
5403   ptrdiff_t head_ascii = coding->head_ascii;
5404   bool check_latin_extra = 0;
5405
5406   detect_info->checked |= CATEGORY_MASK_CHARSET;
5407
       /* From here on CODING refers to the coding system of the charset
          category, not to the argument; the source pointers and
          HEAD_ASCII were taken from the argument above.  */
5408   coding = &coding_categories[coding_category_charset];
5409   attrs = CODING_ID_ATTRS (coding->id);
5410   valids = AREF (attrs, coding_attr_charset_valids);
5411   name = CODING_ID_NAME (coding->id);
       /* For coding systems named iso-8859-* or iso-latin-*, bytes in
          0x80..0x9F are additionally checked against
          Vlatin_extra_code_table below.  */
5412   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5413                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5414       || strncmp (SSDATA (SYMBOL_NAME (name)),
5415                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5416     check_latin_extra = 1;
5417
       /* The leading ASCII bytes may be skipped only for an
          ASCII-compatible coding system.  */
5418   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5419     src += head_ascii;
5420
5421   while (1)
5422     {
5423       int c;
5424       Lisp_Object val;
5425       struct charset *charset;
5426       int dim, idx;
5427
5428       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it appears
              to jump to `no_more_source' when SRC reaches SRC_END --
              confirm.  */
5429       ONE_MORE_BYTE (c);
5430       if (c < 0)
5431         continue;
5432       val = AREF (valids, c);
5433       if (NILP (val))
5434         break;
5435       if (c >= 0x80)
5436         {
5437           if (c < 0xA0
5438               && check_latin_extra
5439               && (!VECTORP (Vlatin_extra_code_table)
5440                   || NILP (AREF (Vlatin_extra_code_table, c))))
5441             break;
5442           found = CATEGORY_MASK_CHARSET;
5443         }
5444       if (INTEGERP (val))
5445         {
               /* A single candidate charset: every remaining byte of the
                  sequence must be inside its code space.  */
5446           charset = CHARSET_FROM_ID (XFASTINT (val));
5447           dim = CHARSET_DIMENSION (charset);
5448           for (idx = 1; idx < dim; idx++)
5449             {
5450               if (src == src_end)
5451                 goto too_short;
5452               ONE_MORE_BYTE (c);
5453               if (c < charset->code_space[(dim - 1 - idx) * 4]
5454                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5455                 break;
5456             }
               /* The inner loop stopped early: a byte was out of range.  */
5457           if (idx < dim)
5458             break;
5459         }
5460       else
5461         {
               /* Several candidate charsets: try them in list order.  IDX
                  is not reset between candidates, so bytes already
                  accepted are not read again.  VAL is set to nil as soon
                  as one candidate accepts the whole sequence.  */
5462           idx = 1;
5463           for (; CONSP (val); val = XCDR (val))
5464             {
5465               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5466               dim = CHARSET_DIMENSION (charset);
5467               while (idx < dim)
5468                 {
5469                   if (src == src_end)
5470                     goto too_short;
5471                   ONE_MORE_BYTE (c);
5472                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5473                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5474                     break;
5475                   idx++;
5476                 }
5477               if (idx == dim)
5478                 {
5479                   val = Qnil;
5480                   break;
5481                 }
5482             }
               /* VAL is still a cons only when the candidate loop was left
                  without setting it to nil, i.e. no candidate accepted
                  the sequence; an exhausted list leaves VAL non-cons and
                  the scan continues.  */
5483           if (CONSP (val))
5484             break;
5485         }
5486     }
       /* Reached via `break' (invalid byte) or via `goto too_short'
          (source ended inside a multi-byte sequence).  */
5487  too_short:
5488   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5489   return 0;
5490
5491  no_more_source:
5492   detect_info->found |= found;
5493   return 1;
5494 }
5495
5496 static void
5497 decode_coding_charset (struct coding_system *coding)
5498 {
5499   const unsigned char *src = coding->source + coding->consumed;
5500   const unsigned char *src_end = coding->source + coding->src_bytes;
5501   const unsigned char *src_base;
5502   int *charbuf = coding->charbuf + coding->charbuf_used;
5503   /* We may produce one charset annotation in one loop and one more at
5504      the end.  */
5505   int *charbuf_end
5506     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5507   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5508   bool multibytep = coding->src_multibyte;
5509   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5510   Lisp_Object valids;
5511   ptrdiff_t char_offset = coding->produced_char;
5512   ptrdiff_t last_offset = char_offset;
5513   int last_id = charset_ascii;
5514   bool eol_dos
5515     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5516   int byte_after_cr = -1;
5517
5518   valids = AREF (attrs, coding_attr_charset_valids);
5519
5520   while (1)
5521     {
5522       int c;
5523       Lisp_Object val;
5524       struct charset *charset;
5525       int dim;
5526       int len = 1;
5527       unsigned code;
5528
5529       src_base = src;
5530       consumed_chars_base = consumed_chars;
5531
5532       if (charbuf >= charbuf_end)
5533         {
5534           if (byte_after_cr >= 0)
5535             src_base--;
5536           break;
5537         }
5538
5539       if (byte_after_cr >= 0)
5540         {
5541           c = byte_after_cr;
5542           byte_after_cr = -1;
5543         }
5544       else
5545         {
5546           ONE_MORE_BYTE (c);
5547           if (eol_dos && c == '\r')
5548             ONE_MORE_BYTE (byte_after_cr);
5549         }
5550       if (c < 0)
5551         goto invalid_code;
5552       code = c;
5553
5554       val = AREF (valids, c);
5555       if (! INTEGERP (val) && ! CONSP (val))
5556         goto invalid_code;
5557       if (INTEGERP (val))
5558         {
5559           charset = CHARSET_FROM_ID (XFASTINT (val));
5560           dim = CHARSET_DIMENSION (charset);
5561           while (len < dim)
5562             {
5563               ONE_MORE_BYTE (c);
5564               code = (code << 8) | c;
5565               len++;
5566             }
5567           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5568                               charset, code, c);
5569         }
5570       else
5571         {
5572           /* VAL is a list of charset IDs.  It is assured that the
5573              list is sorted by charset dimensions (smaller one
5574              comes first).  */
5575           while (CONSP (val))
5576             {
5577               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5578               dim = CHARSET_DIMENSION (charset);
5579               while (len < dim)
5580                 {
5581                   ONE_MORE_BYTE (c);
5582                   code = (code << 8) | c;
5583                   len++;
5584                 }
5585               CODING_DECODE_CHAR (coding, src, src_base,
5586                                   src_end, charset, code, c);
5587               if (c >= 0)
5588                 break;
5589               val = XCDR (val);
5590             }
5591         }
5592       if (c < 0)
5593         goto invalid_code;
5594       if (charset->id != charset_ascii
5595           && last_id != charset->id)
5596         {
5597           if (last_id != charset_ascii)
5598             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5599           last_id = charset->id;
5600           last_offset = char_offset;
5601         }
5602
5603       *charbuf++ = c;
5604       char_offset++;
5605       continue;
5606
5607     invalid_code:
5608       src = src_base;
5609       consumed_chars = consumed_chars_base;
5610       ONE_MORE_BYTE (c);
5611       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5612       char_offset++;
5613     }
5614
5615  no_more_source:
5616   if (last_id != charset_ascii)
5617     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5618   coding->consumed_char += consumed_chars_base;
5619   coding->consumed = src_base - coding->source;
5620   coding->charbuf_used = charbuf - coding->charbuf;
5621 }
5622
5623 static bool
5624 encode_coding_charset (struct coding_system *coding)
5625 {
5626   bool multibytep = coding->dst_multibyte;
5627   int *charbuf = coding->charbuf;
5628   int *charbuf_end = charbuf + coding->charbuf_used;
5629   unsigned char *dst = coding->destination + coding->produced;
5630   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5631   int safe_room = MAX_MULTIBYTE_LENGTH;
5632   ptrdiff_t produced_chars = 0;
5633   Lisp_Object attrs, charset_list;
5634   bool ascii_compatible;
5635   int c;
5636
5637   CODING_GET_INFO (coding, attrs, charset_list);
5638   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5639
5640   while (charbuf < charbuf_end)
5641     {
5642       struct charset *charset;
5643       unsigned code;
5644
5645       ASSURE_DESTINATION (safe_room);
5646       c = *charbuf++;
5647       if (ascii_compatible && ASCII_CHAR_P (c))
5648         EMIT_ONE_ASCII_BYTE (c);
5649       else if (CHAR_BYTE8_P (c))
5650         {
5651           c = CHAR_TO_BYTE8 (c);
5652           EMIT_ONE_BYTE (c);
5653         }
5654       else
5655         {
5656           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5657                                &code, charset);
5658
5659           if (charset)
5660             {
5661               if (CHARSET_DIMENSION (charset) == 1)
5662                 EMIT_ONE_BYTE (code);
5663               else if (CHARSET_DIMENSION (charset) == 2)
5664                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5665               else if (CHARSET_DIMENSION (charset) == 3)
5666                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5667               else
5668                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5669                                  (code >> 8) & 0xFF, code & 0xFF);
5670             }
5671           else
5672             {
5673               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5674                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5675               else
5676                 c = coding->default_char;
5677               EMIT_ONE_BYTE (c);
5678             }
5679         }
5680     }
5681
5682   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5683   coding->produced_char += produced_chars;
5684   coding->produced = dst - coding->destination;
5685   return 0;
5686 }
5687
5688 \f
5689 /*** 10. C library functions ***/
5690
5691 /* Setup coding context CODING from information about CODING_SYSTEM.
5692    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5693    CODING_SYSTEM is invalid, signal an error.  */
5694
5695 void
5696 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5697 {
5698   Lisp_Object attrs;
5699   Lisp_Object eol_type;
5700   Lisp_Object coding_type;
5701   Lisp_Object val;
5702
5703   if (NILP (coding_system))
5704     coding_system = Qundecided;
5705
5706   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5707
5708   attrs = CODING_ID_ATTRS (coding->id);
5709   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5710
5711   coding->mode = 0;
5712   if (VECTORP (eol_type))
5713     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5714                             | CODING_REQUIRE_DETECTION_MASK);
5715   else if (! EQ (eol_type, Qunix))
5716     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5717                             | CODING_REQUIRE_ENCODING_MASK);
5718   else
5719     coding->common_flags = 0;
5720   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5721     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5722   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5723     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5724   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5725     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5726
5727   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5728   coding->max_charset_id = SCHARS (val) - 1;
5729   coding->safe_charsets = SDATA (val);
5730   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5731   coding->carryover_bytes = 0;
5732   coding->raw_destination = 0;
5733
5734   coding_type = CODING_ATTR_TYPE (attrs);
5735   if (EQ (coding_type, Qundecided))
5736     {
5737       coding->detector = NULL;
5738       coding->decoder = decode_coding_raw_text;
5739       coding->encoder = encode_coding_raw_text;
5740       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5741       coding->spec.undecided.inhibit_nbd
5742         = (encode_inhibit_flag
5743            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5744       coding->spec.undecided.inhibit_ied
5745         = (encode_inhibit_flag
5746            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5747       coding->spec.undecided.prefer_utf_8
5748         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5749     }
5750   else if (EQ (coding_type, Qiso_2022))
5751     {
5752       int i;
5753       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5754
5755       /* Invoke graphic register 0 to plane 0.  */
5756       CODING_ISO_INVOCATION (coding, 0) = 0;
5757       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5758       CODING_ISO_INVOCATION (coding, 1)
5759         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5760       /* Setup the initial status of designation.  */
5761       for (i = 0; i < 4; i++)
5762         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5763       /* Not single shifting initially.  */
5764       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5765       /* Beginning of buffer should also be regarded as bol. */
5766       CODING_ISO_BOL (coding) = 1;
5767       coding->detector = detect_coding_iso_2022;
5768       coding->decoder = decode_coding_iso_2022;
5769       coding->encoder = encode_coding_iso_2022;
5770       if (flags & CODING_ISO_FLAG_SAFE)
5771         coding->mode |= CODING_MODE_SAFE_ENCODING;
5772       coding->common_flags
5773         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5774             | CODING_REQUIRE_FLUSHING_MASK);
5775       if (flags & CODING_ISO_FLAG_COMPOSITION)
5776         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5777       if (flags & CODING_ISO_FLAG_DESIGNATION)
5778         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5779       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5780         {
5781           setup_iso_safe_charsets (attrs);
5782           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5783           coding->max_charset_id = SCHARS (val) - 1;
5784           coding->safe_charsets = SDATA (val);
5785         }
5786       CODING_ISO_FLAGS (coding) = flags;
5787       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5788       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5789       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5790       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5791     }
5792   else if (EQ (coding_type, Qcharset))
5793     {
5794       coding->detector = detect_coding_charset;
5795       coding->decoder = decode_coding_charset;
5796       coding->encoder = encode_coding_charset;
5797       coding->common_flags
5798         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5799     }
5800   else if (EQ (coding_type, Qutf_8))
5801     {
5802       val = AREF (attrs, coding_attr_utf_bom);
5803       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5804                                    : EQ (val, Qt) ? utf_with_bom
5805                                    : utf_without_bom);
5806       coding->detector = detect_coding_utf_8;
5807       coding->decoder = decode_coding_utf_8;
5808       coding->encoder = encode_coding_utf_8;
5809       coding->common_flags
5810         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5811       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5812         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5813     }
5814   else if (EQ (coding_type, Qutf_16))
5815     {
5816       val = AREF (attrs, coding_attr_utf_bom);
5817       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5818                                     : EQ (val, Qt) ? utf_with_bom
5819                                     : utf_without_bom);
5820       val = AREF (attrs, coding_attr_utf_16_endian);
5821       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5822                                        : utf_16_little_endian);
5823       CODING_UTF_16_SURROGATE (coding) = 0;
5824       coding->detector = detect_coding_utf_16;
5825       coding->decoder = decode_coding_utf_16;
5826       coding->encoder = encode_coding_utf_16;
5827       coding->common_flags
5828         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5829       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5830         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5831     }
5832   else if (EQ (coding_type, Qccl))
5833     {
5834       coding->detector = detect_coding_ccl;
5835       coding->decoder = decode_coding_ccl;
5836       coding->encoder = encode_coding_ccl;
5837       coding->common_flags
5838         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5839             | CODING_REQUIRE_FLUSHING_MASK);
5840     }
5841   else if (EQ (coding_type, Qemacs_mule))
5842     {
5843       coding->detector = detect_coding_emacs_mule;
5844       coding->decoder = decode_coding_emacs_mule;
5845       coding->encoder = encode_coding_emacs_mule;
5846       coding->common_flags
5847         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5848       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5849           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5850         {
5851           Lisp_Object tail, safe_charsets;
5852           int max_charset_id = 0;
5853
5854           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5855                tail = XCDR (tail))
5856             if (max_charset_id < XFASTINT (XCAR (tail)))
5857               max_charset_id = XFASTINT (XCAR (tail));
5858           safe_charsets = make_uninit_string (max_charset_id + 1);
5859           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5860           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5861                tail = XCDR (tail))
5862             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5863           coding->max_charset_id = max_charset_id;
5864           coding->safe_charsets = SDATA (safe_charsets);
5865         }
5866       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5867       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5868     }
5869   else if (EQ (coding_type, Qshift_jis))
5870     {
5871       coding->detector = detect_coding_sjis;
5872       coding->decoder = decode_coding_sjis;
5873       coding->encoder = encode_coding_sjis;
5874       coding->common_flags
5875         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5876     }
5877   else if (EQ (coding_type, Qbig5))
5878     {
5879       coding->detector = detect_coding_big5;
5880       coding->decoder = decode_coding_big5;
5881       coding->encoder = encode_coding_big5;
5882       coding->common_flags
5883         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5884     }
5885   else                          /* EQ (coding_type, Qraw_text) */
5886     {
5887       coding->detector = NULL;
5888       coding->decoder = decode_coding_raw_text;
5889       coding->encoder = encode_coding_raw_text;
5890       if (! EQ (eol_type, Qunix))
5891         {
5892           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5893           if (! VECTORP (eol_type))
5894             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5895         }
5896
5897     }
5898
5899   return;
5900 }
5901
5902 /* Return a list of charsets supported by CODING.  */
5903
5904 Lisp_Object
5905 coding_charset_list (struct coding_system *coding)
5906 {
5907   Lisp_Object attrs, charset_list;
5908
5909   CODING_GET_INFO (coding, attrs, charset_list);
5910   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5911     {
5912       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5913
5914       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5915         charset_list = Viso_2022_charset_list;
5916     }
5917   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5918     {
5919       charset_list = Vemacs_mule_charset_list;
5920     }
5921   return charset_list;
5922 }
5923
5924
5925 /* Return a list of charsets supported by CODING-SYSTEM.  */
5926
5927 Lisp_Object
5928 coding_system_charset_list (Lisp_Object coding_system)
5929 {
5930   ptrdiff_t id;
5931   Lisp_Object attrs, charset_list;
5932
5933   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5934   attrs = CODING_ID_ATTRS (id);
5935
5936   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5937     {
5938       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5939
5940       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5941         charset_list = Viso_2022_charset_list;
5942       else
5943         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5944     }
5945   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5946     {
5947       charset_list = Vemacs_mule_charset_list;
5948     }
5949   else
5950     {
5951       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5952     }
5953   return charset_list;
5954 }
5955
5956
5957 /* Return raw-text or one of its subsidiaries that has the same
5958    eol_type as CODING-SYSTEM.  */
5959
5960 Lisp_Object
5961 raw_text_coding_system (Lisp_Object coding_system)
5962 {
5963   Lisp_Object spec, attrs;
5964   Lisp_Object eol_type, raw_text_eol_type;
5965
5966   if (NILP (coding_system))
5967     return Qraw_text;
5968   spec = CODING_SYSTEM_SPEC (coding_system);
5969   attrs = AREF (spec, 0);
5970
5971   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5972     return coding_system;
5973
5974   eol_type = AREF (spec, 2);
5975   if (VECTORP (eol_type))
5976     return Qraw_text;
5977   spec = CODING_SYSTEM_SPEC (Qraw_text);
5978   raw_text_eol_type = AREF (spec, 2);
5979   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5980           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5981           : AREF (raw_text_eol_type, 2));
5982 }
5983
5984 /* Return true if CODING corresponds to raw-text coding-system.  */
5985
5986 bool
5987 raw_text_coding_system_p (struct coding_system *coding)
5988 {
5989   return (coding->decoder == decode_coding_raw_text
5990           && coding->encoder == encode_coding_raw_text) ? true : false;
5991 }
5992
5993
5994 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5995    the subsidiary that has the same eol-spec as PARENT (if it is not
5996    nil and specifies end-of-line format) or the system's setting
5997    (system_eol_type).  */
5998
5999 Lisp_Object
6000 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6001 {
6002   Lisp_Object spec, eol_type;
6003
6004   if (NILP (coding_system))
6005     coding_system = Qraw_text;
6006   else
6007     CHECK_CODING_SYSTEM (coding_system);
6008   spec = CODING_SYSTEM_SPEC (coding_system);
6009   eol_type = AREF (spec, 2);
6010   if (VECTORP (eol_type))
6011     {
6012       Lisp_Object parent_eol_type;
6013
6014       if (! NILP (parent))
6015         {
6016           Lisp_Object parent_spec;
6017
6018           CHECK_CODING_SYSTEM (parent);
6019           parent_spec = CODING_SYSTEM_SPEC (parent);
6020           parent_eol_type = AREF (parent_spec, 2);
6021           if (VECTORP (parent_eol_type))
6022             parent_eol_type = system_eol_type;
6023         }
6024       else
6025         parent_eol_type = system_eol_type;
6026       if (EQ (parent_eol_type, Qunix))
6027         coding_system = AREF (eol_type, 0);
6028       else if (EQ (parent_eol_type, Qdos))
6029         coding_system = AREF (eol_type, 1);
6030       else if (EQ (parent_eol_type, Qmac))
6031         coding_system = AREF (eol_type, 2);
6032     }
6033   return coding_system;
6034 }
6035
6036
6037 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6038    decided for writing to a process.  If not, complement them, and
6039    return a new coding system.  */
6040
6041 Lisp_Object
6042 complement_process_encoding_system (Lisp_Object coding_system)
6043 {
6044   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6045   Lisp_Object spec, attrs;
6046   int i;
6047
6048   for (i = 0; i < 3; i++)
6049     {
6050       if (i == 1)
6051         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6052       else if (i == 2)
6053         coding_system = preferred_coding_system ();
6054       spec = CODING_SYSTEM_SPEC (coding_system);
6055       if (NILP (spec))
6056         continue;
6057       attrs = AREF (spec, 0);
6058       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6059         coding_base = CODING_ATTR_BASE_NAME (attrs);
6060       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6061         eol_base = coding_system;
6062       if (! NILP (coding_base) && ! NILP (eol_base))
6063         break;
6064     }
6065
6066   if (i > 0)
6067     /* The original CODING_SYSTEM didn't specify text-conversion or
6068        eol-conversion.  Be sure that we return a fully complemented
6069        coding system.  */
6070     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6071   return coding_system;
6072 }
6073
6074
6075 /* Emacs has a mechanism to automatically detect a coding system if it
6076    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6077    it's impossible to distinguish some coding systems accurately
6078    because they use the same range of codes.  So, at first, coding
6079    systems are grouped into the following categories:
6080
6081    o coding-category-emacs-mule
6082
6083         The category for a coding system which has the same code range
6084         as Emacs' internal format.  Assigned the coding-system (Lisp
6085         symbol) `emacs-mule' by default.
6086
6087    o coding-category-sjis
6088
6089         The category for a coding system which has the same code range
6090         as SJIS.  Assigned the coding-system (Lisp
6091         symbol) `japanese-shift-jis' by default.
6092
6093    o coding-category-iso-7
6094
6095         The category for a coding system which has the same code range
6096         as ISO2022 of 7-bit environment.  This doesn't use any locking
6097         shift and single shift functions.  This can encode/decode all
6098         charsets.  Assigned the coding-system (Lisp symbol)
6099         `iso-2022-7bit' by default.
6100
6101    o coding-category-iso-7-tight
6102
6103         Same as coding-category-iso-7 except that this can
6104         encode/decode only the specified charsets.
6105
6106    o coding-category-iso-8-1
6107
6108         The category for a coding system which has the same code range
6109         as ISO2022 of 8-bit environment and graphic plane 1 used only
6110         for DIMENSION1 charset.  This doesn't use any locking shift
6111         and single shift functions.  Assigned the coding-system (Lisp
6112         symbol) `iso-latin-1' by default.
6113
6114    o coding-category-iso-8-2
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 8-bit environment and graphic plane 1 used only
6118         for DIMENSION2 charset.  This doesn't use any locking shift
6119         and single shift functions.  Assigned the coding-system (Lisp
6120         symbol) `japanese-iso-8bit' by default.
6121
6122    o coding-category-iso-7-else
6123
6124         The category for a coding system which has the same code range
6125         as ISO2022 of 7-bit environment but uses locking shift or
6126         single shift functions.  Assigned the coding-system (Lisp
6127         symbol) `iso-2022-7bit-lock' by default.
6128
6129    o coding-category-iso-8-else
6130
6131         The category for a coding system which has the same code range
6132         as ISO2022 of 8-bit environment but uses locking shift or
6133         single shift functions.  Assigned the coding-system (Lisp
6134         symbol) `iso-2022-8bit-ss2' by default.
6135
6136    o coding-category-big5
6137
6138         The category for a coding system which has the same code range
6139         as BIG5.  Assigned the coding-system (Lisp symbol)
6140         `cn-big5' by default.
6141
6142    o coding-category-utf-8
6143
6144         The category for a coding system which has the same code range
6145         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6146         symbol) `utf-8' by default.
6147
6148    o coding-category-utf-16-be
6149
6150         The category for a coding system in which a text has an
6151         Unicode signature (cf. Unicode Standard) in the order of BIG
6152         endian at the head.  Assigned the coding-system (Lisp symbol)
6153         `utf-16-be' by default.
6154
6155    o coding-category-utf-16-le
6156
6157         The category for a coding system in which a text has an
6158         Unicode signature (cf. Unicode Standard) in the order of
6159         LITTLE endian at the head.  Assigned the coding-system (Lisp
6160         symbol) `utf-16-le' by default.
6161
6162    o coding-category-ccl
6163
6164         The category for a coding system of which encoder/decoder is
6165         written in CCL programs.  The default value is nil, i.e., no
6166         coding system is assigned.
6167
6168    o coding-category-binary
6169
6170         The category for a coding system not categorized in any of the
6171         above.  Assigned the coding-system (Lisp symbol)
6172         `no-conversion' by default.
6173
6174    Each of them is a Lisp symbol whose value is the actual
6175    `coding-system' (also a Lisp symbol) assigned to it by the user.
6176    What Emacs actually does is detect the category of a coding system
6177    and then use the `coding-system' assigned to that category.  If
6178    Emacs can't narrow the candidates down to a single category, it
6179    selects the one with the highest priority.  The priorities of the
6180    categories are also specified by the user, in the Lisp variable
6181    `coding-category-list'.
6181
6182 */
6183
6184 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6185                                            int eol_seen);
6186
6187
6188 /* Return the number of ASCII characters at the head of the source.
6189    By side effects, set coding->head_ascii and update
6190    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6191    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6192    reliable only when all the source bytes are ASCII.  */
6193
6194 static ptrdiff_t
6195 check_ascii (struct coding_system *coding)
6196 {
6197   const unsigned char *src, *end;
6198   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6199   int eol_seen = coding->eol_seen;
6200
6201   coding_set_source (coding);
6202   src = coding->source;
6203   end = src + coding->src_bytes;
6204
6205   if (inhibit_eol_conversion
6206       || SYMBOLP (eol_type))
6207     {
6208       /* We don't have to check EOL format.  */
6209       while (src < end && !( *src & 0x80))
6210         {
6211           if (*src++ == '\n')
6212             eol_seen |= EOL_SEEN_LF;
6213         }
6214     }
6215   else
6216     {
6217       end--;                /* We look ahead one byte for "CR LF".  */
6218       while (src < end)
6219         {
6220           int c = *src;
6221
6222           if (c & 0x80)
6223             break;
6224           src++;
6225           if (c == '\r')
6226             {
6227               if (*src == '\n')
6228                 {
6229                   eol_seen |= EOL_SEEN_CRLF;
6230                   src++;
6231                 }
6232               else
6233                 eol_seen |= EOL_SEEN_CR;
6234             }
6235           else if (c == '\n')
6236             eol_seen |= EOL_SEEN_LF;
6237         }
6238       if (src == end)
6239         {
6240           int c = *src;
6241
6242           /* All bytes but the last one C are ASCII.  */
6243           if (! (c & 0x80))
6244             {
6245               if (c == '\r')
6246                 eol_seen |= EOL_SEEN_CR;
6247               else if (c  == '\n')
6248                 eol_seen |= EOL_SEEN_LF;
6249               src++;
6250             }
6251         }
6252     }
6253   coding->head_ascii = src - coding->source;
6254   coding->eol_seen = eol_seen;
6255   return (coding->head_ascii);
6256 }
6257
6258
6259 /* Return the number of characters at the source if all the bytes are
6260    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6261    effects, update coding->eol_seen.  The value of coding->eol_seen is
6262    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6263    the value is reliable only when all the source bytes are valid
6264    UTF-8.  */
6265
6266 static ptrdiff_t
6267 check_utf_8 (struct coding_system *coding)
6268 {
6269   const unsigned char *src, *end;
6270   int eol_seen;
6271   ptrdiff_t nchars = coding->head_ascii;
6272
6273   if (coding->head_ascii < 0)
6274     check_ascii (coding);
6275   else
6276     coding_set_source (coding);
6277   src = coding->source + coding->head_ascii;
6278   /* We look ahead one byte for CR LF.  */
6279   end = coding->source + coding->src_bytes - 1;
6280   eol_seen = coding->eol_seen;
6281   while (src < end)
6282     {
6283       int c = *src;
6284
6285       if (UTF_8_1_OCTET_P (*src))
6286         {
6287           src++;
6288           if (c < 0x20)
6289             {
6290               if (c == '\r')
6291                 {
6292                   if (*src == '\n')
6293                     {
6294                       eol_seen |= EOL_SEEN_CRLF;
6295                       src++;
6296                       nchars++;
6297                     }
6298                   else
6299                     eol_seen |= EOL_SEEN_CR;
6300                 }
6301               else if (c == '\n')
6302                 eol_seen |= EOL_SEEN_LF;
6303             }
6304         }
6305       else if (UTF_8_2_OCTET_LEADING_P (c))
6306         {
6307           if (c < 0xC2          /* overlong sequence */
6308               || src + 1 >= end
6309               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6310             return -1;
6311           src += 2;
6312         }
6313       else if (UTF_8_3_OCTET_LEADING_P (c))
6314         {
6315           if (src + 2 >= end
6316               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6317                     && UTF_8_EXTRA_OCTET_P (src[2])))
6318             return -1;
6319           c = (((c & 0xF) << 12)
6320                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6321           if (c < 0x800                       /* overlong sequence */
6322               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6323             return -1;
6324           src += 3;
6325         }
6326       else if (UTF_8_4_OCTET_LEADING_P (c))
6327         {
6328           if (src + 3 >= end
6329               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6330                     && UTF_8_EXTRA_OCTET_P (src[2])
6331                     && UTF_8_EXTRA_OCTET_P (src[3])))
6332             return -1;
6333           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6334                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6335           if (c < 0x10000       /* overlong sequence */
6336               || c >= 0x110000) /* non-Unicode character  */
6337             return -1;
6338           src += 4;
6339         }
6340       else
6341         return -1;
6342       nchars++;
6343     }
6344
6345   if (src == end)
6346     {
6347       if (! UTF_8_1_OCTET_P (*src))
6348         return -1;
6349       nchars++;
6350       if (*src == '\r')
6351         eol_seen |= EOL_SEEN_CR;
6352       else if (*src  == '\n')
6353         eol_seen |= EOL_SEEN_LF;
6354     }
6355   coding->eol_seen = eol_seen;
6356   return nchars;
6357 }
6358
6359
6360 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6361    SOURCE is encoded.  If CATEGORY is one of
6362    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6363    two-byte, else they are encoded by one-byte.
6364
6365    Return one of EOL_SEEN_XXX.  */
6366
6367 #define MAX_EOL_CHECK_COUNT 3
6368
6369 static int
6370 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6371             enum coding_category category)
6372 {
6373   const unsigned char *src = source, *src_end = src + src_bytes;
6374   unsigned char c;
6375   int total  = 0;
6376   int eol_seen = EOL_SEEN_NONE;
6377
6378   if ((1 << category) & CATEGORY_MASK_UTF_16)
6379     {
6380       bool msb = category == (coding_category_utf_16_le
6381                               | coding_category_utf_16_le_nosig);
6382       bool lsb = !msb;
6383
6384       while (src + 1 < src_end)
6385         {
6386           c = src[lsb];
6387           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6388             {
6389               int this_eol;
6390
6391               if (c == '\n')
6392                 this_eol = EOL_SEEN_LF;
6393               else if (src + 3 >= src_end
6394                        || src[msb + 2] != 0
6395                        || src[lsb + 2] != '\n')
6396                 this_eol = EOL_SEEN_CR;
6397               else
6398                 {
6399                   this_eol = EOL_SEEN_CRLF;
6400                   src += 2;
6401                 }
6402
6403               if (eol_seen == EOL_SEEN_NONE)
6404                 /* This is the first end-of-line.  */
6405                 eol_seen = this_eol;
6406               else if (eol_seen != this_eol)
6407                 {
6408                   /* The found type is different from what found before.
6409                      Allow for stray ^M characters in DOS EOL files.  */
6410                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6411                       || (eol_seen == EOL_SEEN_CRLF
6412                           && this_eol == EOL_SEEN_CR))
6413                     eol_seen = EOL_SEEN_CRLF;
6414                   else
6415                     {
6416                       eol_seen = EOL_SEEN_LF;
6417                       break;
6418                     }
6419                 }
6420               if (++total == MAX_EOL_CHECK_COUNT)
6421                 break;
6422             }
6423           src += 2;
6424         }
6425     }
6426   else
6427     while (src < src_end)
6428       {
6429         c = *src++;
6430         if (c == '\n' || c == '\r')
6431           {
6432             int this_eol;
6433
6434             if (c == '\n')
6435               this_eol = EOL_SEEN_LF;
6436             else if (src >= src_end || *src != '\n')
6437               this_eol = EOL_SEEN_CR;
6438             else
6439               this_eol = EOL_SEEN_CRLF, src++;
6440
6441             if (eol_seen == EOL_SEEN_NONE)
6442               /* This is the first end-of-line.  */
6443               eol_seen = this_eol;
6444             else if (eol_seen != this_eol)
6445               {
6446                 /* The found type is different from what found before.
6447                    Allow for stray ^M characters in DOS EOL files.  */
6448                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6449                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6450                   eol_seen = EOL_SEEN_CRLF;
6451                 else
6452                   {
6453                     eol_seen = EOL_SEEN_LF;
6454                     break;
6455                   }
6456               }
6457             if (++total == MAX_EOL_CHECK_COUNT)
6458               break;
6459           }
6460       }
6461   return eol_seen;
6462 }
6463
6464
6465 static Lisp_Object
6466 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6467 {
6468   Lisp_Object eol_type;
6469
6470   eol_type = CODING_ID_EOL_TYPE (coding->id);
6471   if (! VECTORP (eol_type))
6472     /* Already adjusted.  */
6473     return eol_type;
6474   if (eol_seen & EOL_SEEN_LF)
6475     {
6476       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6477       eol_type = Qunix;
6478     }
6479   else if (eol_seen & EOL_SEEN_CRLF)
6480     {
6481       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6482       eol_type = Qdos;
6483     }
6484   else if (eol_seen & EOL_SEEN_CR)
6485     {
6486       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6487       eol_type = Qmac;
6488     }
6489   return eol_type;
6490 }
6491
6492 /* Detect how a text specified in CODING is encoded.  If a coding
6493    system is detected, update fields of CODING by the detected coding
6494    system.  */
6495
6496 static void
6497 detect_coding (struct coding_system *coding)
6498 {
6499   const unsigned char *src, *src_end;
6500   unsigned int saved_mode = coding->mode;
6501   Lisp_Object found = Qnil;
6502   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6503
6504   coding->consumed = coding->consumed_char = 0;
6505   coding->produced = coding->produced_char = 0;
6506   coding_set_source (coding);
6507
6508   src_end = coding->source + coding->src_bytes;
6509
6510   coding->eol_seen = EOL_SEEN_NONE;
6511   /* If we have not yet decided the text encoding type, detect it
6512      now.  */
6513   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6514     {
6515       int c, i;
6516       struct coding_detection_info detect_info;
6517       bool null_byte_found = 0, eight_bit_found = 0;
6518       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6519                                        inhibit_null_byte_detection);
6520       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6521                                        inhibit_iso_escape_detection);
6522       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6523
6524       coding->head_ascii = 0;
6525       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6526       for (src = coding->source; src < src_end; src++)
6527         {
6528           c = *src;
6529           if (c & 0x80)
6530             {
6531               eight_bit_found = 1;
6532               if (null_byte_found)
6533                 break;
6534             }
6535           else if (c < 0x20)
6536             {
6537               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6538                   && ! inhibit_ied
6539                   && ! detect_info.checked)
6540                 {
6541                   if (detect_coding_iso_2022 (coding, &detect_info))
6542                     {
6543                       /* We have scanned the whole data.  */
6544                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6545                         {
6546                           /* We didn't find an 8-bit code.  We may
6547                              have found a null-byte, but it's very
6548                              rare that a binary file conforms to
6549                              ISO-2022.  */
6550                           src = src_end;
6551                           coding->head_ascii = src - coding->source;
6552                         }
6553                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6554                       break;
6555                     }
6556                 }
6557               else if (! c && !inhibit_nbd)
6558                 {
6559                   null_byte_found = 1;
6560                   if (eight_bit_found)
6561                     break;
6562                 }
6563               else if (! disable_ascii_optimization
6564                        && ! inhibit_eol_conversion)
6565                 {
6566                   if (c == '\r')
6567                     {
6568                       if (src < src_end && src[1] == '\n')
6569                         {
6570                           coding->eol_seen |= EOL_SEEN_CRLF;
6571                           src++;
6572                           if (! eight_bit_found)
6573                             coding->head_ascii++;
6574                         }
6575                       else
6576                         coding->eol_seen |= EOL_SEEN_CR;
6577                     }
6578                   else if (c == '\n')
6579                     {
6580                       coding->eol_seen |= EOL_SEEN_LF;
6581                     }
6582                 }
6583
6584               if (! eight_bit_found)
6585                 coding->head_ascii++;
6586             }
6587           else if (! eight_bit_found)
6588             coding->head_ascii++;
6589         }
6590
6591       if (null_byte_found || eight_bit_found
6592           || coding->head_ascii < coding->src_bytes
6593           || detect_info.found)
6594         {
6595           enum coding_category category;
6596           struct coding_system *this;
6597
6598           if (coding->head_ascii == coding->src_bytes)
6599             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6600             for (i = 0; i < coding_category_raw_text; i++)
6601               {
6602                 category = coding_priorities[i];
6603                 this = coding_categories + category;
6604                 if (detect_info.found & (1 << category))
6605                   break;
6606               }
6607           else
6608             {
6609               if (null_byte_found)
6610                 {
6611                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6612                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6613                 }
6614               else if (prefer_utf_8
6615                        && detect_coding_utf_8 (coding, &detect_info))
6616                 {
6617                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6618                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6619                 }
6620               for (i = 0; i < coding_category_raw_text; i++)
6621                 {
6622                   category = coding_priorities[i];
6623                   this = coding_categories + category;
6624                   /* Some of this->detector (e.g. detect_coding_sjis)
6625                      require this information.  */
6626                   coding->id = this->id;
6627                   if (this->id < 0)
6628                     {
6629                       /* No coding system of this category is defined.  */
6630                       detect_info.rejected |= (1 << category);
6631                     }
6632                   else if (category >= coding_category_raw_text)
6633                     continue;
6634                   else if (detect_info.checked & (1 << category))
6635                     {
6636                       if (detect_info.found & (1 << category))
6637                         break;
6638                     }
6639                   else if ((*(this->detector)) (coding, &detect_info)
6640                            && detect_info.found & (1 << category))
6641                     break;
6642                 }
6643             }
6644
6645           if (i < coding_category_raw_text)
6646             {
6647               if (category == coding_category_utf_8_auto)
6648                 {
6649                   Lisp_Object coding_systems;
6650
6651                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6652                                          coding_attr_utf_bom);
6653                   if (CONSP (coding_systems))
6654                     {
6655                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6656                         found = XCAR (coding_systems);
6657                       else
6658                         found = XCDR (coding_systems);
6659                     }
6660                   else
6661                     found = CODING_ID_NAME (this->id);
6662                 }
6663               else if (category == coding_category_utf_16_auto)
6664                 {
6665                   Lisp_Object coding_systems;
6666
6667                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6668                                          coding_attr_utf_bom);
6669                   if (CONSP (coding_systems))
6670                     {
6671                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6672                         found = XCAR (coding_systems);
6673                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6674                         found = XCDR (coding_systems);
6675                     }
6676                   else
6677                     found = CODING_ID_NAME (this->id);
6678                 }
6679               else
6680                 found = CODING_ID_NAME (this->id);
6681             }
6682           else if (null_byte_found)
6683             found = Qno_conversion;
6684           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6685                    == CATEGORY_MASK_ANY)
6686             found = Qraw_text;
6687           else if (detect_info.rejected)
6688             for (i = 0; i < coding_category_raw_text; i++)
6689               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6690                 {
6691                   this = coding_categories + coding_priorities[i];
6692                   found = CODING_ID_NAME (this->id);
6693                   break;
6694                 }
6695         }
6696     }
6697   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6698            == coding_category_utf_8_auto)
6699     {
6700       Lisp_Object coding_systems;
6701       struct coding_detection_info detect_info;
6702
6703       coding_systems
6704         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6705       detect_info.found = detect_info.rejected = 0;
6706       if (check_ascii (coding) == coding->src_bytes)
6707         {
6708           if (CONSP (coding_systems))
6709             found = XCDR (coding_systems);
6710         }
6711       else
6712         {
6713           if (CONSP (coding_systems)
6714               && detect_coding_utf_8 (coding, &detect_info))
6715             {
6716               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6717                 found = XCAR (coding_systems);
6718               else
6719                 found = XCDR (coding_systems);
6720             }
6721         }
6722     }
6723   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6724            == coding_category_utf_16_auto)
6725     {
6726       Lisp_Object coding_systems;
6727       struct coding_detection_info detect_info;
6728
6729       coding_systems
6730         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6731       detect_info.found = detect_info.rejected = 0;
6732       coding->head_ascii = 0;
6733       if (CONSP (coding_systems)
6734           && detect_coding_utf_16 (coding, &detect_info))
6735         {
6736           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6737             found = XCAR (coding_systems);
6738           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6739             found = XCDR (coding_systems);
6740         }
6741     }
6742
6743   if (! NILP (found))
6744     {
6745       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6746                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6747                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6748                            : EOL_SEEN_LF);
6749
6750       setup_coding_system (found, coding);
6751       if (specified_eol != EOL_SEEN_NONE)
6752         adjust_coding_eol_type (coding, specified_eol);
6753     }
6754
6755   coding->mode = saved_mode;
6756 }
6757
6758
6759 static void
6760 decode_eol (struct coding_system *coding)
6761 {
6762   Lisp_Object eol_type;
6763   unsigned char *p, *pbeg, *pend;
6764
6765   eol_type = CODING_ID_EOL_TYPE (coding->id);
6766   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6767     return;
6768
6769   if (NILP (coding->dst_object))
6770     pbeg = coding->destination;
6771   else
6772     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6773   pend = pbeg + coding->produced;
6774
6775   if (VECTORP (eol_type))
6776     {
6777       int eol_seen = EOL_SEEN_NONE;
6778
6779       for (p = pbeg; p < pend; p++)
6780         {
6781           if (*p == '\n')
6782             eol_seen |= EOL_SEEN_LF;
6783           else if (*p == '\r')
6784             {
6785               if (p + 1 < pend && *(p + 1) == '\n')
6786                 {
6787                   eol_seen |= EOL_SEEN_CRLF;
6788                   p++;
6789                 }
6790               else
6791                 eol_seen |= EOL_SEEN_CR;
6792             }
6793         }
6794       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6795       if ((eol_seen & EOL_SEEN_CRLF) != 0
6796           && (eol_seen & EOL_SEEN_CR) != 0
6797           && (eol_seen & EOL_SEEN_LF) == 0)
6798         eol_seen = EOL_SEEN_CRLF;
6799       else if (eol_seen != EOL_SEEN_NONE
6800           && eol_seen != EOL_SEEN_LF
6801           && eol_seen != EOL_SEEN_CRLF
6802           && eol_seen != EOL_SEEN_CR)
6803         eol_seen = EOL_SEEN_LF;
6804       if (eol_seen != EOL_SEEN_NONE)
6805         eol_type = adjust_coding_eol_type (coding, eol_seen);
6806     }
6807
6808   if (EQ (eol_type, Qmac))
6809     {
6810       for (p = pbeg; p < pend; p++)
6811         if (*p == '\r')
6812           *p = '\n';
6813     }
6814   else if (EQ (eol_type, Qdos))
6815     {
6816       ptrdiff_t n = 0;
6817
6818       if (NILP (coding->dst_object))
6819         {
6820           /* Start deleting '\r' from the tail to minimize the memory
6821              movement.  */
6822           for (p = pend - 2; p >= pbeg; p--)
6823             if (*p == '\r')
6824               {
6825                 memmove (p, p + 1, pend-- - p - 1);
6826                 n++;
6827               }
6828         }
6829       else
6830         {
6831           ptrdiff_t pos = coding->dst_pos;
6832           ptrdiff_t pos_byte = coding->dst_pos_byte;
6833           ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6834
6835           while (pos_byte < pos_end)
6836             {
6837               p = BYTE_POS_ADDR (pos_byte);
6838               if (*p == '\r' && p[1] == '\n')
6839                 {
6840                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6841                   n++;
6842                   pos_end--;
6843                 }
6844               pos++;
6845               if (coding->dst_multibyte)
6846                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6847               else
6848                 pos_byte++;
6849             }
6850         }
6851       coding->produced -= n;
6852       coding->produced_char -= n;
6853     }
6854 }
6855
6856
6857 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6858    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6859    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6860 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6861
6862 /* Return a translation table (or list of them) from coding system
6863    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6864    not ENCODEP). */
6865
6866 static Lisp_Object
6867 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6868 {
6869   Lisp_Object standard, translation_table;
6870   Lisp_Object val;
6871
6872   if (NILP (Venable_character_translation))
6873     {
6874       if (max_lookup)
6875         *max_lookup = 0;
6876       return Qnil;
6877     }
6878   if (encodep)
6879     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6880       standard = Vstandard_translation_table_for_encode;
6881   else
6882     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6883       standard = Vstandard_translation_table_for_decode;
6884   if (NILP (translation_table))
6885     translation_table = standard;
6886   else
6887     {
6888       if (SYMBOLP (translation_table))
6889         translation_table = Fget (translation_table, Qtranslation_table);
6890       else if (CONSP (translation_table))
6891         {
6892           translation_table = Fcopy_sequence (translation_table);
6893           for (val = translation_table; CONSP (val); val = XCDR (val))
6894             if (SYMBOLP (XCAR (val)))
6895               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6896         }
6897       if (CHAR_TABLE_P (standard))
6898         {
6899           if (CONSP (translation_table))
6900             translation_table = nconc2 (translation_table, list1 (standard));
6901           else
6902             translation_table = list2 (translation_table, standard);
6903         }
6904     }
6905
6906   if (max_lookup)
6907     {
6908       *max_lookup = 1;
6909       if (CHAR_TABLE_P (translation_table)
6910           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6911         {
6912           val = XCHAR_TABLE (translation_table)->extras[1];
6913           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6914             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6915         }
6916       else if (CONSP (translation_table))
6917         {
6918           Lisp_Object tail;
6919
6920           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6921             if (CHAR_TABLE_P (XCAR (tail))
6922                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6923               {
6924                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6925                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6926                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6927               }
6928         }
6929     }
6930   return translation_table;
6931 }
6932
/* Look up character C in TABLE, which is either a char-table or a
   list; the char-table elements of a list are consulted in order and
   the other elements are skipped.

   Whenever a table maps C to a character, C is replaced by that
   character and, in a list, the lookup goes on with the next table.
   The lookup stops at the first entry that is neither nil nor a
   character.

   On exit TRANS is that entry, or Qnil if there was none.  C and
   TRANS must be lvalues.  TABLE is evaluated once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_table_ = (table);                                \
                                                                        \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (lookup_table_))                                   \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (lookup_table_, (c));                  \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);        \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6957
6958
6959 /* Return a translation of character(s) at BUF according to TRANS.
6960    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6961    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6962    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6963    translation is found, and Qnil if not found..
6964    If BUF is too short to lookup characters in FROM, return Qt.  */
6965
6966 static Lisp_Object
6967 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6968 {
6969
6970   if (INTEGERP (trans))
6971     return trans;
6972   for (; CONSP (trans); trans = XCDR (trans))
6973     {
6974       Lisp_Object val = XCAR (trans);
6975       Lisp_Object from = XCAR (val);
6976       ptrdiff_t len = ASIZE (from);
6977       ptrdiff_t i;
6978
6979       for (i = 0; i < len; i++)
6980         {
6981           if (buf + i == buf_end)
6982             return Qt;
6983           if (XINT (AREF (from, i)) != buf[i])
6984             break;
6985         }
6986       if (i == len)
6987         return val;
6988     }
6989   return Qnil;
6990 }
6991
6992
6993 static int
6994 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6995                bool last_block)
6996 {
6997   unsigned char *dst = coding->destination + coding->produced;
6998   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6999   ptrdiff_t produced;
7000   ptrdiff_t produced_chars = 0;
7001   int carryover = 0;
7002
7003   if (! coding->chars_at_source)
7004     {
7005       /* Source characters are in coding->charbuf.  */
7006       int *buf = coding->charbuf;
7007       int *buf_end = buf + coding->charbuf_used;
7008
7009       if (EQ (coding->src_object, coding->dst_object)
7010           && ! NILP (coding->dst_object))
7011         {
7012           eassert (growable_destination (coding));
7013           coding_set_source (coding);
7014           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7015         }
7016
7017       while (buf < buf_end)
7018         {
7019           int c = *buf;
7020           ptrdiff_t i;
7021
7022           if (c >= 0)
7023             {
7024               ptrdiff_t from_nchars = 1, to_nchars = 1;
7025               Lisp_Object trans = Qnil;
7026
7027               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7028               if (! NILP (trans))
7029                 {
7030                   trans = get_translation (trans, buf, buf_end);
7031                   if (INTEGERP (trans))
7032                     c = XINT (trans);
7033                   else if (CONSP (trans))
7034                     {
7035                       from_nchars = ASIZE (XCAR (trans));
7036                       trans = XCDR (trans);
7037                       if (INTEGERP (trans))
7038                         c = XINT (trans);
7039                       else
7040                         {
7041                           to_nchars = ASIZE (trans);
7042                           c = XINT (AREF (trans, 0));
7043                         }
7044                     }
7045                   else if (EQ (trans, Qt) && ! last_block)
7046                     break;
7047                 }
7048
7049               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7050                 {
7051                   eassert (growable_destination (coding));
7052                   ptrdiff_t dst_size;
7053                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7054                                           &dst_size)
7055                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7056                     memory_full (SIZE_MAX);
7057                   dst = alloc_destination (coding, dst_size, dst);
7058                   if (EQ (coding->src_object, coding->dst_object))
7059                     {
7060                       coding_set_source (coding);
7061                       dst_end = (((unsigned char *) coding->source)
7062                                  + coding->consumed);
7063                     }
7064                   else
7065                     dst_end = coding->destination + coding->dst_bytes;
7066                 }
7067
7068               for (i = 0; i < to_nchars; i++)
7069                 {
7070                   if (i > 0)
7071                     c = XINT (AREF (trans, i));
7072                   if (coding->dst_multibyte
7073                       || ! CHAR_BYTE8_P (c))
7074                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7075                   else
7076                     *dst++ = CHAR_TO_BYTE8 (c);
7077                 }
7078               produced_chars += to_nchars;
7079               buf += from_nchars;
7080             }
7081           else
7082             /* This is an annotation datum.  (-C) is the length.  */
7083             buf += -c;
7084         }
7085       carryover = buf_end - buf;
7086     }
7087   else
7088     {
7089       /* Source characters are at coding->source.  */
7090       const unsigned char *src = coding->source;
7091       const unsigned char *src_end = src + coding->consumed;
7092
7093       if (EQ (coding->dst_object, coding->src_object))
7094         {
7095           eassert (growable_destination (coding));
7096           dst_end = (unsigned char *) src;
7097         }
7098       if (coding->src_multibyte != coding->dst_multibyte)
7099         {
7100           if (coding->src_multibyte)
7101             {
7102               bool multibytep = 1;
7103               ptrdiff_t consumed_chars = 0;
7104
7105               while (1)
7106                 {
7107                   const unsigned char *src_base = src;
7108                   int c;
7109
7110                   ONE_MORE_BYTE (c);
7111                   if (dst == dst_end)
7112                     {
7113                       eassert (growable_destination (coding));
7114                       if (EQ (coding->src_object, coding->dst_object))
7115                         dst_end = (unsigned char *) src;
7116                       if (dst == dst_end)
7117                         {
7118                           ptrdiff_t offset = src - coding->source;
7119
7120                           dst = alloc_destination (coding, src_end - src + 1,
7121                                                    dst);
7122                           dst_end = coding->destination + coding->dst_bytes;
7123                           coding_set_source (coding);
7124                           src = coding->source + offset;
7125                           src_end = coding->source + coding->consumed;
7126                           if (EQ (coding->src_object, coding->dst_object))
7127                             dst_end = (unsigned char *) src;
7128                         }
7129                     }
7130                   *dst++ = c;
7131                   produced_chars++;
7132                 }
7133             no_more_source:
7134               ;
7135             }
7136           else
7137             while (src < src_end)
7138               {
7139                 bool multibytep = 1;
7140                 int c = *src++;
7141
7142                 if (dst >= dst_end - 1)
7143                   {
7144                     eassert (growable_destination (coding));
7145                     if (EQ (coding->src_object, coding->dst_object))
7146                       dst_end = (unsigned char *) src;
7147                     if (dst >= dst_end - 1)
7148                       {
7149                         ptrdiff_t offset = src - coding->source;
7150                         ptrdiff_t more_bytes;
7151
7152                         if (EQ (coding->src_object, coding->dst_object))
7153                           more_bytes = ((src_end - src) / 2) + 2;
7154                         else
7155                           more_bytes = src_end - src + 2;
7156                         dst = alloc_destination (coding, more_bytes, dst);
7157                         dst_end = coding->destination + coding->dst_bytes;
7158                         coding_set_source (coding);
7159                         src = coding->source + offset;
7160                         src_end = coding->source + coding->consumed;
7161                         if (EQ (coding->src_object, coding->dst_object))
7162                           dst_end = (unsigned char *) src;
7163                       }
7164                   }
7165                 EMIT_ONE_BYTE (c);
7166               }
7167         }
7168       else
7169         {
7170           if (!EQ (coding->src_object, coding->dst_object))
7171             {
7172               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7173
7174               if (require > 0)
7175                 {
7176                   ptrdiff_t offset = src - coding->source;
7177
7178                   dst = alloc_destination (coding, require, dst);
7179                   coding_set_source (coding);
7180                   src = coding->source + offset;
7181                   src_end = coding->source + coding->consumed;
7182                 }
7183             }
7184           produced_chars = coding->consumed_char;
7185           while (src < src_end)
7186             *dst++ = *src++;
7187         }
7188     }
7189
7190   produced = dst - (coding->destination + coding->produced);
7191   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7192     insert_from_gap (produced_chars, produced, 0);
7193   coding->produced += produced;
7194   coding->produced_char += produced_chars;
7195   return carryover;
7196 }
7197
7198 /* Compose text in CODING->object according to the annotation data at
7199    CHARBUF.  CHARBUF is an array:
7200      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7201  */
7202
7203 static void
7204 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7205 {
7206   int len;
7207   ptrdiff_t to;
7208   enum composition_method method;
7209   Lisp_Object components;
7210
7211   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7212   to = pos + charbuf[2];
7213   method = (enum composition_method) (charbuf[4]);
7214
7215   if (method == COMPOSITION_RELATIVE)
7216     components = Qnil;
7217   else
7218     {
7219       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7220       int i, j;
7221
7222       if (method == COMPOSITION_WITH_RULE)
7223         len = charbuf[2] * 3 - 2;
7224       charbuf += MAX_ANNOTATION_LENGTH;
7225       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7226       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7227         {
7228           if (charbuf[i] >= 0)
7229             args[j] = make_number (charbuf[i]);
7230           else
7231             {
7232               i++;
7233               args[j] = make_number (charbuf[i] % 0x100);
7234             }
7235         }
7236       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7237     }
7238   compose_text (pos, to, components, Qnil, coding->dst_object);
7239 }
7240
7241
7242 /* Put `charset' property on text in CODING->object according to
7243    the annotation data at CHARBUF.  CHARBUF is an array:
7244      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7245  */
7246
7247 static void
7248 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7249 {
7250   ptrdiff_t from = pos - charbuf[2];
7251   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7252
7253   Fput_text_property (make_number (from), make_number (pos),
7254                       Qcharset, CHARSET_NAME (charset),
7255                       coding->dst_object);
7256 }
7257
/* Upper bound, in ints, on the size of the conversion work area.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the work area CODING->charbuf with SAFE_ALLOCA and record
   its size, in ints, in CODING->charbuf_size.  The size is
   SIZE + MAX_CHARBUF_EXTRA_SIZE, capped at MAX_CHARBUF_SIZE.

   CODING is a pointer expression; it is evaluated twice, so it must
   not have side effects.  It is parenthesized in the expansion so
   that arguments such as `&obj' work as expected.  The users in this
   file invoke the macro after USE_SAFE_ALLOCA.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7271
7272 static void
7273 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7274 {
7275   int *charbuf = coding->charbuf;
7276   int *charbuf_end = charbuf + coding->charbuf_used;
7277
7278   if (NILP (coding->dst_object))
7279     return;
7280
7281   while (charbuf < charbuf_end)
7282     {
7283       if (*charbuf >= 0)
7284         pos++, charbuf++;
7285       else
7286         {
7287           int len = -*charbuf;
7288
7289           if (len > 2)
7290             switch (charbuf[1])
7291               {
7292               case CODING_ANNOTATE_COMPOSITION_MASK:
7293                 produce_composition (coding, charbuf, pos);
7294                 break;
7295               case CODING_ANNOTATE_CHARSET_MASK:
7296                 produce_charset (coding, charbuf, pos);
7297                 break;
7298               default:
7299                 break;
7300               }
7301           charbuf += len;
7302         }
7303     }
7304 }
7305
7306 /* Decode the data at CODING->src_object into CODING->dst_object.
7307    CODING->src_object is a buffer, a string, or nil.
7308    CODING->dst_object is a buffer.
7309
7310    If CODING->src_object is a buffer, it must be the current buffer.
7311    In this case, if CODING->src_pos is positive, it is a position of
7312    the source text in the buffer, otherwise, the source text is in the
7313    gap area of the buffer, and CODING->src_pos specifies the offset of
7314    the text from GPT (which must be the same as PT).  If this is the
7315    same buffer as CODING->dst_object, CODING->src_pos must be
7316    negative.
7317
7318    If CODING->src_object is a string, CODING->src_pos is an index to
7319    that string.
7320
7321    If CODING->src_object is nil, CODING->source must already point to
7322    the non-relocatable memory area.  In this case, CODING->src_pos is
7323    an offset from CODING->source.
7324
7325    The decoded data is inserted at the current point of the buffer
7326    CODING->dst_object.
7327 */
/* Outline of the implementation: the source is decoded in rounds.
   Each round calls CODING->decoder to fill CODING->charbuf, then
   produce_chars to write the result to the destination, then
   produce_annotation if the decoder flagged annotations.  Elements
   that produce_chars reports as carried over are moved to the front
   of the charbuf for the next round.  Source bytes still unconsumed
   when the loop ends are either flushed as raw bytes (last block) or
   saved in CODING->carryover.  Finally EOL decoding is applied and,
   for a buffer destination, the whole insertion is recorded as one
   undo entry.

   NOTE(review): the code guards every buffer-specific step with
   BUFFERP (coding->dst_object), so non-buffer destinations appear to
   be accepted as well -- confirm against the callers.  */
7328
7329 static void
7330 decode_coding (struct coding_system *coding)
7331 {
7332   Lisp_Object attrs;
7333   Lisp_Object undo_list;
7334   Lisp_Object translation_table;
7335   struct ccl_spec cclspec;
7336   int carryover;
7337   int i;
7338
7339   USE_SAFE_ALLOCA;
7340
       /* If the source is buffer text that starts before the gap and
          extends beyond it, move the gap to the start of the source.  */
7341   if (BUFFERP (coding->src_object)
7342       && coding->src_pos > 0
7343       && coding->src_pos < GPT
7344       && coding->src_pos + coding->src_chars > GPT)
7345     move_gap_both (coding->src_pos, coding->src_pos_byte);
7346
       /* UNDO_LIST is only read back when the destination is a buffer;
          Qt is just a placeholder for the other cases.  */
7347   undo_list = Qt;
7348   if (BUFFERP (coding->dst_object))
7349     {
7350       set_buffer_internal (XBUFFER (coding->dst_object));
           /* Output is written at the gap, so the gap must be at point.  */
7351       if (GPT != PT)
7352         move_gap_both (PT, PT_BYTE);
7353
7354       /* We must disable undo_list in order to record the whole insert
7355          transaction via record_insert at the end.  But doing so also
7356          disables the recording of the first change to the undo_list.
7357          Therefore we check for first change here and record it via
7358          record_first_change if needed.  */
7359       if (MODIFF <= SAVE_MODIFF)
7360         record_first_change ();
7361
7362       undo_list = BVAR (current_buffer, undo_list);
7363       bset_undo_list (current_buffer, Qt);
7364     }
7365
       /* Start from a clean conversion state.  */
7366   coding->consumed = coding->consumed_char = 0;
7367   coding->produced = coding->produced_char = 0;
7368   coding->chars_at_source = 0;
7369   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7370
       /* The work area is sized from the number of source bytes.  */
7371   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7372
7373   attrs = CODING_ID_ATTRS (coding->id);
7374   translation_table = get_translation_table (attrs, 0, NULL);
7375
7376   carryover = 0;
7377   if (coding->decoder == decode_coding_ccl)
7378     {
           /* CCLSPEC lives on this function's stack, so coding->spec.ccl
              is only valid for the duration of this call.  */
7379       coding->spec.ccl = &cclspec;
7380       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7381     }
7382   do
7383     {
           /* Destination position of the first character this round
              produces; annotations are applied relative to it.  */
7384       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7385
7386       coding_set_source (coding);
7387       coding->annotated = 0;
           /* The first CARRYOVER elements of the charbuf are left over
              from the previous round.  */
7388       coding->charbuf_used = carryover;
7389       (*(coding->decoder)) (coding);
7390       coding_set_destination (coding);
7391       carryover = produce_chars (coding, translation_table, 0);
7392       if (coding->annotated)
7393         produce_annotation (coding, pos);
           /* Move the last CARRYOVER elements to the front of the
              charbuf for the next round.  */
7394       for (i = 0; i < carryover; i++)
7395         coding->charbuf[i]
7396           = coding->charbuf[coding->charbuf_used - carryover + i];
7397     }
       /* Repeat while the destination was too small, or while source
          bytes remain and the last round ended with success or with an
          invalid-source result.  */
7398   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7399          || (coding->consumed < coding->src_bytes
7400              && (coding->result == CODING_RESULT_SUCCESS
7401                  || coding->result == CODING_RESULT_INVALID_SRC)));
7402
7403   if (carryover > 0)
7404     {
           /* Produce the elements still carried over.  The last argument
              is 1 here instead of 0 -- presumably "last block"; confirm
              against produce_chars.  */
7405       coding_set_destination (coding);
7406       coding->charbuf_used = carryover;
7407       produce_chars (coding, translation_table, 1);
7408     }
7409
7410   coding->carryover_bytes = 0;
7411   if (coding->consumed < coding->src_bytes)
7412     {
           /* Some source bytes were left unconsumed by the decoder.  */
7413       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7414       const unsigned char *src;
7415
7416       coding_set_source (coding);
7417       coding_set_destination (coding);
7418       src = coding->source + coding->consumed;
7419
7420       if (coding->mode & CODING_MODE_LAST_BLOCK)
7421         {
7422           /* Flush out unprocessed data as binary chars.  We are sure
7423              that the number of data is less than the size of
7424              coding->charbuf.  */
7425           coding->charbuf_used = 0;
7426           coding->chars_at_source = 0;
7427
7428           while (nbytes-- > 0)
7429             {
7430               int c = *src++;
7431
                   /* Bytes with the high bit set are converted with
                      BYTE8_TO_CHAR; the others are stored as they are.  */
7432               if (c & 0x80)
7433                 c = BYTE8_TO_CHAR (c);
7434               coding->charbuf[coding->charbuf_used++] = c;
7435             }
7436           produce_chars (coding, Qnil, 1);
7437         }
7438       else
7439         {
7440           /* Record unprocessed bytes in coding->carryover.  We are
7441              sure that the number of data is less than the size of
7442              coding->carryover.  */
7443           unsigned char *p = coding->carryover;
7444
               /* Nevertheless, never copy more than the array can hold.  */
7445           if (nbytes > sizeof coding->carryover)
7446             nbytes = sizeof coding->carryover;
7447           coding->carryover_bytes = nbytes;
7448           while (nbytes-- > 0)
7449             *p++ = *src++;
7450         }
           /* Either way, the whole source now counts as consumed.  */
7451       coding->consumed = coding->src_bytes;
7452     }
7453
       /* Decode end-of-line sequences unless the EOL type is `unix' or
          EOL conversion is inhibited.  */
7454   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7455       && !inhibit_eol_conversion)
7456     decode_eol (coding);
7457   if (BUFFERP (coding->dst_object))
7458     {
           /* Restore the undo list saved above and record the whole
              insertion at once.  */
7459       bset_undo_list (current_buffer, undo_list);
7460       record_insert (coding->dst_pos, coding->produced_char);
7461     }
7462
7463   SAFE_FREE ();
7464 }
7465
7466
7467 /* Extract an annotation datum from a composition starting at POS and
7468    ending before LIMIT of CODING->src_object (buffer or string), store
7469    the data in BUF, set *STOP to a starting position of the next
7470    composition (if any) or to LIMIT, and return the address of the
7471    next element of BUF.
7472
7473    If such an annotation is not found, set *STOP to a starting
7474    position of a composition after POS (if any) or to LIMIT, and
7475    return BUF.  */
7476
7477 static int *
7478 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7479                                struct coding_system *coding, int *buf,
7480                                ptrdiff_t *stop)
7481 {
7482   ptrdiff_t start, end;
7483   Lisp_Object prop;
7484
7485   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7486       || end > limit)
7487     *stop = limit;
7488   else if (start > pos)
7489     *stop = start;
7490   else
7491     {
7492       if (start == pos)
7493         {
7494           /* We found a composition.  Store the corresponding
7495              annotation data in BUF.  */
7496           int *head = buf;
7497           enum composition_method method = composition_method (prop);
7498           int nchars = COMPOSITION_LENGTH (prop);
7499
7500           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7501           if (method != COMPOSITION_RELATIVE)
7502             {
7503               Lisp_Object components;
7504               ptrdiff_t i, len, i_byte;
7505
7506               components = COMPOSITION_COMPONENTS (prop);
7507               if (VECTORP (components))
7508                 {
7509                   len = ASIZE (components);
7510                   for (i = 0; i < len; i++)
7511                     *buf++ = XINT (AREF (components, i));
7512                 }
7513               else if (STRINGP (components))
7514                 {
7515                   len = SCHARS (components);
7516                   i = i_byte = 0;
7517                   while (i < len)
7518                     {
7519                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7520                       buf++;
7521                     }
7522                 }
7523               else if (INTEGERP (components))
7524                 {
7525                   len = 1;
7526                   *buf++ = XINT (components);
7527                 }
7528               else if (CONSP (components))
7529                 {
7530                   for (len = 0; CONSP (components);
7531                        len++, components = XCDR (components))
7532                     *buf++ = XINT (XCAR (components));
7533                 }
7534               else
7535                 emacs_abort ();
7536               *head -= len;
7537             }
7538         }
7539
7540       if (find_composition (end, limit, &start, &end, &prop,
7541                             coding->src_object)
7542           && end <= limit)
7543         *stop = start;
7544       else
7545         *stop = limit;
7546     }
7547   return buf;
7548 }
7549
7550
7551 /* Extract an annotation datum from a text property `charset' at POS of
7552    CODING->src_object (buffer of string), store the data in BUF, set
7553    *STOP to the position where the value of `charset' property changes
7554    (limiting by LIMIT), and return the address of the next element of
7555    BUF.
7556
7557    If the property value is nil, set *STOP to the position where the
7558    property value is non-nil (limiting by LIMIT), and return BUF.  */
7559
7560 static int *
7561 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7562                            struct coding_system *coding, int *buf,
7563                            ptrdiff_t *stop)
7564 {
7565   Lisp_Object val, next;
7566   int id;
7567
7568   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7569   if (! NILP (val) && CHARSETP (val))
7570     id = XINT (CHARSET_SYMBOL_ID (val));
7571   else
7572     id = -1;
7573   ADD_CHARSET_DATA (buf, 0, id);
7574   next = Fnext_single_property_change (make_number (pos), Qcharset,
7575                                        coding->src_object,
7576                                        make_number (limit));
7577   *stop = XINT (next);
7578   return buf;
7579 }
7580
7581
/* Read characters from the source of CODING, starting after the
   CODING->consumed bytes already taken, and store them in
   CODING->charbuf together with any annotation records, until the
   source is exhausted or the charbuf is (nearly) full.

   While reading, '\r' is turned into '\n' under
   CODING_MODE_SELECTIVE_DISPLAY, newlines are converted according to
   the coding system's EOL type, and characters are mapped through
   TRANSLATION_TABLE when it is non-nil.  MAX_LOOKUP is the number of
   characters that may be looked at for one translation; it sizes the
   lookup buffer.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated, and CODING->chars_at_source is
   reset to 0.  */
7582 static void
7583 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7584                int max_lookup)
7585 {
7586   int *buf = coding->charbuf;
7587   int *buf_end = coding->charbuf + coding->charbuf_size;
7588   const unsigned char *src = coding->source + coding->consumed;
7589   const unsigned char *src_end = coding->source + coding->src_bytes;
7590   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7591   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7592   bool multibytep = coding->src_multibyte;
7593   Lisp_Object eol_type;
7594   int c;
       /* STOP is the next position at which annotations must be
          checked: the smaller of STOP_COMPOSITION and STOP_CHARSET.  */
7595   ptrdiff_t stop, stop_composition, stop_charset;
7596   int *lookup_buf = NULL;
7597
       /* The lookup buffer is only needed when translating.  */
7598   if (! NILP (translation_table))
7599     lookup_buf = alloca (sizeof (int) * max_lookup);
7600
       /* A vector-valued EOL type is handled like `unix', i.e. without
          conversion (presumably it means the type is not yet decided).  */
7601   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7602   if (VECTORP (eol_type))
7603     eol_type = Qunix;
7604
7605   /* Note: composition handling is not yet implemented.  */
       /* Clearing the flag makes the composition branch below dead, so
          STOP_COMPOSITION always starts out as END_POS.  */
7606   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7607
7608   if (NILP (coding->src_object))
7609     stop = stop_composition = stop_charset = end_pos;
7610   else
7611     {
7612       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7613         stop = stop_composition = pos;
7614       else
7615         stop = stop_composition = end_pos;
7616       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7617         stop = stop_charset = pos;
7618       else
             /* STOP keeps the value chosen for compositions above.  */
7619         stop_charset = end_pos;
7620     }
7621
7622   /* Compensate for CRLF and conversion.  */
       /* That is, keep room for the extra '\r' and for one annotation
          record that a single loop iteration may add.  */
7623   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7624   while (buf < buf_end)
7625     {
7626       Lisp_Object trans;
7627
7628       if (pos == stop)
7629         {
               /* Reaching END_POS means the source is exhausted.  */
7630           if (pos == end_pos)
7631             break;
7632           if (pos == stop_composition)
7633             buf = handle_composition_annotation (pos, end_pos, coding,
7634                                                  buf, &stop_composition);
7635           if (pos == stop_charset)
7636             buf = handle_charset_annotation (pos, end_pos, coding,
7637                                              buf, &stop_charset);
7638           stop = (stop_composition < stop_charset
7639                   ? stop_composition : stop_charset);
7640         }
7641
           /* Fetch the next character C, advancing SRC and POS.  */
7642       if (! multibytep)
7643         {
7644           int bytes;
7645
               /* Unibyte source: the raw-text and CCL encoders take each
                  byte as is.  Otherwise a valid multibyte sequence is read
                  as one character, with POS advanced by its byte length,
                  and any other byte is converted with BYTE8_TO_CHAR.  */
7646           if (coding->encoder == encode_coding_raw_text
7647               || coding->encoder == encode_coding_ccl)
7648             c = *src++, pos++;
7649           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7650             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7651           else
7652             c = BYTE8_TO_CHAR (*src), src++, pos++;
7653         }
7654       else
7655         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7656       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7657         c = '\n';
7658       if (! EQ (eol_type, Qunix))
7659         {
7660           if (c == '\n')
7661             {
                   /* `dos': emit '\r' in front of the '\n'.  Any other
                      non-unix type (presumably `mac'): replace the '\n'
                      by '\r'.  */
7662               if (EQ (eol_type, Qdos))
7663                 *buf++ = '\r';
7664               else
7665                 c = '\r';
7666             }
7667         }
7668
7669       trans = Qnil;
7670       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7671       if (NILP (trans))
7672         *buf++ = c;
7673       else
7674         {
               /* FROM_NCHARS source characters are replaced by TO_NCHARS
                  translated ones.  */
7675           ptrdiff_t from_nchars = 1, to_nchars = 1;
7676           int *lookup_buf_end;
7677           const unsigned char *p = src;
7678           int i;
7679
               /* Collect C and up to MAX_LOOKUP - 1 following characters,
                  reading ahead through P without advancing SRC.  */
7680           lookup_buf[0] = c;
7681           for (i = 1; i < max_lookup && p < src_end; i++)
7682             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7683           lookup_buf_end = lookup_buf + i;
7684           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7685           if (INTEGERP (trans))
7686             c = XINT (trans);
7687           else if (CONSP (trans))
7688             {
                   /* (FROM . TO): FROM is a vector; TO is either a single
                      character or a vector of characters.  */
7689               from_nchars = ASIZE (XCAR (trans));
7690               trans = XCDR (trans);
7691               if (INTEGERP (trans))
7692                 c = XINT (trans);
7693               else
7694                 {
7695                   to_nchars = ASIZE (trans);
                       /* NOTE(review): on this break, and on the `else
                          break' below, SRC and POS appear to have been
                          advanced past C already, yet C is not stored in
                          BUF -- confirm this cannot lose a character.  */
7696                   if (buf_end - buf < to_nchars)
7697                     break;
7698                   c = XINT (AREF (trans, 0));
7699                 }
7700             }
7701           else
7702             break;
7703           *buf++ = c;
7704           for (i = 1; i < to_nchars; i++)
7705             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining source characters that the translation
                  consumed.  */
7706           for (i = 1; i < from_nchars; i++, pos++)
7707             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7708         }
7709     }
7710
       /* Publish the progress made in this call.  */
7711   coding->consumed = src - coding->source;
7712   coding->consumed_char = pos - coding->src_pos;
7713   coding->charbuf_used = buf - coding->charbuf;
7714   coding->chars_at_source = 0;
7715 }
7716
7717
7718 /* Encode the text at CODING->src_object into CODING->dst_object.
7719    CODING->src_object is a buffer or a string.
7720    CODING->dst_object is a buffer or nil.
7721
7722    If CODING->src_object is a buffer, it must be the current buffer.
7723    In this case, if CODING->src_pos is positive, it is a position of
7724    the source text in the buffer, otherwise. the source text is in the
7725    gap area of the buffer, and coding->src_pos specifies the offset of
7726    the text from GPT (which must be the same as PT).  If this is the
7727    same buffer as CODING->dst_object, CODING->src_pos must be
7728    negative and CODING should not have `pre-write-conversion'.
7729
7730    If CODING->src_object is a string, CODING should not have
7731    `pre-write-conversion'.
7732
7733    If CODING->dst_object is a buffer, the encoded data is inserted at
7734    the current point of that buffer.
7735
7736    If CODING->dst_object is nil, the encoded data is placed at the
7737    memory area specified by CODING->destination.  */
7738
7739 static void
7740 encode_coding (struct coding_system *coding)
7741 {
7742   Lisp_Object attrs;
7743   Lisp_Object translation_table;
7744   int max_lookup;
7745   struct ccl_spec cclspec;
7746
7747   USE_SAFE_ALLOCA;
7748
7749   attrs = CODING_ID_ATTRS (coding->id);
7750   if (coding->encoder == encode_coding_raw_text)
7751     translation_table = Qnil, max_lookup = 0;
7752   else
7753     translation_table = get_translation_table (attrs, 1, &max_lookup);
7754
7755   if (BUFFERP (coding->dst_object))
7756     {
7757       set_buffer_internal (XBUFFER (coding->dst_object));
7758       coding->dst_multibyte
7759         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7760     }
7761
7762   coding->consumed = coding->consumed_char = 0;
7763   coding->produced = coding->produced_char = 0;
7764   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7765
7766   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7767
7768   if (coding->encoder == encode_coding_ccl)
7769     {
7770       coding->spec.ccl = &cclspec;
7771       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7772     }
7773   do {
7774     coding_set_source (coding);
7775     consume_chars (coding, translation_table, max_lookup);
7776     coding_set_destination (coding);
7777     (*(coding->encoder)) (coding);
7778   } while (coding->consumed_char < coding->src_chars);
7779
7780   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7781     insert_from_gap (coding->produced_char, coding->produced, 0);
7782
7783   SAFE_FREE ();
7784 }
7785
7786
7787 /* Name (or base name) of work buffer for code conversion.  */
     /* make_conversion_work_buffer uses it directly for the reused work
        buffer and passes it to Fgenerate_new_buffer_name to name any
        additional work buffer.  */
7788 static Lisp_Object Vcode_conversion_workbuf_name;
7789
7790 /* A working buffer used by the top level conversion.  Once it is
7791    created, it is never destroyed.  It has the name
7792    Vcode_conversion_workbuf_name.  The other working buffers are
7793    destroyed after the use is finished, and their names are modified
7794    versions of Vcode_conversion_workbuf_name.  */
     /* make_conversion_work_buffer creates this buffer anew whenever it
        finds that it is not live; code_conversion_restore kills every
        work buffer other than this one.  */
7795 static Lisp_Object Vcode_conversion_reused_workbuf;
7796
7797 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
     /* Set by make_conversion_work_buffer when it hands out the reused
        buffer, and cleared by code_conversion_restore.  */
7798 static bool reused_workbuf_in_use;
7799
7800
7801 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7802    multibyteness of returning buffer.  */
7803
7804 static Lisp_Object
7805 make_conversion_work_buffer (bool multibyte)
7806 {
7807   Lisp_Object name, workbuf;
7808   struct buffer *current;
7809
7810   if (reused_workbuf_in_use)
7811     {
7812       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7813       workbuf = Fget_buffer_create (name);
7814     }
7815   else
7816     {
7817       reused_workbuf_in_use = 1;
7818       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7819         Vcode_conversion_reused_workbuf
7820           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7821       workbuf = Vcode_conversion_reused_workbuf;
7822     }
7823   current = current_buffer;
7824   set_buffer_internal (XBUFFER (workbuf));
7825   /* We can't allow modification hooks to run in the work buffer.  For
7826      instance, directory_files_internal assumes that file decoding
7827      doesn't compile new regexps.  */
7828   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7829   Ferase_buffer ();
7830   bset_undo_list (current_buffer, Qt);
7831   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7832   set_buffer_internal (current);
7833   return workbuf;
7834 }
7835
7836
7837 static void
7838 code_conversion_restore (Lisp_Object arg)
7839 {
7840   Lisp_Object current, workbuf;
7841
7842   current = XCAR (arg);
7843   workbuf = XCDR (arg);
7844   if (! NILP (workbuf))
7845     {
7846       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7847         reused_workbuf_in_use = 0;
7848       else
7849         Fkill_buffer (workbuf);
7850     }
7851   set_buffer_internal (XBUFFER (current));
7852 }
7853
7854 Lisp_Object
7855 code_conversion_save (bool with_work_buf, bool multibyte)
7856 {
7857   Lisp_Object workbuf = Qnil;
7858
7859   if (with_work_buf)
7860     workbuf = make_conversion_work_buffer (multibyte);
7861   record_unwind_protect (code_conversion_restore,
7862                          Fcons (Fcurrent_buffer (), workbuf));
7863   return workbuf;
7864 }
7865
/* Decode, with CODING, the CHARS characters (BYTES bytes) of text
   that sit in the gap of the current buffer, and insert the result
   at point.  The current buffer is both the source and the
   destination; the source position is given as -CHARS / -BYTES,
   i.e. as text in the gap area.

   When the text needs no real decoding (see the fast path below),
   only EOL conversion is done, in place, and the bytes are adopted
   directly with insert_from_gap.  Otherwise decode_coding is run
   and the coding system's post-read function, if any, is called.
   CODING->produced and CODING->produced_char are set in both cases.  */
7866 void
7867 decode_coding_gap (struct coding_system *coding,
7868                    ptrdiff_t chars, ptrdiff_t bytes)
7869 {
7870   ptrdiff_t count = SPECPDL_INDEX ();
7871   Lisp_Object attrs;
7872
7873   coding->src_object = Fcurrent_buffer ();
7874   coding->src_chars = chars;
7875   coding->src_bytes = bytes;
7876   coding->src_pos = -chars;
7877   coding->src_pos_byte = -bytes;
       /* The source counts as multibyte when it has fewer characters
          than bytes.  */
7878   coding->src_multibyte = chars < bytes;
7879   coding->dst_object = coding->src_object;
7880   coding->dst_pos = PT;
7881   coding->dst_pos_byte = PT_BYTE;
7882   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7883
       /* -1 marks these detection results as not yet computed; they are
          tested for being negative below.  */
7884   coding->head_ascii = -1;
7885   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7886   coding->eol_seen = EOL_SEEN_NONE;
7887   if (CODING_REQUIRE_DETECTION (coding))
7888     detect_coding (coding);
7889   attrs = CODING_ID_ATTRS (coding->id);
       /* Fast path: a unibyte source in an ASCII-compatible coding system
          that has neither a post-read function nor a decoding translation
          table.  */
7890   if (! disable_ascii_optimization
7891       && ! coding->src_multibyte
7892       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7893       && NILP (CODING_ATTR_POST_READ (attrs))
7894       && NILP (get_translation_table (attrs, 0, NULL)))
7895     {
           /* From here on CHARS is the character count of the text if the
              fast path applies, or negative if it does not.  */
7896       chars = coding->head_ascii;
7897       if (chars < 0)
7898         chars = check_ascii (coding);
7899       if (chars != bytes)
7900         {
7901           /* There exists a non-ASCII byte.  */
7902           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7903               && coding->detected_utf8_bytes == coding->src_bytes)
7904             {
7905               if (coding->detected_utf8_chars >= 0)
7906                 chars = coding->detected_utf8_chars;
7907               else
7908                 chars = check_utf_8 (coding);
                   /* Drop a leading BOM unless the coding system is
                      declared to be without one.  The text is addressed
                      from the end of the gap below, so shrinking the byte
                      count skips its first three bytes.  */
7909               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7910                   && coding->head_ascii == 0
7911                   && coding->source[0] == UTF_8_BOM_1
7912                   && coding->source[1] == UTF_8_BOM_2
7913                   && coding->source[2] == UTF_8_BOM_3)
7914                 {
7915                   chars--;
7916                   bytes -= 3;
7917                   coding->src_bytes -= 3;
7918                 }
7919             }
7920           else
                 /* Not eligible; use the general decoder below.  */
7921             chars = -1;
7922         }
7923       if (chars >= 0)
7924         {
7925           Lisp_Object eol_type;
7926
7927           eol_type = CODING_ID_EOL_TYPE (coding->id);
               /* A vector-valued EOL type is narrowed according to what
                  has been seen in the text, if anything.  */
7928           if (VECTORP (eol_type))
7929             {
7930               if (coding->eol_seen != EOL_SEEN_NONE)
7931                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7932             }
7933           if (EQ (eol_type, Qmac))
7934             {
                   /* Replace every '\r' by '\n' in place.  */
7935               unsigned char *src_end = GAP_END_ADDR;
7936               unsigned char *src = src_end - coding->src_bytes;
7937
7938               while (src < src_end)
7939                 {
7940                   if (*src++ == '\r')
7941                     src[-1] = '\n';
7942                 }
7943             }
7944           else if (EQ (eol_type, Qdos))
7945             {
                   /* Copy the text backward from the end of the gap,
                      dropping the '\r' of each "\r\n" pair.  The result
                      stays adjacent to the gap end.  */
7946               unsigned char *src = GAP_END_ADDR;
7947               unsigned char *src_beg = src - coding->src_bytes;
7948               unsigned char *dst = src;
7949               ptrdiff_t diff;
7950
7951               while (src_beg < src)
7952                 {
7953                   *--dst = *--src;
7954                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7955                     src--;
7956                 }
                   /* DIFF is the number of '\r' bytes dropped; each was one
                      character as well.  */
7957               diff = dst - src;
7958               bytes -= diff;
7959               chars -= diff;
7960             }
7961           coding->produced = bytes;
7962           coding->produced_char = chars;
               /* The last argument is 1 here; presumably it says that the
                  text is at the end of the gap -- confirm against
                  insert_from_gap.  */
7963           insert_from_gap (chars, bytes, 1);
7964           return;
7965         }
7966     }
       /* General path.  Register an unwind handler that restores the
          current buffer; no work buffer is requested.  */
7967   code_conversion_save (0, 0);
7968
       /* The gap holds all of the text, so this is the last block.  */
7969   coding->mode |= CODING_MODE_LAST_BLOCK;
7970   current_buffer->text->inhibit_shrinking = 1;
7971   decode_coding (coding);
7972   current_buffer->text->inhibit_shrinking = 0;
7973
7974   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7975     {
7976       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7977       Lisp_Object val;
7978
           /* Call the post-read function with point at the start of the
              decoded text and the number of decoded characters as its
              argument.  Its value is only checked for being a natural
              number; the produced counts are adjusted by how much the
              buffer end moved.  */
7979       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7980       val = call1 (CODING_ATTR_POST_READ (attrs),
7981                    make_number (coding->produced_char));
7982       CHECK_NATNUM (val);
7983       coding->produced_char += Z - prev_Z;
7984       coding->produced += Z_BYTE - prev_Z_BYTE;
7985     }
7986
7987   unbind_to (count, Qnil);
7988 }
7989
7990
7991 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7992    SRC_OBJECT into DST_OBJECT by coding context CODING.
7993
7994    SRC_OBJECT is a buffer, a string, or Qnil.
7995
7996    If it is a buffer, the text is at point of the buffer.  FROM and TO
7997    are positions in the buffer.
7998
7999    If it is a string, the text is at the beginning of the string.
8000    FROM and TO are indices to the string.
8001
8002    If it is nil, the text is at coding->source.  FROM and TO are
8003    indices to coding->source.
8004
8005    DST_OBJECT is a buffer, Qt, or Qnil.
8006
8007    If it is a buffer, the decoded text is inserted at point of the
8008    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8009    is deleted.
8010
8011    If it is Qt, a string is made from the decoded text, and
8012    set in CODING->dst_object.
8013
8014    If it is Qnil, the decoded text is stored at CODING->destination.
8015    The caller must allocate CODING->dst_bytes bytes at
8016    CODING->destination by xmalloc.  If the decoded text is longer than
8017    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8018  */
8019
8020 void
8021 decode_coding_object (struct coding_system *coding,
8022                       Lisp_Object src_object,
8023                       ptrdiff_t from, ptrdiff_t from_byte,
8024                       ptrdiff_t to, ptrdiff_t to_byte,
8025                       Lisp_Object dst_object)
8026 {
8027   ptrdiff_t count = SPECPDL_INDEX ();
8028   unsigned char *destination IF_LINT (= NULL);
8029   ptrdiff_t dst_bytes IF_LINT (= 0);
8030   ptrdiff_t chars = to - from;
8031   ptrdiff_t bytes = to_byte - from_byte;
8032   Lisp_Object attrs;
8033   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8034   bool need_marker_adjustment = 0;
8035   Lisp_Object old_deactivate_mark;
8036
8037   old_deactivate_mark = Vdeactivate_mark;
8038
8039   if (NILP (dst_object))
8040     {
8041       destination = coding->destination;
8042       dst_bytes = coding->dst_bytes;
8043     }
8044
8045   coding->src_object = src_object;
8046   coding->src_chars = chars;
8047   coding->src_bytes = bytes;
8048   coding->src_multibyte = chars < bytes;
8049
8050   if (STRINGP (src_object))
8051     {
8052       coding->src_pos = from;
8053       coding->src_pos_byte = from_byte;
8054     }
8055   else if (BUFFERP (src_object))
8056     {
8057       set_buffer_internal (XBUFFER (src_object));
8058       if (from != GPT)
8059         move_gap_both (from, from_byte);
8060       if (EQ (src_object, dst_object))
8061         {
8062           struct Lisp_Marker *tail;
8063
8064           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8065             {
8066               tail->need_adjustment
8067                 = tail->charpos == (tail->insertion_type ? from : to);
8068               need_marker_adjustment |= tail->need_adjustment;
8069             }
8070           saved_pt = PT, saved_pt_byte = PT_BYTE;
8071           TEMP_SET_PT_BOTH (from, from_byte);
8072           current_buffer->text->inhibit_shrinking = 1;
8073           del_range_both (from, from_byte, to, to_byte, 1);
8074           coding->src_pos = -chars;
8075           coding->src_pos_byte = -bytes;
8076         }
8077       else
8078         {
8079           coding->src_pos = from;
8080           coding->src_pos_byte = from_byte;
8081         }
8082     }
8083
8084   if (CODING_REQUIRE_DETECTION (coding))
8085     detect_coding (coding);
8086   attrs = CODING_ID_ATTRS (coding->id);
8087
8088   if (EQ (dst_object, Qt)
8089       || (! NILP (CODING_ATTR_POST_READ (attrs))
8090           && NILP (dst_object)))
8091     {
8092       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8093       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8094       coding->dst_pos = BEG;
8095       coding->dst_pos_byte = BEG_BYTE;
8096     }
8097   else if (BUFFERP (dst_object))
8098     {
8099       code_conversion_save (0, 0);
8100       coding->dst_object = dst_object;
8101       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8102       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8103       coding->dst_multibyte
8104         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8105     }
8106   else
8107     {
8108       code_conversion_save (0, 0);
8109       coding->dst_object = Qnil;
8110       /* Most callers presume this will return a multibyte result, and they
8111          won't use `binary' or `raw-text' anyway, so let's not worry about
8112          CODING_FOR_UNIBYTE.  */
8113       coding->dst_multibyte = 1;
8114     }
8115
8116   decode_coding (coding);
8117
8118   if (BUFFERP (coding->dst_object))
8119     set_buffer_internal (XBUFFER (coding->dst_object));
8120
8121   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8122     {
8123       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8124       Lisp_Object val;
8125
8126       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8127       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8128                         make_number (coding->produced_char));
8129       CHECK_NATNUM (val);
8130       coding->produced_char += Z - prev_Z;
8131       coding->produced += Z_BYTE - prev_Z_BYTE;
8132     }
8133
8134   if (EQ (dst_object, Qt))
8135     {
8136       coding->dst_object = Fbuffer_string ();
8137     }
8138   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8139     {
8140       set_buffer_internal (XBUFFER (coding->dst_object));
8141       if (dst_bytes < coding->produced)
8142         {
8143           eassert (coding->produced > 0);
8144           destination = xrealloc (destination, coding->produced);
8145           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8146             move_gap_both (BEGV, BEGV_BYTE);
8147           memcpy (destination, BEGV_ADDR, coding->produced);
8148           coding->destination = destination;
8149         }
8150     }
8151
8152   if (saved_pt >= 0)
8153     {
8154       /* This is the case of:
8155          (BUFFERP (src_object) && EQ (src_object, dst_object))
8156          As we have moved PT while replacing the original buffer
8157          contents, we must recover it now.  */
8158       set_buffer_internal (XBUFFER (src_object));
8159       current_buffer->text->inhibit_shrinking = 0;
8160       if (saved_pt < from)
8161         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8162       else if (saved_pt < from + chars)
8163         TEMP_SET_PT_BOTH (from, from_byte);
8164       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8165         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8166                           saved_pt_byte + (coding->produced - bytes));
8167       else
8168         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8169                           saved_pt_byte + (coding->produced - bytes));
8170
8171       if (need_marker_adjustment)
8172         {
8173           struct Lisp_Marker *tail;
8174
8175           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8176             if (tail->need_adjustment)
8177               {
8178                 tail->need_adjustment = 0;
8179                 if (tail->insertion_type)
8180                   {
8181                     tail->bytepos = from_byte;
8182                     tail->charpos = from;
8183                   }
8184                 else
8185                   {
8186                     tail->bytepos = from_byte + coding->produced;
8187                     tail->charpos
8188                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8189                          ? tail->bytepos : from + coding->produced_char);
8190                   }
8191               }
8192         }
8193     }
8194
8195   Vdeactivate_mark = old_deactivate_mark;
8196   unbind_to (count, coding->dst_object);
8197 }
8198
8199
8200 void
8201 encode_coding_object (struct coding_system *coding,
8202                       Lisp_Object src_object,
8203                       ptrdiff_t from, ptrdiff_t from_byte,
8204                       ptrdiff_t to, ptrdiff_t to_byte,
8205                       Lisp_Object dst_object)
8206 {
8207   ptrdiff_t count = SPECPDL_INDEX ();
8208   ptrdiff_t chars = to - from;
8209   ptrdiff_t bytes = to_byte - from_byte;
8210   Lisp_Object attrs;
8211   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8212   bool need_marker_adjustment = 0;
8213   bool kill_src_buffer = 0;
8214   Lisp_Object old_deactivate_mark;
8215
8216   old_deactivate_mark = Vdeactivate_mark;
8217
8218   coding->src_object = src_object;
8219   coding->src_chars = chars;
8220   coding->src_bytes = bytes;
8221   coding->src_multibyte = chars < bytes;
8222
8223   attrs = CODING_ID_ATTRS (coding->id);
8224
8225   if (EQ (src_object, dst_object))
8226     {
8227       struct Lisp_Marker *tail;
8228
8229       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8230         {
8231           tail->need_adjustment
8232             = tail->charpos == (tail->insertion_type ? from : to);
8233           need_marker_adjustment |= tail->need_adjustment;
8234         }
8235     }
8236
8237   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8238     {
8239       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8240       set_buffer_internal (XBUFFER (coding->src_object));
8241       if (STRINGP (src_object))
8242         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8243       else if (BUFFERP (src_object))
8244         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8245       else
8246         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8247
8248       if (EQ (src_object, dst_object))
8249         {
8250           set_buffer_internal (XBUFFER (src_object));
8251           saved_pt = PT, saved_pt_byte = PT_BYTE;
8252           del_range_both (from, from_byte, to, to_byte, 1);
8253           set_buffer_internal (XBUFFER (coding->src_object));
8254         }
8255
8256       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8257                   make_number (BEG), make_number (Z));
8258       if (XBUFFER (coding->src_object) != current_buffer)
8259         kill_src_buffer = 1;
8260       coding->src_object = Fcurrent_buffer ();
8261       if (BEG != GPT)
8262         move_gap_both (BEG, BEG_BYTE);
8263       coding->src_chars = Z - BEG;
8264       coding->src_bytes = Z_BYTE - BEG_BYTE;
8265       coding->src_pos = BEG;
8266       coding->src_pos_byte = BEG_BYTE;
8267       coding->src_multibyte = Z < Z_BYTE;
8268     }
8269   else if (STRINGP (src_object))
8270     {
8271       code_conversion_save (0, 0);
8272       coding->src_pos = from;
8273       coding->src_pos_byte = from_byte;
8274     }
8275   else if (BUFFERP (src_object))
8276     {
8277       code_conversion_save (0, 0);
8278       set_buffer_internal (XBUFFER (src_object));
8279       if (EQ (src_object, dst_object))
8280         {
8281           saved_pt = PT, saved_pt_byte = PT_BYTE;
8282           coding->src_object = del_range_1 (from, to, 1, 1);
8283           coding->src_pos = 0;
8284           coding->src_pos_byte = 0;
8285         }
8286       else
8287         {
8288           if (from < GPT && to >= GPT)
8289             move_gap_both (from, from_byte);
8290           coding->src_pos = from;
8291           coding->src_pos_byte = from_byte;
8292         }
8293     }
8294   else
8295     {
8296       code_conversion_save (0, 0);
8297       coding->src_pos = from;
8298       coding->src_pos_byte = from_byte;
8299     }
8300
8301   if (BUFFERP (dst_object))
8302     {
8303       coding->dst_object = dst_object;
8304       if (EQ (src_object, dst_object))
8305         {
8306           coding->dst_pos = from;
8307           coding->dst_pos_byte = from_byte;
8308         }
8309       else
8310         {
8311           struct buffer *current = current_buffer;
8312
8313           set_buffer_temp (XBUFFER (dst_object));
8314           coding->dst_pos = PT;
8315           coding->dst_pos_byte = PT_BYTE;
8316           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8317           set_buffer_temp (current);
8318         }
8319       coding->dst_multibyte
8320         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8321     }
8322   else if (EQ (dst_object, Qt))
8323     {
8324       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8325       coding->dst_object = Qnil;
8326       coding->destination = xmalloc (dst_bytes);
8327       coding->dst_bytes = dst_bytes;
8328       coding->dst_multibyte = 0;
8329     }
8330   else
8331     {
8332       coding->dst_object = Qnil;
8333       coding->dst_multibyte = 0;
8334     }
8335
8336   encode_coding (coding);
8337
8338   if (EQ (dst_object, Qt))
8339     {
8340       if (BUFFERP (coding->dst_object))
8341         coding->dst_object = Fbuffer_string ();
8342       else if (coding->raw_destination)
8343         /* This is used to avoid creating huge Lisp string.
8344            NOTE: caller who sets `raw_destination' is also
8345            responsible for freeing `destination' buffer.  */
8346         coding->dst_object = Qnil;
8347       else
8348         {
8349           coding->dst_object
8350             = make_unibyte_string ((char *) coding->destination,
8351                                    coding->produced);
8352           xfree (coding->destination);
8353         }
8354     }
8355
8356   if (saved_pt >= 0)
8357     {
8358       /* This is the case of:
8359          (BUFFERP (src_object) && EQ (src_object, dst_object))
8360          As we have moved PT while replacing the original buffer
8361          contents, we must recover it now.  */
8362       set_buffer_internal (XBUFFER (src_object));
8363       if (saved_pt < from)
8364         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8365       else if (saved_pt < from + chars)
8366         TEMP_SET_PT_BOTH (from, from_byte);
8367       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8368         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8369                           saved_pt_byte + (coding->produced - bytes));
8370       else
8371         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8372                           saved_pt_byte + (coding->produced - bytes));
8373
8374       if (need_marker_adjustment)
8375         {
8376           struct Lisp_Marker *tail;
8377
8378           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8379             if (tail->need_adjustment)
8380               {
8381                 tail->need_adjustment = 0;
8382                 if (tail->insertion_type)
8383                   {
8384                     tail->bytepos = from_byte;
8385                     tail->charpos = from;
8386                   }
8387                 else
8388                   {
8389                     tail->bytepos = from_byte + coding->produced;
8390                     tail->charpos
8391                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8392                          ? tail->bytepos : from + coding->produced_char);
8393                   }
8394               }
8395         }
8396     }
8397
8398   if (kill_src_buffer)
8399     Fkill_buffer (coding->src_object);
8400
8401   Vdeactivate_mark = old_deactivate_mark;
8402   unbind_to (count, Qnil);
8403 }
8404
8405
8406 Lisp_Object
8407 preferred_coding_system (void)
8408 {
8409   int id = coding_categories[coding_priorities[0]].id;
8410
8411   return CODING_ID_NAME (id);
8412 }
8413
8414 #if defined (WINDOWSNT) || defined (CYGWIN)
8415
8416 Lisp_Object
8417 from_unicode (Lisp_Object str)
8418 {
8419   CHECK_STRING (str);
8420   if (!STRING_MULTIBYTE (str) &&
8421       SBYTES (str) & 1)
8422     {
8423       str = Fsubstring (str, make_number (0), make_number (-1));
8424     }
8425
8426   return code_convert_string_norecord (str, Qutf_16le, 0);
8427 }
8428
8429 Lisp_Object
8430 from_unicode_buffer (const wchar_t *wstr)
8431 {
8432     return from_unicode (
8433         make_unibyte_string (
8434             (char *) wstr,
8435             /* we get one of the two final 0 bytes for free. */
8436             1 + sizeof (wchar_t) * wcslen (wstr)));
8437 }
8438
8439 wchar_t *
8440 to_unicode (Lisp_Object str, Lisp_Object *buf)
8441 {
8442   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8443   /* We need to make another copy (in addition to the one made by
8444      code_convert_string_norecord) to ensure that the final string is
8445      _doubly_ zero terminated --- that is, that the string is
8446      terminated by two zero bytes and one utf-16le null character.
8447      Because strings are already terminated with a single zero byte,
8448      we just add one additional zero. */
8449   str = make_uninit_string (SBYTES (*buf) + 1);
8450   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8451   SDATA (str) [SBYTES (*buf)] = '\0';
8452   *buf = str;
8453   return WCSDATA (*buf);
8454 }
8455
8456 #endif /* WINDOWSNT || CYGWIN */
8457
8458 \f
8459 #ifdef emacs
8460 /*** 11. Emacs Lisp library functions ***/
8461
8462 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8463        doc: /* Return t if OBJECT is nil or a coding-system.
8464 See the documentation of `define-coding-system' for information
8465 about coding-system objects.  */)
8466   (Lisp_Object object)
8467 {
8468   if (NILP (object)
8469       || CODING_SYSTEM_ID (object) >= 0)
8470     return Qt;
8471   if (! SYMBOLP (object)
8472       || NILP (Fget (object, Qcoding_system_define_form)))
8473     return Qnil;
8474   return Qt;
8475 }
8476
8477 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8478        Sread_non_nil_coding_system, 1, 1, 0,
8479        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8480   (Lisp_Object prompt)
8481 {
8482   Lisp_Object val;
8483   do
8484     {
8485       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8486                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8487     }
8488   while (SCHARS (val) == 0);
8489   return (Fintern (val, Qnil));
8490 }
8491
8492 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8493        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8494 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8495 Ignores case when completing coding systems (all Emacs coding systems
8496 are lower-case).  */)
8497   (Lisp_Object prompt, Lisp_Object default_coding_system)
8498 {
8499   Lisp_Object val;
8500   ptrdiff_t count = SPECPDL_INDEX ();
8501
8502   if (SYMBOLP (default_coding_system))
8503     default_coding_system = SYMBOL_NAME (default_coding_system);
8504   specbind (Qcompletion_ignore_case, Qt);
8505   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8506                           Qt, Qnil, Qcoding_system_history,
8507                           default_coding_system, Qnil);
8508   unbind_to (count, Qnil);
8509   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8510 }
8511
8512 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8513        1, 1, 0,
8514        doc: /* Check validity of CODING-SYSTEM.
8515 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8516 It is valid if it is nil or a symbol defined as a coding system by the
8517 function `define-coding-system'.  */)
8518   (Lisp_Object coding_system)
8519 {
8520   Lisp_Object define_form;
8521
8522   define_form = Fget (coding_system, Qcoding_system_define_form);
8523   if (! NILP (define_form))
8524     {
8525       Fput (coding_system, Qcoding_system_define_form, Qnil);
8526       safe_eval (define_form);
8527     }
8528   if (!NILP (Fcoding_system_p (coding_system)))
8529     return coding_system;
8530   xsignal1 (Qcoding_system_error, coding_system);
8531 }
8532
8533 \f
8534 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8535    HIGHEST, return the coding system of the highest
8536    priority among the detected coding systems.  Otherwise return a
8537    list of detected coding systems sorted by their priorities.  If
8538    MULTIBYTEP, it is assumed that the bytes are in correct
8539    multibyte form but contain only ASCII and eight-bit chars.
8540    Otherwise, the bytes are raw bytes.
8541
8542    CODING-SYSTEM controls the detection as below:
8543
8544    If it is nil, detect both text-format and eol-format.  If the
8545    text-format part of CODING-SYSTEM is already specified
8546    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8547    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8548    detect only text-format.
        
        The detection runs in two stages: the first one collects the ids
        of the candidate coding systems in a list, the second one
        replaces each id either by the name of that coding system or by
        the element of its eol-type vector that matches the end-of-line
        format seen in the bytes.  When HIGHEST is set and the list is
        empty, nil is returned.  */
8549
8550 Lisp_Object
8551 detect_coding_system (const unsigned char *src,
8552                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8553                       bool highest, bool multibytep,
8554                       Lisp_Object coding_system)
8555 {
8556   const unsigned char *src_end = src + src_bytes;
8557   Lisp_Object attrs, eol_type;
8558   Lisp_Object val = Qnil;
8559   struct coding_system coding;
8560   ptrdiff_t id;
8561   struct coding_detection_info detect_info;
8562   enum coding_category base_category;
8563   bool null_byte_found = 0, eight_bit_found = 0;
8564
8565   if (NILP (coding_system))
8566     coding_system = Qundecided;
8567   setup_coding_system (coding_system, &coding);
8568   attrs = CODING_ID_ATTRS (coding.id);
8569   eol_type = CODING_ID_EOL_TYPE (coding.id);
       /* NOTE(review): CODING_SYSTEM is not read again in this function
          after this assignment.  */
8570   coding_system = CODING_ATTR_BASE_NAME (attrs);
8571
       /* Point CODING at the whole input and flag it as the last block.  */
8572   coding.source = src;
8573   coding.src_chars = src_chars;
8574   coding.src_bytes = src_bytes;
8575   coding.src_multibyte = multibytep;
8576   coding.consumed = 0;
8577   coding.mode |= CODING_MODE_LAST_BLOCK;
8578   coding.head_ascii = 0;
8579
8580   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8581
8582   /* At first, detect text-format if necessary.  */
8583   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8584   if (base_category == coding_category_undecided)
8585     {
8586       enum coding_category category IF_LINT (= 0);
8587       struct coding_system *this IF_LINT (= NULL);
8588       int c, i;
8589       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8590                                        inhibit_null_byte_detection);
8591       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8592                                        inhibit_iso_escape_detection);
8593       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8594
8595       /* Skip all ASCII bytes except for a few ISO2022 controls.
              coding.head_ascii is incremented for each 7-bit byte that is
              passed before any eight-bit byte has been seen.  The scan
              stops early as soon as both a null byte and an eight-bit
              byte have been seen, or when the ISO-2022 detector reports
              success.  */
8596       for (; src < src_end; src++)
8597         {
8598           c = *src;
8599           if (c & 0x80)
8600             {
8601               eight_bit_found = 1;
8602               if (null_byte_found)
8603                 break;
8604             }
8605           else if (c < 0x20)
8606             {
                   /* The ISO-2022 detector is tried only while no
                      category has been marked as checked yet.  */
8607               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8608                   && ! inhibit_ied
8609                   && ! detect_info.checked)
8610                 {
8611                   if (detect_coding_iso_2022 (&coding, &detect_info))
8612                     {
8613                       /* We have scanned the whole data.  */
8614                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8615                         {
8616                           /* We didn't find an 8-bit code.  We may
8617                              have found a null-byte, but it's very
8618                              rare that a binary file conform to
8619                              ISO-2022.  */
8620                           src = src_end;
8621                           coding.head_ascii = src - coding.source;
8622                         }
                           /* Reject every category outside
                              CATEGORY_MASK_ISO_ESCAPE.  */
8623                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8624                       break;
8625                     }
8626                 }
8627               else if (! c && !inhibit_nbd)
8628                 {
8629                   null_byte_found = 1;
8630                   if (eight_bit_found)
8631                     break;
8632                 }
8633               if (! eight_bit_found)
8634                 coding.head_ascii++;
8635             }
8636           else if (! eight_bit_found)
8637             coding.head_ascii++;
8638         }
8639
           /* Consult the detectors unless the scan above passed over the
              whole input without seeing a null byte or an eight-bit byte
              and without the ISO-2022 detector finding anything.  */
8640       if (null_byte_found || eight_bit_found
8641           || coding.head_ascii < coding.src_bytes
8642           || detect_info.found)
8643         {
8644           if (coding.head_ascii == coding.src_bytes)
8645             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8646             for (i = 0; i < coding_category_raw_text; i++)
8647               {
8648                 category = coding_priorities[i];
8649                 this = coding_categories + category;
8650                 if (detect_info.found & (1 << category))
8651                   break;
8652               }
8653           else
8654             {
8655               if (null_byte_found)
8656                 {
                       /* Mark every category outside CATEGORY_MASK_UTF_16
                          as checked and rejected.  */
8657                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8658                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8659                 }
8660               else if (prefer_utf_8
8661                        && detect_coding_utf_8 (&coding, &detect_info))
8662                 {
                       /* Likewise for every category outside
                          CATEGORY_MASK_UTF_8.  */
8663                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8664                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8665                 }
                   /* Go through the categories in priority order, calling
                      the detector of each category that has not been
                      checked yet.  With HIGHEST, stop at the first
                      category that is found.  */
8666               for (i = 0; i < coding_category_raw_text; i++)
8667                 {
8668                   category = coding_priorities[i];
8669                   this = coding_categories + category;
8670
8671                   if (this->id < 0)
8672                     {
8673                       /* No coding system of this category is defined.  */
8674                       detect_info.rejected |= (1 << category);
8675                     }
8676                   else if (category >= coding_category_raw_text)
8677                     continue;
8678                   else if (detect_info.checked & (1 << category))
8679                     {
8680                       if (highest
8681                           && (detect_info.found & (1 << category)))
8682                         break;
8683                     }
8684                   else if ((*(this->detector)) (&coding, &detect_info)
8685                            && highest
8686                            && (detect_info.found & (1 << category)))
8687                     {
                           /* NOTE(review): CATEGORY is replaced by the
                              concrete byte-order category here, but THIS
                              still points at the utf-16-auto entry, and
                              THIS->id is what ends up in VAL below --
                              confirm that this is intended.  */
8688                       if (category == coding_category_utf_16_auto)
8689                         {
8690                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8691                             category = coding_category_utf_16_le;
8692                           else
8693                             category = coding_category_utf_16_be;
8694                         }
8695                       break;
8696                     }
8697                 }
8698             }
8699         }
8700
           /* Build VAL, the list of candidate coding system ids.  */
8701       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8702           || null_byte_found)
8703         {
               /* Every category was rejected, or a null byte was seen:
                  answer with Qno_conversion.  */
8704           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8705           id = CODING_SYSTEM_ID (Qno_conversion);
8706           val = list1 (make_number (id));
8707         }
8708       else if (! detect_info.rejected && ! detect_info.found)
8709         {
               /* Nothing was rejected and nothing was found: answer with
                  the coding system of the undecided category.  */
8710           detect_info.found = CATEGORY_MASK_ANY;
8711           id = coding_categories[coding_category_undecided].id;
8712           val = list1 (make_number (id));
8713         }
8714       else if (highest)
8715         {
8716           if (detect_info.found)
8717             {
                   /* CATEGORY and THIS are the ones the loops above
                      stopped at.  */
8718               detect_info.found = 1 << category;
8719               val = list1 (make_number (this->id));
8720             }
8721           else
                 /* Nothing was found; take the first category in priority
                    order that was not rejected.  */
8722             for (i = 0; i < coding_category_raw_text; i++)
8723               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8724                 {
8725                   detect_info.found = 1 << coding_priorities[i];
8726                   id = coding_categories[coding_priorities[i]].id;
8727                   val = list1 (make_number (id));
8728                   break;
8729                 }
8730         }
8731       else
8732         {
8733           int mask = detect_info.rejected | detect_info.found;
8734           int found = 0;
8735
               /* Categories that were neither rejected nor found.
                  NOTE(review): each hit overwrites VAL with a fresh
                  one-element list instead of consing onto it, so only the
                  last hit of this loop (the one with the smallest index I)
                  stays in VAL, while all hits are recorded in FOUND --
                  confirm that this is intended.  */
8736           for (i = coding_category_raw_text - 1; i >= 0; i--)
8737             {
8738               category = coding_priorities[i];
8739               if (! (mask & (1 << category)))
8740                 {
8741                   found |= 1 << category;
8742                   id = coding_categories[category].id;
8743                   if (id >= 0)
8744                     val = list1 (make_number (id));
8745                 }
8746             }
               /* Push the categories that were found in front of VAL.  As
                  the loop runs from the largest index down, the element
                  pushed last, for the smallest index, ends up first.  */
8747           for (i = coding_category_raw_text - 1; i >= 0; i--)
8748             {
8749               category = coding_priorities[i];
8750               if (detect_info.found & (1 << category))
8751                 {
8752                   id = coding_categories[category].id;
8753                   val = Fcons (make_number (id), val);
8754                 }
8755             }
8756           detect_info.found |= found;
8757         }
8758     }
8759   else if (base_category == coding_category_utf_8_auto)
8760     {
           /* Only the presence of a signature has to be decided.  */
8761       if (detect_coding_utf_8 (&coding, &detect_info))
8762         {
8763           struct coding_system *this;
8764
8765           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8766             this = coding_categories + coding_category_utf_8_sig;
8767           else
8768             this = coding_categories + coding_category_utf_8_nosig;
8769           val = list1 (make_number (this->id));
8770         }
8771     }
8772   else if (base_category == coding_category_utf_16_auto)
8773     {
           /* Pick one of the four concrete UTF-16 categories from what
              the detector found and rejected.  */
8774       if (detect_coding_utf_16 (&coding, &detect_info))
8775         {
8776           struct coding_system *this;
8777
8778           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8779             this = coding_categories + coding_category_utf_16_le;
8780           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8781             this = coding_categories + coding_category_utf_16_be;
8782           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8783             this = coding_categories + coding_category_utf_16_be_nosig;
8784           else
8785             this = coding_categories + coding_category_utf_16_le_nosig;
8786           val = list1 (make_number (this->id));
8787         }
8788     }
8789   else
8790     {
           /* The text-format is already specified; take it as it is.  */
8791       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8792       val = list1 (make_number (coding.id));
8793     }
8794
8795   /* Then, detect eol-format if necessary.  */
8796   {
         /* -1 stands for "not detected".  */
8797     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8798     Lisp_Object tail;
8799
8800     if (VECTORP (eol_type))
8801       {
             /* The eol-format is not specified.  Detect it separately for
                the UTF-16 categories of each byte order and for all the
                other categories, as far as they are among those found.  */
8802         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8803           {
8804             if (null_byte_found)
8805               normal_eol = EOL_SEEN_LF;
8806             else
8807               normal_eol = detect_eol (coding.source, src_bytes,
8808                                        coding_category_raw_text);
8809           }
8810         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8811                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8812           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8813                                       coding_category_utf_16_be);
8814         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8815                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8816           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8817                                       coding_category_utf_16_le);
8818       }
8819     else
8820       {
             /* The eol-format is specified; use it for every category.  */
8821         if (EQ (eol_type, Qunix))
8822           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8823         else if (EQ (eol_type, Qdos))
8824           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8825         else
8826           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8827       }
8828
         /* Replace each id in VAL: when the eol-type of the coding system
            is a vector, by its element 0, 1 or 2 for LF, CRLF or CR
            respectively, and in all other cases by the name of the
            coding system.  */
8829     for (tail = val; CONSP (tail); tail = XCDR (tail))
8830       {
8831         enum coding_category category;
8832         int this_eol;
8833
8834         id = XINT (XCAR (tail));
8835         attrs = CODING_ID_ATTRS (id);
8836         category = XINT (CODING_ATTR_CATEGORY (attrs));
8837         eol_type = CODING_ID_EOL_TYPE (id);
8838         if (VECTORP (eol_type))
8839           {
8840             if (category == coding_category_utf_16_be
8841                 || category == coding_category_utf_16_be_nosig)
8842               this_eol = utf_16_be_eol;
8843             else if (category == coding_category_utf_16_le
8844                      || category == coding_category_utf_16_le_nosig)
8845               this_eol = utf_16_le_eol;
8846             else
8847               this_eol = normal_eol;
8848
8849             if (this_eol == EOL_SEEN_LF)
8850               XSETCAR (tail, AREF (eol_type, 0));
8851             else if (this_eol == EOL_SEEN_CRLF)
8852               XSETCAR (tail, AREF (eol_type, 1));
8853             else if (this_eol == EOL_SEEN_CR)
8854               XSETCAR (tail, AREF (eol_type, 2));
8855             else
8856               XSETCAR (tail, CODING_ID_NAME (id));
8857           }
8858         else
8859           XSETCAR (tail, CODING_ID_NAME (id));
8860       }
8861   }
8862
8863   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8864 }
8865
8866
8867 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8868        2, 3, 0,
8869        doc: /* Detect coding system of the text in the region between START and END.
8870 Return a list of possible coding systems ordered by priority.
8871 The coding systems to try and their priorities follows what
8872 the function `coding-system-priority-list' (which see) returns.
8873
8874 If only ASCII characters are found (except for such ISO-2022 control
8875 characters as ESC), it returns a list of single element `undecided'
8876 or its subsidiary coding system according to a detected end-of-line
8877 format.
8878
8879 If optional argument HIGHEST is non-nil, return the coding system of
8880 highest priority.  */)
8881   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8882 {
8883   ptrdiff_t from, to;
8884   ptrdiff_t from_byte, to_byte;
8885
8886   validate_region (&start, &end);
8887   from = XINT (start), to = XINT (end);
8888   from_byte = CHAR_TO_BYTE (from);
8889   to_byte = CHAR_TO_BYTE (to);
8890
8891   if (from < GPT && to >= GPT)
8892     move_gap_both (to, to_byte);
8893
8894   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8895                                to - from, to_byte - from_byte,
8896                                !NILP (highest),
8897                                !NILP (BVAR (current_buffer
8898                                       , enable_multibyte_characters)),
8899                                Qnil);
8900 }
8901
8902 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8903        1, 2, 0,
8904        doc: /* Detect coding system of the text in STRING.
8905 Return a list of possible coding systems ordered by priority.
8906 The coding systems to try and their priorities follows what
8907 the function `coding-system-priority-list' (which see) returns.
8908
8909 If only ASCII characters are found (except for such ISO-2022 control
8910 characters as ESC), it returns a list of single element `undecided'
8911 or its subsidiary coding system according to a detected end-of-line
8912 format.
8913
8914 If optional argument HIGHEST is non-nil, return the coding system of
8915 highest priority.  */)
8916   (Lisp_Object string, Lisp_Object highest)
8917 {
8918   CHECK_STRING (string);
8919
8920   return detect_coding_system (SDATA (string),
8921                                SCHARS (string), SBYTES (string),
8922                                !NILP (highest), STRING_MULTIBYTE (string),
8923                                Qnil);
8924 }
8925
8926
8927 static bool
8928 char_encodable_p (int c, Lisp_Object attrs)
8929 {
8930   Lisp_Object tail;
8931   struct charset *charset;
8932   Lisp_Object translation_table;
8933
8934   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8935   if (! NILP (translation_table))
8936     c = translate_char (translation_table, c);
8937   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8938        CONSP (tail); tail = XCDR (tail))
8939     {
8940       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8941       if (CHAR_CHARSET_P (c, charset))
8942         break;
8943     }
8944   return (! NILP (tail));
8945 }
8946
8947
8948 /* Return a list of coding systems that safely encode the text between
8949    START and END.  If EXCLUDE is non-nil, it is a list of coding
8950    systems not to check.  The returned list doesn't contain any such
8951    coding systems.  In any case, if the text contains only ASCII or is
8952    unibyte, return t.  */
8953
8954 DEFUN ("find-coding-systems-region-internal",
8955        Ffind_coding_systems_region_internal,
8956        Sfind_coding_systems_region_internal, 2, 3, 0,
8957        doc: /* Internal use only.  */)
8958   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8959 {
8960   Lisp_Object coding_attrs_list, safe_codings;
8961   ptrdiff_t start_byte, end_byte;
8962   const unsigned char *p, *pbeg, *pend;
8963   int c;
8964   Lisp_Object tail, elt, work_table;
8965
8966   if (STRINGP (start))
8967     {
8968       if (!STRING_MULTIBYTE (start)
8969           || SCHARS (start) == SBYTES (start))
8970         return Qt;
8971       start_byte = 0;
8972       end_byte = SBYTES (start);
8973     }
8974   else
8975     {
8976       CHECK_NUMBER_COERCE_MARKER (start);
8977       CHECK_NUMBER_COERCE_MARKER (end);
8978       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8979         args_out_of_range (start, end);
8980       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8981         return Qt;
8982       start_byte = CHAR_TO_BYTE (XINT (start));
8983       end_byte = CHAR_TO_BYTE (XINT (end));
8984       if (XINT (end) - XINT (start) == end_byte - start_byte)
8985         return Qt;
8986
8987       if (XINT (start) < GPT && XINT (end) > GPT)
8988         {
8989           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8990             move_gap_both (XINT (start), start_byte);
8991           else
8992             move_gap_both (XINT (end), end_byte);
8993         }
8994     }
8995
8996   coding_attrs_list = Qnil;
8997   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8998     if (NILP (exclude)
8999         || NILP (Fmemq (XCAR (tail), exclude)))
9000       {
9001         Lisp_Object attrs;
9002
9003         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9004         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9005           {
9006             ASET (attrs, coding_attr_trans_tbl,
9007                   get_translation_table (attrs, 1, NULL));
9008             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9009           }
9010       }
9011
9012   if (STRINGP (start))
9013     p = pbeg = SDATA (start);
9014   else
9015     p = pbeg = BYTE_POS_ADDR (start_byte);
9016   pend = p + (end_byte - start_byte);
9017
9018   while (p < pend && ASCII_CHAR_P (*p)) p++;
9019   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9020
9021   work_table = Fmake_char_table (Qnil, Qnil);
9022   while (p < pend)
9023     {
9024       if (ASCII_CHAR_P (*p))
9025         p++;
9026       else
9027         {
9028           c = STRING_CHAR_ADVANCE (p);
9029           if (!NILP (char_table_ref (work_table, c)))
9030             /* This character was already checked.  Ignore it.  */
9031             continue;
9032
9033           charset_map_loaded = 0;
9034           for (tail = coding_attrs_list; CONSP (tail);)
9035             {
9036               elt = XCAR (tail);
9037               if (NILP (elt))
9038                 tail = XCDR (tail);
9039               else if (char_encodable_p (c, elt))
9040                 tail = XCDR (tail);
9041               else if (CONSP (XCDR (tail)))
9042                 {
9043                   XSETCAR (tail, XCAR (XCDR (tail)));
9044                   XSETCDR (tail, XCDR (XCDR (tail)));
9045                 }
9046               else
9047                 {
9048                   XSETCAR (tail, Qnil);
9049                   tail = XCDR (tail);
9050                 }
9051             }
9052           if (charset_map_loaded)
9053             {
9054               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9055
9056               if (STRINGP (start))
9057                 pbeg = SDATA (start);
9058               else
9059                 pbeg = BYTE_POS_ADDR (start_byte);
9060               p = pbeg + p_offset;
9061               pend = pbeg + pend_offset;
9062             }
9063           char_table_set (work_table, c, Qt);
9064         }
9065     }
9066
9067   safe_codings = list2 (Qraw_text, Qno_conversion);
9068   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9069     if (! NILP (XCAR (tail)))
9070       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9071
9072   return safe_codings;
9073 }
9074
9075
9076 DEFUN ("unencodable-char-position", Funencodable_char_position,
9077        Sunencodable_char_position, 3, 5, 0,
9078        doc: /* Return position of first un-encodable character in a region.
9079 START and END specify the region and CODING-SYSTEM specifies the
9080 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9081
9082 If optional 4th argument COUNT is non-nil, it specifies at most how
9083 many un-encodable characters to search.  In this case, the value is a
9084 list of positions.
9085
9086 If optional 5th argument STRING is non-nil, it is a string to search
9087 for un-encodable characters.  In that case, START and END are indexes
9088 to the string and treated as in `substring'.  */)
9089   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9090    Lisp_Object count, Lisp_Object string)
9091 {
9092   EMACS_INT n;
9093   struct coding_system coding;
9094   Lisp_Object attrs, charset_list, translation_table;
9095   Lisp_Object positions;
9096   ptrdiff_t from, to;
9097   const unsigned char *p, *stop, *pend;
9098   bool ascii_compatible;
9099
9100   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9101   attrs = CODING_ID_ATTRS (coding.id);
9102   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9103     return Qnil;
9104   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9105   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9106   translation_table = get_translation_table (attrs, 1, NULL);
9107
9108   if (NILP (string))
9109     {
9110       validate_region (&start, &end);
9111       from = XINT (start);
9112       to = XINT (end);
9113       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9114           || (ascii_compatible
9115               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9116         return Qnil;
9117       p = CHAR_POS_ADDR (from);
9118       pend = CHAR_POS_ADDR (to);
9119       if (from < GPT && to >= GPT)
9120         stop = GPT_ADDR;
9121       else
9122         stop = pend;
9123     }
9124   else
9125     {
9126       CHECK_STRING (string);
9127       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9128       if (! STRING_MULTIBYTE (string))
9129         return Qnil;
9130       p = SDATA (string) + string_char_to_byte (string, from);
9131       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9132       if (ascii_compatible && (to - from) == (pend - p))
9133         return Qnil;
9134     }
9135
9136   if (NILP (count))
9137     n = 1;
9138   else
9139     {
9140       CHECK_NATNUM (count);
9141       n = XINT (count);
9142     }
9143
9144   positions = Qnil;
9145   charset_map_loaded = 0;
9146   while (1)
9147     {
9148       int c;
9149
9150       if (ascii_compatible)
9151         while (p < stop && ASCII_CHAR_P (*p))
9152           p++, from++;
9153       if (p >= stop)
9154         {
9155           if (p >= pend)
9156             break;
9157           stop = pend;
9158           p = GAP_END_ADDR;
9159         }
9160
9161       c = STRING_CHAR_ADVANCE (p);
9162       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9163           && ! char_charset (translate_char (translation_table, c),
9164                              charset_list, NULL))
9165         {
9166           positions = Fcons (make_number (from), positions);
9167           n--;
9168           if (n == 0)
9169             break;
9170         }
9171
9172       from++;
9173       if (charset_map_loaded && NILP (string))
9174         {
9175           p = CHAR_POS_ADDR (from);
9176           pend = CHAR_POS_ADDR (to);
9177           if (from < GPT && to >= GPT)
9178             stop = GPT_ADDR;
9179           else
9180             stop = pend;
9181           charset_map_loaded = 0;
9182         }
9183     }
9184
9185   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9186 }
9187
9188
9189 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9190        Scheck_coding_systems_region, 3, 3, 0,
9191        doc: /* Check if the region is encodable by coding systems.
9192
9193 START and END are buffer positions specifying the region.
9194 CODING-SYSTEM-LIST is a list of coding systems to check.
9195
9196 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9197 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9198 whole region, POS0, POS1, ... are buffer positions where non-encodable
9199 characters are found.
9200
9201 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9202 value is nil.
9203
9204 START may be a string.  In that case, check if the string is
9205 encodable, and the value contains indices to the string instead of
9206 buffer positions.  END is ignored.
9207
9208 If the current buffer (or START if it is a string) is unibyte, the value
9209 is nil.  */)
9210   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9211 {
9212   Lisp_Object list;
9213   ptrdiff_t start_byte, end_byte;
9214   ptrdiff_t pos;
9215   const unsigned char *p, *pbeg, *pend;
9216   int c;
9217   Lisp_Object tail, elt, attrs;
9218
9219   if (STRINGP (start))
9220     {
9221       if (!STRING_MULTIBYTE (start)
9222           || SCHARS (start) == SBYTES (start))
9223         return Qnil;
9224       start_byte = 0;
9225       end_byte = SBYTES (start);
9226       pos = 0;
9227     }
9228   else
9229     {
9230       CHECK_NUMBER_COERCE_MARKER (start);
9231       CHECK_NUMBER_COERCE_MARKER (end);
9232       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9233         args_out_of_range (start, end);
9234       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9235         return Qnil;
9236       start_byte = CHAR_TO_BYTE (XINT (start));
9237       end_byte = CHAR_TO_BYTE (XINT (end));
9238       if (XINT (end) - XINT (start) == end_byte - start_byte)
9239         return Qnil;
9240
9241       if (XINT (start) < GPT && XINT (end) > GPT)
9242         {
9243           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9244             move_gap_both (XINT (start), start_byte);
9245           else
9246             move_gap_both (XINT (end), end_byte);
9247         }
9248       pos = XINT (start);
9249     }
9250
9251   list = Qnil;
9252   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9253     {
9254       elt = XCAR (tail);
9255       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9256       ASET (attrs, coding_attr_trans_tbl,
9257             get_translation_table (attrs, 1, NULL));
9258       list = Fcons (list2 (elt, attrs), list);
9259     }
9260
9261   if (STRINGP (start))
9262     p = pbeg = SDATA (start);
9263   else
9264     p = pbeg = BYTE_POS_ADDR (start_byte);
9265   pend = p + (end_byte - start_byte);
9266
9267   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9268   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9269
9270   while (p < pend)
9271     {
9272       if (ASCII_CHAR_P (*p))
9273         p++;
9274       else
9275         {
9276           c = STRING_CHAR_ADVANCE (p);
9277
9278           charset_map_loaded = 0;
9279           for (tail = list; CONSP (tail); tail = XCDR (tail))
9280             {
9281               elt = XCDR (XCAR (tail));
9282               if (! char_encodable_p (c, XCAR (elt)))
9283                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9284             }
9285           if (charset_map_loaded)
9286             {
9287               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9288
9289               if (STRINGP (start))
9290                 pbeg = SDATA (start);
9291               else
9292                 pbeg = BYTE_POS_ADDR (start_byte);
9293               p = pbeg + p_offset;
9294               pend = pbeg + pend_offset;
9295             }
9296         }
9297       pos++;
9298     }
9299
9300   tail = list;
9301   list = Qnil;
9302   for (; CONSP (tail); tail = XCDR (tail))
9303     {
9304       elt = XCAR (tail);
9305       if (CONSP (XCDR (XCDR (elt))))
9306         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9307                       list);
9308     }
9309
9310   return list;
9311 }
9312
9313
9314 static Lisp_Object
9315 code_convert_region (Lisp_Object start, Lisp_Object end,
9316                      Lisp_Object coding_system, Lisp_Object dst_object,
9317                      bool encodep, bool norecord)
9318 {
9319   struct coding_system coding;
9320   ptrdiff_t from, from_byte, to, to_byte;
9321   Lisp_Object src_object;
9322
9323   if (NILP (coding_system))
9324     coding_system = Qno_conversion;
9325   else
9326     CHECK_CODING_SYSTEM (coding_system);
9327   src_object = Fcurrent_buffer ();
9328   if (NILP (dst_object))
9329     dst_object = src_object;
9330   else if (! EQ (dst_object, Qt))
9331     CHECK_BUFFER (dst_object);
9332
9333   validate_region (&start, &end);
9334   from = XFASTINT (start);
9335   from_byte = CHAR_TO_BYTE (from);
9336   to = XFASTINT (end);
9337   to_byte = CHAR_TO_BYTE (to);
9338
9339   setup_coding_system (coding_system, &coding);
9340   coding.mode |= CODING_MODE_LAST_BLOCK;
9341
9342   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9343     {
9344       struct buffer *buf = XBUFFER (dst_object);
9345       ptrdiff_t buf_pt = BUF_PT (buf);
9346
9347       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9348     }
9349
9350   if (encodep)
9351     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9352                           dst_object);
9353   else
9354     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9355                           dst_object);
9356   if (! norecord)
9357     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9358
9359   return (BUFFERP (dst_object)
9360           ? make_number (coding.produced_char)
9361           : coding.dst_object);
9362 }
9363
9364
9365 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9366        3, 4, "r\nzCoding system: ",
9367        doc: /* Decode the current region from the specified coding system.
9368 When called from a program, takes four arguments:
9369         START, END, CODING-SYSTEM, and DESTINATION.
9370 START and END are buffer positions.
9371
9372 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9373 If nil, the region between START and END is replaced by the decoded text.
9374 If buffer, the decoded text is inserted in that buffer after point (point
9375 does not move).
9376 In those cases, the length of the decoded text is returned.
9377 If DESTINATION is t, the decoded text is returned.
9378
9379 This function sets `last-coding-system-used' to the precise coding system
9380 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9381 not fully specified.)  */)
9382   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9383 {
9384   return code_convert_region (start, end, coding_system, destination, 0, 0);
9385 }
9386
9387 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9388        3, 4, "r\nzCoding system: ",
9389        doc: /* Encode the current region by specified coding system.
9390 When called from a program, takes four arguments:
9391         START, END, CODING-SYSTEM and DESTINATION.
9392 START and END are buffer positions.
9393
9394 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9395 If nil, the region between START and END is replace by the encoded text.
9396 If buffer, the encoded text is inserted in that buffer after point (point
9397 does not move).
9398 In those cases, the length of the encoded text is returned.
9399 If DESTINATION is t, the encoded text is returned.
9400
9401 This function sets `last-coding-system-used' to the precise coding system
9402 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9403 not fully specified.)  */)
9404   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9405 {
9406   return code_convert_region (start, end, coding_system, destination, 1, 0);
9407 }
9408
9409 Lisp_Object
9410 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9411                      Lisp_Object dst_object, bool encodep, bool nocopy,
9412                      bool norecord)
9413 {
9414   struct coding_system coding;
9415   ptrdiff_t chars, bytes;
9416
9417   CHECK_STRING (string);
9418   if (NILP (coding_system))
9419     {
9420       if (! norecord)
9421         Vlast_coding_system_used = Qno_conversion;
9422       if (NILP (dst_object))
9423         return (nocopy ? Fcopy_sequence (string) : string);
9424     }
9425
9426   if (NILP (coding_system))
9427     coding_system = Qno_conversion;
9428   else
9429     CHECK_CODING_SYSTEM (coding_system);
9430   if (NILP (dst_object))
9431     dst_object = Qt;
9432   else if (! EQ (dst_object, Qt))
9433     CHECK_BUFFER (dst_object);
9434
9435   setup_coding_system (coding_system, &coding);
9436   coding.mode |= CODING_MODE_LAST_BLOCK;
9437   chars = SCHARS (string);
9438   bytes = SBYTES (string);
9439
9440   if (BUFFERP (dst_object))
9441     {
9442       struct buffer *buf = XBUFFER (dst_object);
9443       ptrdiff_t buf_pt = BUF_PT (buf);
9444
9445       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9446     }
9447
9448   if (encodep)
9449     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9450   else
9451     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9452   if (! norecord)
9453     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9454
9455   return (BUFFERP (dst_object)
9456           ? make_number (coding.produced_char)
9457           : coding.dst_object);
9458 }
9459
9460
9461 /* Encode or decode STRING according to CODING_SYSTEM.
9462    Do not set Vlast_coding_system_used.
9463
9464    This function is called only from macros DECODE_FILE and
9465    ENCODE_FILE, thus we ignore character composition.  */
9466
9467 Lisp_Object
9468 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9469                               bool encodep)
9470 {
9471   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9472 }
9473
9474 /* Encode or decode a file name, to or from a unibyte string suitable
9475    for passing to C library functions.  */
9476 Lisp_Object
9477 decode_file_name (Lisp_Object fname)
9478 {
9479 #ifdef WINDOWSNT
9480   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9481      converts the file names either to UTF-16LE or to the system ANSI
9482      codepage internally, depending on the underlying OS; see w32.c.  */
9483   if (! NILP (Fcoding_system_p (Qutf_8)))
9484     return code_convert_string_norecord (fname, Qutf_8, 0);
9485   return fname;
9486 #else  /* !WINDOWSNT */
9487   if (! NILP (Vfile_name_coding_system))
9488     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9489   else if (! NILP (Vdefault_file_name_coding_system))
9490     return code_convert_string_norecord (fname,
9491                                          Vdefault_file_name_coding_system, 0);
9492   else
9493     return fname;
9494 #endif
9495 }
9496
9497 Lisp_Object
9498 encode_file_name (Lisp_Object fname)
9499 {
9500   /* This is especially important during bootstrap and dumping, when
9501      file-name encoding is not yet known, and therefore any non-ASCII
9502      file names are unibyte strings, and could only be thrashed if we
9503      try to encode them.  */
9504   if (!STRING_MULTIBYTE (fname))
9505     return fname;
9506 #ifdef WINDOWSNT
9507   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9508      converts the file names either to UTF-16LE or to the system ANSI
9509      codepage internally, depending on the underlying OS; see w32.c.  */
9510   if (! NILP (Fcoding_system_p (Qutf_8)))
9511     return code_convert_string_norecord (fname, Qutf_8, 1);
9512   return fname;
9513 #else  /* !WINDOWSNT */
9514   if (! NILP (Vfile_name_coding_system))
9515     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9516   else if (! NILP (Vdefault_file_name_coding_system))
9517     return code_convert_string_norecord (fname,
9518                                          Vdefault_file_name_coding_system, 1);
9519   else
9520     return fname;
9521 #endif
9522 }
9523
9524 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9525        2, 4, 0,
9526        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9527
9528 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9529 if the decoding operation is trivial.
9530
9531 Optional fourth arg BUFFER non-nil means that the decoded text is
9532 inserted in that buffer after point (point does not move).  In this
9533 case, the return value is the length of the decoded text.
9534
9535 This function sets `last-coding-system-used' to the precise coding system
9536 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9537 not fully specified.)  */)
9538   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9539 {
9540   return code_convert_string (string, coding_system, buffer,
9541                               0, ! NILP (nocopy), 0);
9542 }
9543
9544 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9545        2, 4, 0,
9546        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9547
9548 Optional third arg NOCOPY non-nil means it is OK to return STRING
9549 itself if the encoding operation is trivial.
9550
9551 Optional fourth arg BUFFER non-nil means that the encoded text is
9552 inserted in that buffer after point (point does not move).  In this
9553 case, the return value is the length of the encoded text.
9554
9555 This function sets `last-coding-system-used' to the precise coding system
9556 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9557 not fully specified.)  */)
9558   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9559 {
9560   return code_convert_string (string, coding_system, buffer,
9561                               1, ! NILP (nocopy), 0);
9562 }
9563
9564 \f
9565 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9566        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9567 Return the corresponding character.  */)
9568   (Lisp_Object code)
9569 {
9570   Lisp_Object spec, attrs, val;
9571   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9572   EMACS_INT ch;
9573   int c;
9574
9575   CHECK_NATNUM (code);
9576   ch = XFASTINT (code);
9577   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9578   attrs = AREF (spec, 0);
9579
9580   if (ASCII_CHAR_P (ch)
9581       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9582     return code;
9583
9584   val = CODING_ATTR_CHARSET_LIST (attrs);
9585   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9586   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9587   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9588
9589   if (ch <= 0x7F)
9590     {
9591       c = ch;
9592       charset = charset_roman;
9593     }
9594   else if (ch >= 0xA0 && ch < 0xDF)
9595     {
9596       c = ch - 0x80;
9597       charset = charset_kana;
9598     }
9599   else
9600     {
9601       EMACS_INT c1 = ch >> 8;
9602       int c2 = ch & 0xFF;
9603
9604       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9605           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9606         error ("Invalid code: %"pI"d", ch);
9607       c = ch;
9608       SJIS_TO_JIS (c);
9609       charset = charset_kanji;
9610     }
9611   c = DECODE_CHAR (charset, c);
9612   if (c < 0)
9613     error ("Invalid code: %"pI"d", ch);
9614   return make_number (c);
9615 }
9616
9617
9618 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9619        doc: /* Encode a Japanese character CH to shift_jis encoding.
9620 Return the corresponding code in SJIS.  */)
9621   (Lisp_Object ch)
9622 {
9623   Lisp_Object spec, attrs, charset_list;
9624   int c;
9625   struct charset *charset;
9626   unsigned code;
9627
9628   CHECK_CHARACTER (ch);
9629   c = XFASTINT (ch);
9630   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9631   attrs = AREF (spec, 0);
9632
9633   if (ASCII_CHAR_P (c)
9634       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9635     return ch;
9636
9637   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9638   charset = char_charset (c, charset_list, &code);
9639   if (code == CHARSET_INVALID_CODE (charset))
9640     error ("Can't encode by shift_jis encoding: %c", c);
9641   JIS_TO_SJIS (code);
9642
9643   return make_number (code);
9644 }
9645
9646 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9647        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9648 Return the corresponding character.  */)
9649   (Lisp_Object code)
9650 {
9651   Lisp_Object spec, attrs, val;
9652   struct charset *charset_roman, *charset_big5, *charset;
9653   EMACS_INT ch;
9654   int c;
9655
9656   CHECK_NATNUM (code);
9657   ch = XFASTINT (code);
9658   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9659   attrs = AREF (spec, 0);
9660
9661   if (ASCII_CHAR_P (ch)
9662       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9663     return code;
9664
9665   val = CODING_ATTR_CHARSET_LIST (attrs);
9666   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9667   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9668
9669   if (ch <= 0x7F)
9670     {
9671       c = ch;
9672       charset = charset_roman;
9673     }
9674   else
9675     {
9676       EMACS_INT b1 = ch >> 8;
9677       int b2 = ch & 0x7F;
9678       if (b1 < 0xA1 || b1 > 0xFE
9679           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9680         error ("Invalid code: %"pI"d", ch);
9681       c = ch;
9682       charset = charset_big5;
9683     }
9684   c = DECODE_CHAR (charset, c);
9685   if (c < 0)
9686     error ("Invalid code: %"pI"d", ch);
9687   return make_number (c);
9688 }
9689
9690 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9691        doc: /* Encode the Big5 character CH to BIG5 coding system.
9692 Return the corresponding character code in Big5.  */)
9693   (Lisp_Object ch)
9694 {
9695   Lisp_Object spec, attrs, charset_list;
9696   struct charset *charset;
9697   int c;
9698   unsigned code;
9699
9700   CHECK_CHARACTER (ch);
9701   c = XFASTINT (ch);
9702   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9703   attrs = AREF (spec, 0);
9704   if (ASCII_CHAR_P (c)
9705       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9706     return ch;
9707
9708   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9709   charset = char_charset (c, charset_list, &code);
9710   if (code == CHARSET_INVALID_CODE (charset))
9711     error ("Can't encode by Big5 encoding: %c", c);
9712
9713   return make_number (code);
9714 }
9715
9716 \f
9717 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9718        Sset_terminal_coding_system_internal, 1, 2, 0,
9719        doc: /* Internal use only.  */)
9720   (Lisp_Object coding_system, Lisp_Object terminal)
9721 {
9722   struct terminal *term = decode_live_terminal (terminal);
9723   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9724   CHECK_SYMBOL (coding_system);
9725   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9726   /* We had better not send unsafe characters to terminal.  */
9727   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9728   /* Character composition should be disabled.  */
9729   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9730   terminal_coding->src_multibyte = 1;
9731   terminal_coding->dst_multibyte = 0;
9732   tset_charset_list
9733     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9734             ? coding_charset_list (terminal_coding)
9735             : list1 (make_number (charset_ascii))));
9736   return Qnil;
9737 }
9738
9739 DEFUN ("set-safe-terminal-coding-system-internal",
9740        Fset_safe_terminal_coding_system_internal,
9741        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9742        doc: /* Internal use only.  */)
9743   (Lisp_Object coding_system)
9744 {
9745   CHECK_SYMBOL (coding_system);
9746   setup_coding_system (Fcheck_coding_system (coding_system),
9747                        &safe_terminal_coding);
9748   /* Character composition should be disabled.  */
9749   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9750   safe_terminal_coding.src_multibyte = 1;
9751   safe_terminal_coding.dst_multibyte = 0;
9752   return Qnil;
9753 }
9754
9755 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9756        Sterminal_coding_system, 0, 1, 0,
9757        doc: /* Return coding system specified for terminal output on the given terminal.
9758 TERMINAL may be a terminal object, a frame, or nil for the selected
9759 frame's terminal device.  */)
9760   (Lisp_Object terminal)
9761 {
9762   struct coding_system *terminal_coding
9763     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9764   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9765
9766   /* For backward compatibility, return nil if it is `undecided'.  */
9767   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9768 }
9769
9770 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9771        Sset_keyboard_coding_system_internal, 1, 2, 0,
9772        doc: /* Internal use only.  */)
9773   (Lisp_Object coding_system, Lisp_Object terminal)
9774 {
9775   struct terminal *t = decode_live_terminal (terminal);
9776   CHECK_SYMBOL (coding_system);
9777   if (NILP (coding_system))
9778     coding_system = Qno_conversion;
9779   else
9780     Fcheck_coding_system (coding_system);
9781   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9782   /* Character composition should be disabled.  */
9783   TERMINAL_KEYBOARD_CODING (t)->common_flags
9784     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9785   return Qnil;
9786 }
9787
9788 DEFUN ("keyboard-coding-system",
9789        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9790        doc: /* Return coding system specified for decoding keyboard input.  */)
9791   (Lisp_Object terminal)
9792 {
9793   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9794                          (decode_live_terminal (terminal))->id);
9795 }
9796
9797 \f
9798 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9799        Sfind_operation_coding_system,  1, MANY, 0,
9800        doc: /* Choose a coding system for an operation based on the target name.
9801 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9802 DECODING-SYSTEM is the coding system to use for decoding
9803 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9804 for encoding (in case OPERATION does encoding).
9805
9806 The first argument OPERATION specifies an I/O primitive:
9807   For file I/O, `insert-file-contents' or `write-region'.
9808   For process I/O, `call-process', `call-process-region', or `start-process'.
9809   For network I/O, `open-network-stream'.
9810
9811 The remaining arguments should be the same arguments that were passed
9812 to the primitive.  Depending on which primitive, one of those arguments
9813 is selected as the TARGET.  For example, if OPERATION does file I/O,
9814 whichever argument specifies the file name is TARGET.
9815
9816 TARGET has a meaning which depends on OPERATION:
9817   For file I/O, TARGET is a file name (except for the special case below).
9818   For process I/O, TARGET is a process name.
9819   For network I/O, TARGET is a service name or a port number.
9820
9821 This function looks up what is specified for TARGET in
9822 `file-coding-system-alist', `process-coding-system-alist',
9823 or `network-coding-system-alist' depending on OPERATION.
9824 They may specify a coding system, a cons of coding systems,
9825 or a function symbol to call.
9826 In the last case, we call the function with one argument,
9827 which is a list of all the arguments given to this function.
9828 If the function can't decide a coding system, it can return
9829 `undecided' so that the normal code-detection is performed.
9830
9831 If OPERATION is `insert-file-contents', the argument corresponding to
9832 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9833 file name to look up, and BUFFER is a buffer that contains the file's
9834 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9835 function to call for FILENAME, that function should examine the
9836 contents of BUFFER instead of reading the file.
9837
9838 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9839   (ptrdiff_t nargs, Lisp_Object *args)
9840 {
9841   Lisp_Object operation, target_idx, target, val;
9842   register Lisp_Object chain;
9843
9844   if (nargs < 2)
9845     error ("Too few arguments");
9846   operation = args[0];
9847   if (!SYMBOLP (operation)
9848       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9849     error ("Invalid first argument");
9850   if (nargs <= 1 + XFASTINT (target_idx))
9851     error ("Too few arguments for operation `%s'",
9852            SDATA (SYMBOL_NAME (operation)));
9853   target = args[XFASTINT (target_idx) + 1];
9854   if (!(STRINGP (target)
9855         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9856             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9857         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9858     error ("Invalid argument %"pI"d of operation `%s'",
9859            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9860   if (CONSP (target))
9861     target = XCAR (target);
9862
9863   chain = ((EQ (operation, Qinsert_file_contents)
9864             || EQ (operation, Qwrite_region))
9865            ? Vfile_coding_system_alist
9866            : (EQ (operation, Qopen_network_stream)
9867               ? Vnetwork_coding_system_alist
9868               : Vprocess_coding_system_alist));
9869   if (NILP (chain))
9870     return Qnil;
9871
9872   for (; CONSP (chain); chain = XCDR (chain))
9873     {
9874       Lisp_Object elt;
9875
9876       elt = XCAR (chain);
9877       if (CONSP (elt)
9878           && ((STRINGP (target)
9879                && STRINGP (XCAR (elt))
9880                && fast_string_match (XCAR (elt), target) >= 0)
9881               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9882         {
9883           val = XCDR (elt);
9884           /* Here, if VAL is both a valid coding system and a valid
9885              function symbol, we return VAL as a coding system.  */
9886           if (CONSP (val))
9887             return val;
9888           if (! SYMBOLP (val))
9889             return Qnil;
9890           if (! NILP (Fcoding_system_p (val)))
9891             return Fcons (val, val);
9892           if (! NILP (Ffboundp (val)))
9893             {
9894               /* We use call1 rather than safe_call1
9895                  so as to get bug reports about functions called here
9896                  which don't handle the current interface.  */
9897               val = call1 (val, Flist (nargs, args));
9898               if (CONSP (val))
9899                 return val;
9900               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9901                 return Fcons (val, val);
9902             }
9903           return Qnil;
9904         }
9905     }
9906   return Qnil;
9907 }
9908
9909 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9910        Sset_coding_system_priority, 0, MANY, 0,
9911        doc: /* Assign higher priority to the coding systems given as arguments.
9912 If multiple coding systems belong to the same category,
9913 all but the first one are ignored.
9914
9915 usage: (set-coding-system-priority &rest coding-systems)  */)
9916   (ptrdiff_t nargs, Lisp_Object *args)
9917 {
9918   ptrdiff_t i, j;
9919   bool changed[coding_category_max];
9920   enum coding_category priorities[coding_category_max];
9921
9922   memset (changed, 0, sizeof changed);
9923
9924   for (i = j = 0; i < nargs; i++)
9925     {
9926       enum coding_category category;
9927       Lisp_Object spec, attrs;
9928
9929       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9930       attrs = AREF (spec, 0);
9931       category = XINT (CODING_ATTR_CATEGORY (attrs));
9932       if (changed[category])
9933         /* Ignore this coding system because a coding system of the
9934            same category already had a higher priority.  */
9935         continue;
9936       changed[category] = 1;
9937       priorities[j++] = category;
9938       if (coding_categories[category].id >= 0
9939           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9940         setup_coding_system (args[i], &coding_categories[category]);
9941       Fset (AREF (Vcoding_category_table, category), args[i]);
9942     }
9943
9944   /* Now we have decided top J priorities.  Reflect the order of the
9945      original priorities to the remaining priorities.  */
9946
9947   for (i = j, j = 0; i < coding_category_max; i++, j++)
9948     {
9949       while (j < coding_category_max
9950              && changed[coding_priorities[j]])
9951         j++;
9952       if (j == coding_category_max)
9953         emacs_abort ();
9954       priorities[i] = coding_priorities[j];
9955     }
9956
9957   memcpy (coding_priorities, priorities, sizeof priorities);
9958
9959   /* Update `coding-category-list'.  */
9960   Vcoding_category_list = Qnil;
9961   for (i = coding_category_max; i-- > 0; )
9962     Vcoding_category_list
9963       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9964                Vcoding_category_list);
9965
9966   return Qnil;
9967 }
9968
9969 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9970        Scoding_system_priority_list, 0, 1, 0,
9971        doc: /* Return a list of coding systems ordered by their priorities.
9972 The list contains a subset of coding systems; i.e. coding systems
9973 assigned to each coding category (see `coding-category-list').
9974
9975 HIGHESTP non-nil means just return the highest priority one.  */)
9976   (Lisp_Object highestp)
9977 {
9978   int i;
9979   Lisp_Object val;
9980
9981   for (i = 0, val = Qnil; i < coding_category_max; i++)
9982     {
9983       enum coding_category category = coding_priorities[i];
9984       int id = coding_categories[category].id;
9985       Lisp_Object attrs;
9986
9987       if (id < 0)
9988         continue;
9989       attrs = CODING_ID_ATTRS (id);
9990       if (! NILP (highestp))
9991         return CODING_ATTR_BASE_NAME (attrs);
9992       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9993     }
9994   return Fnreverse (val);
9995 }
9996
9997 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9998
9999 static Lisp_Object
10000 make_subsidiaries (Lisp_Object base)
10001 {
10002   Lisp_Object subsidiaries;
10003   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10004   USE_SAFE_ALLOCA;
10005   char *buf = SAFE_ALLOCA (base_name_len + 6);
10006   int i;
10007
10008   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10009   subsidiaries = make_uninit_vector (3);
10010   for (i = 0; i < 3; i++)
10011     {
10012       strcpy (buf + base_name_len, suffixes[i]);
10013       ASET (subsidiaries, i, intern (buf));
10014     }
10015   SAFE_FREE ();
10016   return subsidiaries;
10017 }
10018
10019
10020 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10021        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10022        doc: /* For internal use only.
10023 usage: (define-coding-system-internal ...)  */)
10024   (ptrdiff_t nargs, Lisp_Object *args)
10025 {
10026   Lisp_Object name;
10027   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10028   Lisp_Object attrs;            /* Vector of attributes.  */
10029   Lisp_Object eol_type;
10030   Lisp_Object aliases;
10031   Lisp_Object coding_type, charset_list, safe_charsets;
10032   enum coding_category category;
10033   Lisp_Object tail, val;
10034   int max_charset_id = 0;
10035   int i;
10036
10037   if (nargs < coding_arg_max)
10038     goto short_args;
10039
10040   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10041
10042   name = args[coding_arg_name];
10043   CHECK_SYMBOL (name);
10044   ASET (attrs, coding_attr_base_name, name);
10045
10046   val = args[coding_arg_mnemonic];
10047   if (! STRINGP (val))
10048     CHECK_CHARACTER (val);
10049   ASET (attrs, coding_attr_mnemonic, val);
10050
10051   coding_type = args[coding_arg_coding_type];
10052   CHECK_SYMBOL (coding_type);
10053   ASET (attrs, coding_attr_type, coding_type);
10054
10055   charset_list = args[coding_arg_charset_list];
10056   if (SYMBOLP (charset_list))
10057     {
10058       if (EQ (charset_list, Qiso_2022))
10059         {
10060           if (! EQ (coding_type, Qiso_2022))
10061             error ("Invalid charset-list");
10062           charset_list = Viso_2022_charset_list;
10063         }
10064       else if (EQ (charset_list, Qemacs_mule))
10065         {
10066           if (! EQ (coding_type, Qemacs_mule))
10067             error ("Invalid charset-list");
10068           charset_list = Vemacs_mule_charset_list;
10069         }
10070       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10071         {
10072           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10073             error ("Invalid charset-list");
10074           if (max_charset_id < XFASTINT (XCAR (tail)))
10075             max_charset_id = XFASTINT (XCAR (tail));
10076         }
10077     }
10078   else
10079     {
10080       charset_list = Fcopy_sequence (charset_list);
10081       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10082         {
10083           struct charset *charset;
10084
10085           val = XCAR (tail);
10086           CHECK_CHARSET_GET_CHARSET (val, charset);
10087           if (EQ (coding_type, Qiso_2022)
10088               ? CHARSET_ISO_FINAL (charset) < 0
10089               : EQ (coding_type, Qemacs_mule)
10090               ? CHARSET_EMACS_MULE_ID (charset) < 0
10091               : 0)
10092             error ("Can't handle charset `%s'",
10093                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10094
10095           XSETCAR (tail, make_number (charset->id));
10096           if (max_charset_id < charset->id)
10097             max_charset_id = charset->id;
10098         }
10099     }
10100   ASET (attrs, coding_attr_charset_list, charset_list);
10101
10102   safe_charsets = make_uninit_string (max_charset_id + 1);
10103   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10104   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10105     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10106   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10107
10108   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10109
10110   val = args[coding_arg_decode_translation_table];
10111   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10112     CHECK_SYMBOL (val);
10113   ASET (attrs, coding_attr_decode_tbl, val);
10114
10115   val = args[coding_arg_encode_translation_table];
10116   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10117     CHECK_SYMBOL (val);
10118   ASET (attrs, coding_attr_encode_tbl, val);
10119
10120   val = args[coding_arg_post_read_conversion];
10121   CHECK_SYMBOL (val);
10122   ASET (attrs, coding_attr_post_read, val);
10123
10124   val = args[coding_arg_pre_write_conversion];
10125   CHECK_SYMBOL (val);
10126   ASET (attrs, coding_attr_pre_write, val);
10127
10128   val = args[coding_arg_default_char];
10129   if (NILP (val))
10130     ASET (attrs, coding_attr_default_char, make_number (' '));
10131   else
10132     {
10133       CHECK_CHARACTER (val);
10134       ASET (attrs, coding_attr_default_char, val);
10135     }
10136
10137   val = args[coding_arg_for_unibyte];
10138   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10139
10140   val = args[coding_arg_plist];
10141   CHECK_LIST (val);
10142   ASET (attrs, coding_attr_plist, val);
10143
10144   if (EQ (coding_type, Qcharset))
10145     {
10146       /* Generate a lisp vector of 256 elements.  Each element is nil,
10147          integer, or a list of charset IDs.
10148
10149          If Nth element is nil, the byte code N is invalid in this
10150          coding system.
10151
10152          If Nth element is a number NUM, N is the first byte of a
10153          charset whose ID is NUM.
10154
10155          If Nth element is a list of charset IDs, N is the first byte
10156          of one of them.  The list is sorted by dimensions of the
10157          charsets.  A charset of smaller dimension comes first. */
10158       val = Fmake_vector (make_number (256), Qnil);
10159
10160       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10161         {
10162           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10163           int dim = CHARSET_DIMENSION (charset);
10164           int idx = (dim - 1) * 4;
10165
10166           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10167             ASET (attrs, coding_attr_ascii_compat, Qt);
10168
10169           for (i = charset->code_space[idx];
10170                i <= charset->code_space[idx + 1]; i++)
10171             {
10172               Lisp_Object tmp, tmp2;
10173               int dim2;
10174
10175               tmp = AREF (val, i);
10176               if (NILP (tmp))
10177                 tmp = XCAR (tail);
10178               else if (NUMBERP (tmp))
10179                 {
10180                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10181                   if (dim < dim2)
10182                     tmp = list2 (XCAR (tail), tmp);
10183                   else
10184                     tmp = list2 (tmp, XCAR (tail));
10185                 }
10186               else
10187                 {
10188                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10189                     {
10190                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10191                       if (dim < dim2)
10192                         break;
10193                     }
10194                   if (NILP (tmp2))
10195                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10196                   else
10197                     {
10198                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10199                       XSETCAR (tmp2, XCAR (tail));
10200                     }
10201                 }
10202               ASET (val, i, tmp);
10203             }
10204         }
10205       ASET (attrs, coding_attr_charset_valids, val);
10206       category = coding_category_charset;
10207     }
10208   else if (EQ (coding_type, Qccl))
10209     {
10210       Lisp_Object valids;
10211
10212       if (nargs < coding_arg_ccl_max)
10213         goto short_args;
10214
10215       val = args[coding_arg_ccl_decoder];
10216       CHECK_CCL_PROGRAM (val);
10217       if (VECTORP (val))
10218         val = Fcopy_sequence (val);
10219       ASET (attrs, coding_attr_ccl_decoder, val);
10220
10221       val = args[coding_arg_ccl_encoder];
10222       CHECK_CCL_PROGRAM (val);
10223       if (VECTORP (val))
10224         val = Fcopy_sequence (val);
10225       ASET (attrs, coding_attr_ccl_encoder, val);
10226
10227       val = args[coding_arg_ccl_valids];
10228       valids = Fmake_string (make_number (256), make_number (0));
10229       for (tail = val; CONSP (tail); tail = XCDR (tail))
10230         {
10231           int from, to;
10232
10233           val = XCAR (tail);
10234           if (INTEGERP (val))
10235             {
10236               if (! (0 <= XINT (val) && XINT (val) <= 255))
10237                 args_out_of_range_3 (val, make_number (0), make_number (255));
10238               from = to = XINT (val);
10239             }
10240           else
10241             {
10242               CHECK_CONS (val);
10243               CHECK_NATNUM_CAR (val);
10244               CHECK_NUMBER_CDR (val);
10245               if (XINT (XCAR (val)) > 255)
10246                 args_out_of_range_3 (XCAR (val),
10247                                      make_number (0), make_number (255));
10248               from = XINT (XCAR (val));
10249               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10250                 args_out_of_range_3 (XCDR (val),
10251                                      XCAR (val), make_number (255));
10252               to = XINT (XCDR (val));
10253             }
10254           for (i = from; i <= to; i++)
10255             SSET (valids, i, 1);
10256         }
10257       ASET (attrs, coding_attr_ccl_valids, valids);
10258
10259       category = coding_category_ccl;
10260     }
10261   else if (EQ (coding_type, Qutf_16))
10262     {
10263       Lisp_Object bom, endian;
10264
10265       ASET (attrs, coding_attr_ascii_compat, Qnil);
10266
10267       if (nargs < coding_arg_utf16_max)
10268         goto short_args;
10269
10270       bom = args[coding_arg_utf16_bom];
10271       if (! NILP (bom) && ! EQ (bom, Qt))
10272         {
10273           CHECK_CONS (bom);
10274           val = XCAR (bom);
10275           CHECK_CODING_SYSTEM (val);
10276           val = XCDR (bom);
10277           CHECK_CODING_SYSTEM (val);
10278         }
10279       ASET (attrs, coding_attr_utf_bom, bom);
10280
10281       endian = args[coding_arg_utf16_endian];
10282       CHECK_SYMBOL (endian);
10283       if (NILP (endian))
10284         endian = Qbig;
10285       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10286         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10287       ASET (attrs, coding_attr_utf_16_endian, endian);
10288
10289       category = (CONSP (bom)
10290                   ? coding_category_utf_16_auto
10291                   : NILP (bom)
10292                   ? (EQ (endian, Qbig)
10293                      ? coding_category_utf_16_be_nosig
10294                      : coding_category_utf_16_le_nosig)
10295                   : (EQ (endian, Qbig)
10296                      ? coding_category_utf_16_be
10297                      : coding_category_utf_16_le));
10298     }
10299   else if (EQ (coding_type, Qiso_2022))
10300     {
10301       Lisp_Object initial, reg_usage, request, flags;
10302
10303       if (nargs < coding_arg_iso2022_max)
10304         goto short_args;
10305
10306       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10307       CHECK_VECTOR (initial);
10308       for (i = 0; i < 4; i++)
10309         {
10310           val = AREF (initial, i);
10311           if (! NILP (val))
10312             {
10313               struct charset *charset;
10314
10315               CHECK_CHARSET_GET_CHARSET (val, charset);
10316               ASET (initial, i, make_number (CHARSET_ID (charset)));
10317               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10318                 ASET (attrs, coding_attr_ascii_compat, Qt);
10319             }
10320           else
10321             ASET (initial, i, make_number (-1));
10322         }
10323
10324       reg_usage = args[coding_arg_iso2022_reg_usage];
10325       CHECK_CONS (reg_usage);
10326       CHECK_NUMBER_CAR (reg_usage);
10327       CHECK_NUMBER_CDR (reg_usage);
10328
10329       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10330       for (tail = request; CONSP (tail); tail = XCDR (tail))
10331         {
10332           int id;
10333           Lisp_Object tmp1;
10334
10335           val = XCAR (tail);
10336           CHECK_CONS (val);
10337           tmp1 = XCAR (val);
10338           CHECK_CHARSET_GET_ID (tmp1, id);
10339           CHECK_NATNUM_CDR (val);
10340           if (XINT (XCDR (val)) >= 4)
10341             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10342           XSETCAR (val, make_number (id));
10343         }
10344
10345       flags = args[coding_arg_iso2022_flags];
10346       CHECK_NATNUM (flags);
10347       i = XINT (flags) & INT_MAX;
10348       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10349         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10350       flags = make_number (i);
10351
10352       ASET (attrs, coding_attr_iso_initial, initial);
10353       ASET (attrs, coding_attr_iso_usage, reg_usage);
10354       ASET (attrs, coding_attr_iso_request, request);
10355       ASET (attrs, coding_attr_iso_flags, flags);
10356       setup_iso_safe_charsets (attrs);
10357
10358       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10359         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10360                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10361                     ? coding_category_iso_7_else
10362                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10363                     ? coding_category_iso_7
10364                     : coding_category_iso_7_tight);
10365       else
10366         {
10367           int id = XINT (AREF (initial, 1));
10368
10369           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10370                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10371                        || id < 0)
10372                       ? coding_category_iso_8_else
10373                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10374                       ? coding_category_iso_8_1
10375                       : coding_category_iso_8_2);
10376         }
10377       if (category != coding_category_iso_8_1
10378           && category != coding_category_iso_8_2)
10379         ASET (attrs, coding_attr_ascii_compat, Qnil);
10380     }
10381   else if (EQ (coding_type, Qemacs_mule))
10382     {
10383       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10384         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10385       ASET (attrs, coding_attr_ascii_compat, Qt);
10386       category = coding_category_emacs_mule;
10387     }
10388   else if (EQ (coding_type, Qshift_jis))
10389     {
10390
10391       struct charset *charset;
10392
10393       if (XINT (Flength (charset_list)) != 3
10394           && XINT (Flength (charset_list)) != 4)
10395         error ("There should be three or four charsets");
10396
10397       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10398       if (CHARSET_DIMENSION (charset) != 1)
10399         error ("Dimension of charset %s is not one",
10400                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10401       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10402         ASET (attrs, coding_attr_ascii_compat, Qt);
10403
10404       charset_list = XCDR (charset_list);
10405       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10406       if (CHARSET_DIMENSION (charset) != 1)
10407         error ("Dimension of charset %s is not one",
10408                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10409
10410       charset_list = XCDR (charset_list);
10411       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10412       if (CHARSET_DIMENSION (charset) != 2)
10413         error ("Dimension of charset %s is not two",
10414                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10415
10416       charset_list = XCDR (charset_list);
10417       if (! NILP (charset_list))
10418         {
10419           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10420           if (CHARSET_DIMENSION (charset) != 2)
10421             error ("Dimension of charset %s is not two",
10422                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10423         }
10424
10425       category = coding_category_sjis;
10426       Vsjis_coding_system = name;
10427     }
10428   else if (EQ (coding_type, Qbig5))
10429     {
10430       struct charset *charset;
10431
10432       if (XINT (Flength (charset_list)) != 2)
10433         error ("There should be just two charsets");
10434
10435       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10436       if (CHARSET_DIMENSION (charset) != 1)
10437         error ("Dimension of charset %s is not one",
10438                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10439       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10440         ASET (attrs, coding_attr_ascii_compat, Qt);
10441
10442       charset_list = XCDR (charset_list);
10443       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10444       if (CHARSET_DIMENSION (charset) != 2)
10445         error ("Dimension of charset %s is not two",
10446                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10447
10448       category = coding_category_big5;
10449       Vbig5_coding_system = name;
10450     }
10451   else if (EQ (coding_type, Qraw_text))
10452     {
10453       category = coding_category_raw_text;
10454       ASET (attrs, coding_attr_ascii_compat, Qt);
10455     }
10456   else if (EQ (coding_type, Qutf_8))
10457     {
10458       Lisp_Object bom;
10459
10460       if (nargs < coding_arg_utf8_max)
10461         goto short_args;
10462
10463       bom = args[coding_arg_utf8_bom];
10464       if (! NILP (bom) && ! EQ (bom, Qt))
10465         {
10466           CHECK_CONS (bom);
10467           val = XCAR (bom);
10468           CHECK_CODING_SYSTEM (val);
10469           val = XCDR (bom);
10470           CHECK_CODING_SYSTEM (val);
10471         }
10472       ASET (attrs, coding_attr_utf_bom, bom);
10473       if (NILP (bom))
10474         ASET (attrs, coding_attr_ascii_compat, Qt);
10475
10476       category = (CONSP (bom) ? coding_category_utf_8_auto
10477                   : NILP (bom) ? coding_category_utf_8_nosig
10478                   : coding_category_utf_8_sig);
10479     }
10480   else if (EQ (coding_type, Qundecided))
10481     {
10482       if (nargs < coding_arg_undecided_max)
10483         goto short_args;
10484       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10485             args[coding_arg_undecided_inhibit_null_byte_detection]);
10486       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10487             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10488       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10489             args[coding_arg_undecided_prefer_utf_8]);
10490       category = coding_category_undecided;
10491     }
10492   else
10493     error ("Invalid coding system type: %s",
10494            SDATA (SYMBOL_NAME (coding_type)));
10495
10496   ASET (attrs, coding_attr_category, make_number (category));
10497   ASET (attrs, coding_attr_plist,
10498         Fcons (QCcategory,
10499                Fcons (AREF (Vcoding_category_table, category),
10500                       CODING_ATTR_PLIST (attrs))));
10501   ASET (attrs, coding_attr_plist,
10502         Fcons (QCascii_compatible_p,
10503                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10504                       CODING_ATTR_PLIST (attrs))));
10505
10506   eol_type = args[coding_arg_eol_type];
10507   if (! NILP (eol_type)
10508       && ! EQ (eol_type, Qunix)
10509       && ! EQ (eol_type, Qdos)
10510       && ! EQ (eol_type, Qmac))
10511     error ("Invalid eol-type");
10512
10513   aliases = list1 (name);
10514
10515   if (NILP (eol_type))
10516     {
10517       eol_type = make_subsidiaries (name);
10518       for (i = 0; i < 3; i++)
10519         {
10520           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10521
10522           this_name = AREF (eol_type, i);
10523           this_aliases = list1 (this_name);
10524           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10525           this_spec = make_uninit_vector (3);
10526           ASET (this_spec, 0, attrs);
10527           ASET (this_spec, 1, this_aliases);
10528           ASET (this_spec, 2, this_eol_type);
10529           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10530           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10531           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10532           if (NILP (val))
10533             Vcoding_system_alist
10534               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10535                        Vcoding_system_alist);
10536         }
10537     }
10538
10539   spec_vec = make_uninit_vector (3);
10540   ASET (spec_vec, 0, attrs);
10541   ASET (spec_vec, 1, aliases);
10542   ASET (spec_vec, 2, eol_type);
10543
10544   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10545   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10546   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10547   if (NILP (val))
10548     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10549                                   Vcoding_system_alist);
10550
10551   {
10552     int id = coding_categories[category].id;
10553
10554     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10555       setup_coding_system (name, &coding_categories[category]);
10556   }
10557
10558   return Qnil;
10559
10560  short_args:
10561   return Fsignal (Qwrong_number_of_arguments,
10562                   Fcons (intern ("define-coding-system-internal"),
10563                          make_number (nargs)));
10564 }
10565
10566
10567 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10568        3, 3, 0,
10569        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10570   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10571 {
10572   Lisp_Object spec, attrs;
10573
10574   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10575   attrs = AREF (spec, 0);
10576   if (EQ (prop, QCmnemonic))
10577     {
10578       if (! STRINGP (val))
10579         CHECK_CHARACTER (val);
10580       ASET (attrs, coding_attr_mnemonic, val);
10581     }
10582   else if (EQ (prop, QCdefault_char))
10583     {
10584       if (NILP (val))
10585         val = make_number (' ');
10586       else
10587         CHECK_CHARACTER (val);
10588       ASET (attrs, coding_attr_default_char, val);
10589     }
10590   else if (EQ (prop, QCdecode_translation_table))
10591     {
10592       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10593         CHECK_SYMBOL (val);
10594       ASET (attrs, coding_attr_decode_tbl, val);
10595     }
10596   else if (EQ (prop, QCencode_translation_table))
10597     {
10598       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10599         CHECK_SYMBOL (val);
10600       ASET (attrs, coding_attr_encode_tbl, val);
10601     }
10602   else if (EQ (prop, QCpost_read_conversion))
10603     {
10604       CHECK_SYMBOL (val);
10605       ASET (attrs, coding_attr_post_read, val);
10606     }
10607   else if (EQ (prop, QCpre_write_conversion))
10608     {
10609       CHECK_SYMBOL (val);
10610       ASET (attrs, coding_attr_pre_write, val);
10611     }
10612   else if (EQ (prop, QCascii_compatible_p))
10613     {
10614       ASET (attrs, coding_attr_ascii_compat, val);
10615     }
10616
10617   ASET (attrs, coding_attr_plist,
10618         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10619   return val;
10620 }
10621
10622
10623 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10624        Sdefine_coding_system_alias, 2, 2, 0,
10625        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10626   (Lisp_Object alias, Lisp_Object coding_system)
10627 {
10628   Lisp_Object spec, aliases, eol_type, val;
10629
10630   CHECK_SYMBOL (alias);
10631   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10632   aliases = AREF (spec, 1);
10633   /* ALIASES should be a list of length more than zero, and the first
10634      element is a base coding system.  Append ALIAS at the tail of the
10635      list.  */
10636   while (!NILP (XCDR (aliases)))
10637     aliases = XCDR (aliases);
10638   XSETCDR (aliases, list1 (alias));
10639
10640   eol_type = AREF (spec, 2);
10641   if (VECTORP (eol_type))
10642     {
10643       Lisp_Object subsidiaries;
10644       int i;
10645
10646       subsidiaries = make_subsidiaries (alias);
10647       for (i = 0; i < 3; i++)
10648         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10649                                      AREF (eol_type, i));
10650     }
10651
10652   Fputhash (alias, spec, Vcoding_system_hash_table);
10653   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10654   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10655   if (NILP (val))
10656     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10657                                   Vcoding_system_alist);
10658
10659   return Qnil;
10660 }
10661
10662 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10663        1, 1, 0,
10664        doc: /* Return the base of CODING-SYSTEM.
10665 Any alias or subsidiary coding system is not a base coding system.  */)
10666   (Lisp_Object coding_system)
10667 {
10668   Lisp_Object spec, attrs;
10669
10670   if (NILP (coding_system))
10671     return (Qno_conversion);
10672   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10673   attrs = AREF (spec, 0);
10674   return CODING_ATTR_BASE_NAME (attrs);
10675 }
10676
10677 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10678        1, 1, 0,
10679        doc: /* Return the property list of CODING-SYSTEM.  */)
10680   (Lisp_Object coding_system)
10681 {
10682   Lisp_Object spec, attrs;
10683
10684   if (NILP (coding_system))
10685     coding_system = Qno_conversion;
10686   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10687   attrs = AREF (spec, 0);
10688   return CODING_ATTR_PLIST (attrs);
10689 }
10690
10691
10692 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10693        1, 1, 0,
10694        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10695   (Lisp_Object coding_system)
10696 {
10697   Lisp_Object spec;
10698
10699   if (NILP (coding_system))
10700     coding_system = Qno_conversion;
10701   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10702   return AREF (spec, 1);
10703 }
10704
10705 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10706        Scoding_system_eol_type, 1, 1, 0,
10707        doc: /* Return eol-type of CODING-SYSTEM.
10708 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10709
10710 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10711 and CR respectively.
10712
10713 A vector value indicates that a format of end-of-line should be
10714 detected automatically.  Nth element of the vector is the subsidiary
10715 coding system whose eol-type is N.  */)
10716   (Lisp_Object coding_system)
10717 {
10718   Lisp_Object spec, eol_type;
10719   int n;
10720
10721   if (NILP (coding_system))
10722     coding_system = Qno_conversion;
10723   if (! CODING_SYSTEM_P (coding_system))
10724     return Qnil;
10725   spec = CODING_SYSTEM_SPEC (coding_system);
10726   eol_type = AREF (spec, 2);
10727   if (VECTORP (eol_type))
10728     return Fcopy_sequence (eol_type);
10729   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10730   return make_number (n);
10731 }
10732
10733 #endif /* emacs */
10734
10735 \f
10736 /*** 9. Post-amble ***/
10737
10738 void
10739 init_coding_once (void)
10740 {
10741   int i;
10742
10743   for (i = 0; i < coding_category_max; i++)
10744     {
10745       coding_categories[i].id = -1;
10746       coding_priorities[i] = i;
10747     }
10748
10749   /* ISO2022 specific initialize routine.  */
10750   for (i = 0; i < 0x20; i++)
10751     iso_code_class[i] = ISO_control_0;
10752   for (i = 0x21; i < 0x7F; i++)
10753     iso_code_class[i] = ISO_graphic_plane_0;
10754   for (i = 0x80; i < 0xA0; i++)
10755     iso_code_class[i] = ISO_control_1;
10756   for (i = 0xA1; i < 0xFF; i++)
10757     iso_code_class[i] = ISO_graphic_plane_1;
10758   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10759   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10760   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10761   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10762   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10763   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10764   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10765   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10766   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10767
10768   for (i = 0; i < 256; i++)
10769     {
10770       emacs_mule_bytes[i] = 1;
10771     }
10772   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10773   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10774   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10775   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10776 }
10777
10778 #ifdef emacs
10779
10780 void
10781 syms_of_coding (void)
10782 {
10783   staticpro (&Vcoding_system_hash_table);
10784   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10785
10786   staticpro (&Vsjis_coding_system);
10787   Vsjis_coding_system = Qnil;
10788
10789   staticpro (&Vbig5_coding_system);
10790   Vbig5_coding_system = Qnil;
10791
10792   staticpro (&Vcode_conversion_reused_workbuf);
10793   Vcode_conversion_reused_workbuf = Qnil;
10794
10795   staticpro (&Vcode_conversion_workbuf_name);
10796   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10797
10798   reused_workbuf_in_use = 0;
10799
10800   DEFSYM (Qcharset, "charset");
10801   DEFSYM (Qtarget_idx, "target-idx");
10802   DEFSYM (Qcoding_system_history, "coding-system-history");
10803   Fset (Qcoding_system_history, Qnil);
10804
10805   /* Target FILENAME is the first argument.  */
10806   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10807   /* Target FILENAME is the third argument.  */
10808   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10809
10810   DEFSYM (Qcall_process, "call-process");
10811   /* Target PROGRAM is the first argument.  */
10812   Fput (Qcall_process, Qtarget_idx, make_number (0));
10813
10814   DEFSYM (Qcall_process_region, "call-process-region");
10815   /* Target PROGRAM is the third argument.  */
10816   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10817
10818   DEFSYM (Qstart_process, "start-process");
10819   /* Target PROGRAM is the third argument.  */
10820   Fput (Qstart_process, Qtarget_idx, make_number (2));
10821
10822   DEFSYM (Qopen_network_stream, "open-network-stream");
10823   /* Target SERVICE is the fourth argument.  */
10824   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10825
10826   DEFSYM (Qunix, "unix");
10827   DEFSYM (Qdos, "dos");
10828   DEFSYM (Qmac, "mac");
10829
10830   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10831   DEFSYM (Qundecided, "undecided");
10832   DEFSYM (Qno_conversion, "no-conversion");
10833   DEFSYM (Qraw_text, "raw-text");
10834
10835   DEFSYM (Qiso_2022, "iso-2022");
10836
10837   DEFSYM (Qutf_8, "utf-8");
10838   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10839
10840 #if defined (WINDOWSNT) || defined (CYGWIN)
10841   /* No, not utf-16-le: that one has a BOM.  */
10842   DEFSYM (Qutf_16le, "utf-16le");
10843 #endif
10844
10845   DEFSYM (Qutf_16, "utf-16");
10846   DEFSYM (Qbig, "big");
10847   DEFSYM (Qlittle, "little");
10848
10849   DEFSYM (Qshift_jis, "shift-jis");
10850   DEFSYM (Qbig5, "big5");
10851
10852   DEFSYM (Qcoding_system_p, "coding-system-p");
10853
10854   /* Error signaled when there's a problem with detecting a coding system.  */
10855   DEFSYM (Qcoding_system_error, "coding-system-error");
10856   Fput (Qcoding_system_error, Qerror_conditions,
10857         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10858   Fput (Qcoding_system_error, Qerror_message,
10859         build_pure_c_string ("Invalid coding system"));
10860
10861   DEFSYM (Qtranslation_table, "translation-table");
10862   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10863   DEFSYM (Qtranslation_table_id, "translation-table-id");
10864
10865   /* Coding system emacs-mule and raw-text are for converting only
10866      end-of-line format.  */
10867   DEFSYM (Qemacs_mule, "emacs-mule");
10868
10869   DEFSYM (QCcategory, ":category");
10870   DEFSYM (QCmnemonic, ":mnemonic");
10871   DEFSYM (QCdefault_char, ":default-char");
10872   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10873   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10874   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10875   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10876   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10877
10878   Vcoding_category_table
10879     = Fmake_vector (make_number (coding_category_max), Qnil);
10880   staticpro (&Vcoding_category_table);
10881   /* Followings are target of code detection.  */
10882   ASET (Vcoding_category_table, coding_category_iso_7,
10883         intern_c_string ("coding-category-iso-7"));
10884   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10885         intern_c_string ("coding-category-iso-7-tight"));
10886   ASET (Vcoding_category_table, coding_category_iso_8_1,
10887         intern_c_string ("coding-category-iso-8-1"));
10888   ASET (Vcoding_category_table, coding_category_iso_8_2,
10889         intern_c_string ("coding-category-iso-8-2"));
10890   ASET (Vcoding_category_table, coding_category_iso_7_else,
10891         intern_c_string ("coding-category-iso-7-else"));
10892   ASET (Vcoding_category_table, coding_category_iso_8_else,
10893         intern_c_string ("coding-category-iso-8-else"));
10894   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10895         intern_c_string ("coding-category-utf-8-auto"));
10896   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10897         intern_c_string ("coding-category-utf-8"));
10898   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10899         intern_c_string ("coding-category-utf-8-sig"));
10900   ASET (Vcoding_category_table, coding_category_utf_16_be,
10901         intern_c_string ("coding-category-utf-16-be"));
10902   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10903         intern_c_string ("coding-category-utf-16-auto"));
10904   ASET (Vcoding_category_table, coding_category_utf_16_le,
10905         intern_c_string ("coding-category-utf-16-le"));
10906   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10907         intern_c_string ("coding-category-utf-16-be-nosig"));
10908   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10909         intern_c_string ("coding-category-utf-16-le-nosig"));
10910   ASET (Vcoding_category_table, coding_category_charset,
10911         intern_c_string ("coding-category-charset"));
10912   ASET (Vcoding_category_table, coding_category_sjis,
10913         intern_c_string ("coding-category-sjis"));
10914   ASET (Vcoding_category_table, coding_category_big5,
10915         intern_c_string ("coding-category-big5"));
10916   ASET (Vcoding_category_table, coding_category_ccl,
10917         intern_c_string ("coding-category-ccl"));
10918   ASET (Vcoding_category_table, coding_category_emacs_mule,
10919         intern_c_string ("coding-category-emacs-mule"));
10920   /* Followings are NOT target of code detection.  */
10921   ASET (Vcoding_category_table, coding_category_raw_text,
10922         intern_c_string ("coding-category-raw-text"));
10923   ASET (Vcoding_category_table, coding_category_undecided,
10924         intern_c_string ("coding-category-undecided"));
10925
10926   DEFSYM (Qinsufficient_source, "insufficient-source");
10927   DEFSYM (Qinvalid_source, "invalid-source");
10928   DEFSYM (Qinterrupted, "interrupted");
10929
10930   /* If a symbol has this property, evaluate the value to define the
10931      symbol as a coding system.  */
10932   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10933
10934   defsubr (&Scoding_system_p);
10935   defsubr (&Sread_coding_system);
10936   defsubr (&Sread_non_nil_coding_system);
10937   defsubr (&Scheck_coding_system);
10938   defsubr (&Sdetect_coding_region);
10939   defsubr (&Sdetect_coding_string);
10940   defsubr (&Sfind_coding_systems_region_internal);
10941   defsubr (&Sunencodable_char_position);
10942   defsubr (&Scheck_coding_systems_region);
10943   defsubr (&Sdecode_coding_region);
10944   defsubr (&Sencode_coding_region);
10945   defsubr (&Sdecode_coding_string);
10946   defsubr (&Sencode_coding_string);
10947   defsubr (&Sdecode_sjis_char);
10948   defsubr (&Sencode_sjis_char);
10949   defsubr (&Sdecode_big5_char);
10950   defsubr (&Sencode_big5_char);
10951   defsubr (&Sset_terminal_coding_system_internal);
10952   defsubr (&Sset_safe_terminal_coding_system_internal);
10953   defsubr (&Sterminal_coding_system);
10954   defsubr (&Sset_keyboard_coding_system_internal);
10955   defsubr (&Skeyboard_coding_system);
10956   defsubr (&Sfind_operation_coding_system);
10957   defsubr (&Sset_coding_system_priority);
10958   defsubr (&Sdefine_coding_system_internal);
10959   defsubr (&Sdefine_coding_system_alias);
10960   defsubr (&Scoding_system_put);
10961   defsubr (&Scoding_system_base);
10962   defsubr (&Scoding_system_plist);
10963   defsubr (&Scoding_system_aliases);
10964   defsubr (&Scoding_system_eol_type);
10965   defsubr (&Scoding_system_priority_list);
10966
10967   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10968                doc: /* List of coding systems.
10969
10970 Do not alter the value of this variable manually.  This variable should be
10971 updated by the functions `define-coding-system' and
10972 `define-coding-system-alias'.  */);
10973   Vcoding_system_list = Qnil;
10974
10975   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10976                doc: /* Alist of coding system names.
10977 Each element is one element list of coding system name.
10978 This variable is given to `completing-read' as COLLECTION argument.
10979
10980 Do not alter the value of this variable manually.  This variable should be
10981 updated by the functions `make-coding-system' and
10982 `define-coding-system-alias'.  */);
10983   Vcoding_system_alist = Qnil;
10984
10985   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10986                doc: /* List of coding-categories (symbols) ordered by priority.
10987
10988 On detecting a coding system, Emacs tries code detection algorithms
10989 associated with each coding-category one by one in this order.  When
10990 one algorithm agrees with a byte sequence of source text, the coding
10991 system bound to the corresponding coding-category is selected.
10992
10993 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10994   {
10995     int i;
10996
10997     Vcoding_category_list = Qnil;
10998     for (i = coding_category_max - 1; i >= 0; i--)
10999       Vcoding_category_list
11000         = Fcons (AREF (Vcoding_category_table, i),
11001                  Vcoding_category_list);
11002   }
11003
11004   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11005                doc: /* Specify the coding system for read operations.
11006 It is useful to bind this variable with `let', but do not set it globally.
11007 If the value is a coding system, it is used for decoding on read operation.
11008 If not, an appropriate element is used from one of the coding system alists.
11009 There are three such tables: `file-coding-system-alist',
11010 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11011   Vcoding_system_for_read = Qnil;
11012
11013   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11014                doc: /* Specify the coding system for write operations.
11015 Programs bind this variable with `let', but you should not set it globally.
11016 If the value is a coding system, it is used for encoding of output,
11017 when writing it to a file and when sending it to a file or subprocess.
11018
11019 If this does not specify a coding system, an appropriate element
11020 is used from one of the coding system alists.
11021 There are three such tables: `file-coding-system-alist',
11022 `process-coding-system-alist', and `network-coding-system-alist'.
11023 For output to files, if the above procedure does not specify a coding system,
11024 the value of `buffer-file-coding-system' is used.  */);
11025   Vcoding_system_for_write = Qnil;
11026
11027   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11028                doc: /*
11029 Coding system used in the latest file or process I/O.  */);
11030   Vlast_coding_system_used = Qnil;
11031
11032   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11033                doc: /*
11034 Error status of the last code conversion.
11035
11036 When an error was detected in the last code conversion, this variable
11037 is set to one of the following symbols.
11038   `insufficient-source'
11039   `inconsistent-eol'
11040   `invalid-source'
11041   `interrupted'
11042   `insufficient-memory'
11043 When no error was detected, the value doesn't change.  So, to check
11044 the error status of a code conversion by this variable, you must
11045 explicitly set this variable to nil before performing code
11046 conversion.  */);
11047   Vlast_code_conversion_error = Qnil;
11048
11049   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11050                doc: /*
11051 Non-nil means always inhibit code conversion of end-of-line format.
11052 See info node `Coding Systems' and info node `Text and Binary' concerning
11053 such conversion.  */);
11054   inhibit_eol_conversion = 0;
11055
11056   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11057                doc: /*
11058 Non-nil means process buffer inherits coding system of process output.
11059 Bind it to t if the process output is to be treated as if it were a file
11060 read from some filesystem.  */);
11061   inherit_process_coding_system = 0;
11062
11063   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11064                doc: /*
11065 Alist to decide a coding system to use for a file I/O operation.
11066 The format is ((PATTERN . VAL) ...),
11067 where PATTERN is a regular expression matching a file name,
11068 VAL is a coding system, a cons of coding systems, or a function symbol.
11069 If VAL is a coding system, it is used for both decoding and encoding
11070 the file contents.
11071 If VAL is a cons of coding systems, the car part is used for decoding,
11072 and the cdr part is used for encoding.
11073 If VAL is a function symbol, the function must return a coding system
11074 or a cons of coding systems which are used as above.  The function is
11075 called with an argument that is a list of the arguments with which
11076 `find-operation-coding-system' was called.  If the function can't decide
11077 a coding system, it can return `undecided' so that the normal
11078 code-detection is performed.
11079
11080 See also the function `find-operation-coding-system'
11081 and the variable `auto-coding-alist'.  */);
11082   Vfile_coding_system_alist = Qnil;
11083
11084   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11085                doc: /*
11086 Alist to decide a coding system to use for a process I/O operation.
11087 The format is ((PATTERN . VAL) ...),
11088 where PATTERN is a regular expression matching a program name,
11089 VAL is a coding system, a cons of coding systems, or a function symbol.
11090 If VAL is a coding system, it is used for both decoding what received
11091 from the program and encoding what sent to the program.
11092 If VAL is a cons of coding systems, the car part is used for decoding,
11093 and the cdr part is used for encoding.
11094 If VAL is a function symbol, the function must return a coding system
11095 or a cons of coding systems which are used as above.
11096
11097 See also the function `find-operation-coding-system'.  */);
11098   Vprocess_coding_system_alist = Qnil;
11099
11100   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11101                doc: /*
11102 Alist to decide a coding system to use for a network I/O operation.
11103 The format is ((PATTERN . VAL) ...),
11104 where PATTERN is a regular expression matching a network service name
11105 or is a port number to connect to,
11106 VAL is a coding system, a cons of coding systems, or a function symbol.
11107 If VAL is a coding system, it is used for both decoding what received
11108 from the network stream and encoding what sent to the network stream.
11109 If VAL is a cons of coding systems, the car part is used for decoding,
11110 and the cdr part is used for encoding.
11111 If VAL is a function symbol, the function must return a coding system
11112 or a cons of coding systems which are used as above.
11113
11114 See also the function `find-operation-coding-system'.  */);
11115   Vnetwork_coding_system_alist = Qnil;
11116
11117   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11118                doc: /* Coding system to use with system messages.
11119 Also used for decoding keyboard input on X Window system, and for
11120 encoding standard output and error streams.  */);
11121   Vlocale_coding_system = Qnil;
11122
11123   /* The eol mnemonics are reset in startup.el system-dependently.  */
11124   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11125                doc: /*
11126 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11127   eol_mnemonic_unix = build_pure_c_string (":");
11128
11129   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11130                doc: /*
11131 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11132   eol_mnemonic_dos = build_pure_c_string ("\\");
11133
11134   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11135                doc: /*
11136 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11137   eol_mnemonic_mac = build_pure_c_string ("/");
11138
11139   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11140                doc: /*
11141 String displayed in mode line when end-of-line format is not yet determined.  */);
11142   eol_mnemonic_undecided = build_pure_c_string (":");
11143
11144   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11145                doc: /*
11146 Non-nil enables character translation while encoding and decoding.  */);
11147   Venable_character_translation = Qt;
11148
11149   DEFVAR_LISP ("standard-translation-table-for-decode",
11150                Vstandard_translation_table_for_decode,
11151                doc: /* Table for translating characters while decoding.  */);
11152   Vstandard_translation_table_for_decode = Qnil;
11153
11154   DEFVAR_LISP ("standard-translation-table-for-encode",
11155                Vstandard_translation_table_for_encode,
11156                doc: /* Table for translating characters while encoding.  */);
11157   Vstandard_translation_table_for_encode = Qnil;
11158
11159   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11160                doc: /* Alist of charsets vs revision numbers.
11161 While encoding, if a charset (car part of an element) is found,
11162 designate it with the escape sequence identifying revision (cdr part
11163 of the element).  */);
11164   Vcharset_revision_table = Qnil;
11165
11166   DEFVAR_LISP ("default-process-coding-system",
11167                Vdefault_process_coding_system,
11168                doc: /* Cons of coding systems used for process I/O by default.
11169 The car part is used for decoding a process output,
11170 the cdr part is used for encoding a text to be sent to a process.  */);
11171   Vdefault_process_coding_system = Qnil;
11172
11173   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11174                doc: /*
11175 Table of extra Latin codes in the range 128..159 (inclusive).
11176 This is a vector of length 256.
11177 If Nth element is non-nil, the existence of code N in a file
11178 \(or output of subprocess) doesn't prevent it to be detected as
11179 a coding system of ISO 2022 variant which has a flag
11180 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11181 or reading output of a subprocess.
11182 Only 128th through 159th elements have a meaning.  */);
11183   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11184
11185   DEFVAR_LISP ("select-safe-coding-system-function",
11186                Vselect_safe_coding_system_function,
11187                doc: /*
11188 Function to call to select safe coding system for encoding a text.
11189
11190 If set, this function is called to force a user to select a proper
11191 coding system which can encode the text in the case that a default
11192 coding system used in each operation can't encode the text.  The
11193 function should take care that the buffer is not modified while
11194 the coding system is being selected.
11195
11196 The default value is `select-safe-coding-system' (which see).  */);
11197   Vselect_safe_coding_system_function = Qnil;
11198
11199   DEFVAR_BOOL ("coding-system-require-warning",
11200                coding_system_require_warning,
11201                doc: /* Internal use only.
11202 If non-nil, on writing a file, `select-safe-coding-system-function' is
11203 called even if `coding-system-for-write' is non-nil.  The command
11204 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11205   coding_system_require_warning = 0;
11206
11207
11208   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11209                inhibit_iso_escape_detection,
11210                doc: /*
11211 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11212
11213 When Emacs reads text, it tries to detect how the text is encoded.
11214 This code detection is sensitive to escape sequences.  If Emacs sees
11215 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11216 of the ISO2022 encodings, and decodes text by the corresponding coding
11217 system (e.g. `iso-2022-7bit').
11218
11219 However, there may be a case that you want to read escape sequences in
11220 a file as is.  In such a case, you can set this variable to non-nil.
11221 Then the code detection will ignore any escape sequences, and no text is
11222 detected as encoded in some ISO-2022 encoding.  The result is that all
11223 escape sequences become visible in a buffer.
11224
11225 The default value is nil, and it is strongly recommended not to change
11226 it.  That is because many Emacs Lisp source files that contain
11227 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11228 in Emacs's distribution, and they won't be decoded correctly on
11229 reading if you suppress escape sequence detection.
11230
11231 The other way to read escape sequences in a file without decoding is
11232 to explicitly specify some coding system that doesn't use ISO-2022
11233 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11234   inhibit_iso_escape_detection = 0;
11235
11236   DEFVAR_BOOL ("inhibit-null-byte-detection",
11237                inhibit_null_byte_detection,
11238                doc: /* If non-nil, Emacs ignores null bytes on code detection.
By default, Emacs treats text containing null bytes as binary data, and
does not attempt to decode it.  The effect is as if you specified
`no-conversion' for reading that text.
11242
11243 Set this to non-nil when a regular text happens to include null bytes.
11244 Examples are Index nodes of Info files and null-byte delimited output
11245 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11246 decode text as usual.  */);
11247   inhibit_null_byte_detection = 0;
11248
11249   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11250                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11251 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11252   disable_ascii_optimization = 0;
11253
11254   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11255                doc: /* Char table for translating self-inserting characters.
11256 This is applied to the result of input methods, not their input.
11257 See also `keyboard-translate-table'.
11258
11259 Use of this variable for character code unification was rendered
11260 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11261 internal character representation.  */);
11262   Vtranslation_table_for_input = Qnil;
11263
11264   Lisp_Object args[coding_arg_undecided_max];
11265   memclear (args, sizeof args);
11266
11267   Lisp_Object plist[] =
11268     {
11269       QCname,
11270       args[coding_arg_name] = Qno_conversion,
11271       QCmnemonic,
11272       args[coding_arg_mnemonic] = make_number ('='),
11273       intern_c_string (":coding-type"),
11274       args[coding_arg_coding_type] = Qraw_text,
11275       QCascii_compatible_p,
11276       args[coding_arg_ascii_compatible_p] = Qt,
11277       QCdefault_char,
11278       args[coding_arg_default_char] = make_number (0),
11279       intern_c_string (":for-unibyte"),
11280       args[coding_arg_for_unibyte] = Qt,
11281       intern_c_string (":docstring"),
11282       (build_pure_c_string
11283        ("Do no conversion.\n"
11284         "\n"
11285         "When you visit a file with this coding, the file is read into a\n"
11286         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11287         "character.")),
11288       intern_c_string (":eol-type"),
11289       args[coding_arg_eol_type] = Qunix,
11290     };
11291   args[coding_arg_plist] = CALLMANY (Flist, plist);
11292   Fdefine_coding_system_internal (coding_arg_max, args);
11293
11294   plist[1] = args[coding_arg_name] = Qundecided;
11295   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11296   plist[5] = args[coding_arg_coding_type] = Qundecided;
11297   /* This is already set.
11298      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11299   plist[8] = intern_c_string (":charset-list");
11300   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11301   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11302   plist[13] = build_pure_c_string ("No conversion on encoding, "
11303                                    "automatic conversion on decoding.");
11304   plist[15] = args[coding_arg_eol_type] = Qnil;
11305   args[coding_arg_plist] = CALLMANY (Flist, plist);
11306   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11307   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11308   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11309
11310   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11311
11312   for (int i = 0; i < coding_category_max; i++)
11313     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11314
11315 #if defined (DOS_NT)
11316   system_eol_type = Qdos;
11317 #else
11318   system_eol_type = Qunix;
11319 #endif
11320   staticpro (&system_eol_type);
11321 }
11322
11323 char *
11324 emacs_strerror (int error_number)
11325 {
11326   char *str;
11327
11328   synchronize_system_messages_locale ();
11329   str = strerror (error_number);
11330
11331   if (! NILP (Vlocale_coding_system))
11332     {
11333       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11334                                                       Vlocale_coding_system,
11335                                                       0);
11336       str = SSDATA (dec);
11337     }
11338
11339   return str;
11340 }
11341
11342 #endif /* emacs */