src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 497
 498 /* A character to be produced on output if encoding of the original
 499    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 501
 502 /* UTF-8 section */
 503 #define CODING_UTF_8_BOM(coding)        \
 504   ((coding)->spec.utf_8_bom)
 505
 506 /* UTF-16 section */
 507 #define CODING_UTF_16_BOM(coding)       \
 508   ((coding)->spec.utf_16.bom)
 509
 510 #define CODING_UTF_16_ENDIAN(coding)    \
 511   ((coding)->spec.utf_16.endian)
 512
 513 #define CODING_UTF_16_SURROGATE(coding) \
 514   ((coding)->spec.utf_16.surrogate)
 515
 516
 517 /* CCL section */
 518 #define CODING_CCL_DECODER(coding)      \
 519   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 520 #define CODING_CCL_ENCODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 522 #define CODING_CCL_VALIDS(coding)                                          \
 523   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 524
 525 /* Index for each coding category in `coding_categories' */
 526
 527 enum coding_category
 528   {
 529     coding_category_iso_7,
 530     coding_category_iso_7_tight,
 531     coding_category_iso_8_1,
 532     coding_category_iso_8_2,
 533     coding_category_iso_7_else,
 534     coding_category_iso_8_else,
 535     coding_category_utf_8_auto,
 536     coding_category_utf_8_nosig,
 537     coding_category_utf_8_sig,
 538     coding_category_utf_16_auto,
 539     coding_category_utf_16_be,
 540     coding_category_utf_16_le,
 541     coding_category_utf_16_be_nosig,
 542     coding_category_utf_16_le_nosig,
 543     coding_category_charset,
 544     coding_category_sjis,
 545     coding_category_big5,
 546     coding_category_ccl,
 547     coding_category_emacs_mule,
 548     /* All above are targets of code detection.  */
 549     coding_category_raw_text,
 550     coding_category_undecided,
 551     coding_category_max
 552   };
 553
 554 /* Definitions of flag bits used in detect_coding_XXXX.  */
 555 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 556 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 557 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 558 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 559 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 560 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 561 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 562 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 563 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 564 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 565 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 566 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 567 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 568 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 569 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 570 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 571 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 572 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 573 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 574 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 575
 576 /* This value is returned if detect_coding_mask () find nothing other
 577    than ASCII characters.  */
 578 #define CATEGORY_MASK_ANY               \
 579   (CATEGORY_MASK_ISO_7                  \
 580    | CATEGORY_MASK_ISO_7_TIGHT          \
 581    | CATEGORY_MASK_ISO_8_1              \
 582    | CATEGORY_MASK_ISO_8_2              \
 583    | CATEGORY_MASK_ISO_7_ELSE           \
 584    | CATEGORY_MASK_ISO_8_ELSE           \
 585    | CATEGORY_MASK_UTF_8_AUTO           \
 586    | CATEGORY_MASK_UTF_8_NOSIG          \
 587    | CATEGORY_MASK_UTF_8_SIG            \
 588    | CATEGORY_MASK_UTF_16_AUTO          \
 589    | CATEGORY_MASK_UTF_16_BE            \
 590    | CATEGORY_MASK_UTF_16_LE            \
 591    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 592    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 593    | CATEGORY_MASK_CHARSET              \
 594    | CATEGORY_MASK_SJIS                 \
 595    | CATEGORY_MASK_BIG5                 \
 596    | CATEGORY_MASK_CCL                  \
 597    | CATEGORY_MASK_EMACS_MULE)
 598
 599
 600 #define CATEGORY_MASK_ISO_7BIT \
 601   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 602
 603 #define CATEGORY_MASK_ISO_8BIT \
 604   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 605
 606 #define CATEGORY_MASK_ISO_ELSE \
 607   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 608
 609 #define CATEGORY_MASK_ISO_ESCAPE        \
 610   (CATEGORY_MASK_ISO_7                  \
 611    | CATEGORY_MASK_ISO_7_TIGHT          \
 612    | CATEGORY_MASK_ISO_7_ELSE           \
 613    | CATEGORY_MASK_ISO_8_ELSE)
 614
 615 #define CATEGORY_MASK_ISO       \
 616   (  CATEGORY_MASK_ISO_7BIT     \
 617      | CATEGORY_MASK_ISO_8BIT   \
 618      | CATEGORY_MASK_ISO_ELSE)
 619
 620 #define CATEGORY_MASK_UTF_16            \
 621   (CATEGORY_MASK_UTF_16_AUTO            \
 622    | CATEGORY_MASK_UTF_16_BE            \
 623    | CATEGORY_MASK_UTF_16_LE            \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 625    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 626
 627 #define CATEGORY_MASK_UTF_8     \
 628   (CATEGORY_MASK_UTF_8_AUTO     \
 629    | CATEGORY_MASK_UTF_8_NOSIG  \
 630    | CATEGORY_MASK_UTF_8_SIG)
 631
 632 /* Table of coding categories (Lisp symbols).  This variable is for
 633    internal use only.  */
 634 static Lisp_Object Vcoding_category_table;
 635
 636 /* Table of coding-categories ordered by priority.  */
 637 static enum coding_category coding_priorities[coding_category_max];
 638
 639 /* Nth element is a coding context for the coding system bound to the
 640    Nth coding category.  */
 641 static struct coding_system coding_categories[coding_category_max];
 642
 643 /*** Commonly used macros and functions ***/
 644
 645 #ifndef min
 646 #define min(a, b) ((a) < (b) ? (a) : (b))
 647 #endif
 648 #ifndef max
 649 #define max(a, b) ((a) > (b) ? (a) : (b))
 650 #endif
 651
 652 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 653   do {                                                  \
 654     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 655     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 656   } while (0)
 657
 658
 659 /* Safely get one byte from the source text pointed by SRC which ends
 660    at SRC_END, and set C to that byte.  If there are not enough bytes
 661    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 662    and a multibyte character is found at SRC, set C to the
 663    negative value of the character code.  The caller should declare
 664    and set these variables appropriately in advance:
 665         src, src_end, multibytep */
 666
 667 #define ONE_MORE_BYTE(c)                                \
 668   do {                                                  \
 669     if (src == src_end)                                 \
 670       {                                                 \
 671         if (src_base < src)                             \
 672           record_conversion_result                      \
 673             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 674         goto no_more_source;                            \
 675       }                                                 \
 676     c = *src++;                                         \
 677     if (multibytep && (c & 0x80))                       \
 678       {                                                 \
 679         if ((c & 0xFE) == 0xC0)                         \
 680           c = ((c & 1) << 6) | *src++;                  \
 681         else                                            \
 682           {                                             \
 683             src--;                                      \
 684             c = - string_char (src, &src, NULL);        \
 685             record_conversion_result                    \
 686               (coding, CODING_RESULT_INVALID_SRC);      \
 687           }                                             \
 688       }                                                 \
 689     consumed_chars++;                                   \
 690   } while (0)
 691
 692 /* Safely get two bytes from the source text pointed by SRC which ends
 693    at SRC_END, and set C1 and C2 to those bytes while skipping the
 694    heading multibyte characters.  If there are not enough bytes in the
 695    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 696    a multibyte character is found for C2, set C2 to the negative value
 697    of the character code.  The caller should declare and set these
 698    variables appropriately in advance:
 699         src, src_end, multibytep
 700    It is intended that this macro is used in detect_coding_utf_16.  */
 701
 702 #define TWO_MORE_BYTES(c1, c2)                          \
 703   do {                                                  \
 704     do {                                                \
 705       if (src == src_end)                               \
 706         goto no_more_source;                            \
 707       c1 = *src++;                                      \
 708       if (multibytep && (c1 & 0x80))                    \
 709         {                                               \
 710           if ((c1 & 0xFE) == 0xC0)                      \
 711             c1 = ((c1 & 1) << 6) | *src++;              \
 712           else                                          \
 713             {                                           \
 714               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 715               c1 = -1;                                  \
 716             }                                           \
 717         }                                               \
 718     } while (c1 < 0);                                   \
 719     if (src == src_end)                                 \
 720       goto no_more_source;                              \
 721     c2 = *src++;                                        \
 722     if (multibytep && (c2 & 0x80))                      \
 723       {                                                 \
 724         if ((c2 & 0xFE) == 0xC0)                        \
 725           c2 = ((c2 & 1) << 6) | *src++;                \
 726         else                                            \
 727           c2 = -1;                                      \
 728       }                                                 \
 729   } while (0)
 730
 731
 732 /* Store a byte C in the place pointed by DST and increment DST to the
 733    next free point, and increment PRODUCED_CHARS.  The caller should
 734    assure that C is 0..127, and declare and set the variable `dst'
 735    appropriately in advance.
 736 */
 737
 738
 739 #define EMIT_ONE_ASCII_BYTE(c)  \
 740   do {                          \
 741     produced_chars++;           \
 742     *dst++ = (c);               \
 743   } while (0)
 744
 745
 746 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 747
 748 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 749   do {                                  \
 750     produced_chars += 2;                \
 751     *dst++ = (c1), *dst++ = (c2);       \
 752   } while (0)
 753
 754
 755 /* Store a byte C in the place pointed by DST and increment DST to the
 756    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 757    store in an appropriate multibyte form.  The caller should
 758    declare and set the variables `dst' and `multibytep' appropriately
 759    in advance.  */
 760
 761 #define EMIT_ONE_BYTE(c)                \
 762   do {                                  \
 763     produced_chars++;                   \
 764     if (multibytep)                     \
 765       {                                 \
 766         unsigned ch = (c);              \
 767         if (ch >= 0x80)                 \
 768           ch = BYTE8_TO_CHAR (ch);      \
 769         CHAR_STRING_ADVANCE (ch, dst);  \
 770       }                                 \
 771     else                                \
 772       *dst++ = (c);                     \
 773   } while (0)
 774
 775
 776 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 777
 778 #define EMIT_TWO_BYTES(c1, c2)          \
 779   do {                                  \
 780     produced_chars += 2;                \
 781     if (multibytep)                     \
 782       {                                 \
 783         unsigned ch;                    \
 784                                         \
 785         ch = (c1);                      \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789         ch = (c2);                      \
 790         if (ch >= 0x80)                 \
 791           ch = BYTE8_TO_CHAR (ch);      \
 792         CHAR_STRING_ADVANCE (ch, dst);  \
 793       }                                 \
 794     else                                \
 795       {                                 \
 796         *dst++ = (c1);                  \
 797         *dst++ = (c2);                  \
 798       }                                 \
 799   } while (0)
 800
 801
 802 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 803   do {                                  \
 804     EMIT_ONE_BYTE (c1);                 \
 805     EMIT_TWO_BYTES (c2, c3);            \
 806   } while (0)
 807
 808
 809 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 810   do {                                          \
 811     EMIT_TWO_BYTES (c1, c2);                    \
 812     EMIT_TWO_BYTES (c3, c4);                    \
 813   } while (0)
 814
 815
 816 static void
 817 record_conversion_result (struct coding_system *coding,
 818                           enum coding_result_code result)
 819 {
 820   coding->result = result;
 821   switch (result)
 822     {
 823     case CODING_RESULT_INSUFFICIENT_SRC:
 824       Vlast_code_conversion_error = Qinsufficient_source;
 825       break;
 826     case CODING_RESULT_INVALID_SRC:
 827       Vlast_code_conversion_error = Qinvalid_source;
 828       break;
 829     case CODING_RESULT_INTERRUPT:
 830       Vlast_code_conversion_error = Qinterrupted;
 831       break;
 832     case CODING_RESULT_INSUFFICIENT_DST:
 833       /* Don't record this error in Vlast_code_conversion_error
 834          because it happens just temporarily and is resolved when the
 835          whole conversion is finished.  */
 836       break;
 837     case CODING_RESULT_SUCCESS:
 838       break;
 839     default:
 840       Vlast_code_conversion_error = intern ("Unknown error");
 841     }
 842 }
 843
 844 /* These wrapper macros are used to preserve validity of pointers into
 845    buffer text across calls to decode_char, encode_char, etc, which
 846    could cause relocation of buffers if it loads a charset map,
 847    because loading a charset map allocates large structures.  */
 848
 849 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 850   do {                                                                       \
 851     ptrdiff_t offset;                                                        \
 852                                                                              \
 853     charset_map_loaded = 0;                                                  \
 854     c = DECODE_CHAR (charset, code);                                         \
 855     if (charset_map_loaded                                                   \
 856         && (offset = coding_change_source (coding)))                         \
 857       {                                                                      \
 858         src += offset;                                                       \
 859         src_base += offset;                                                  \
 860         src_end += offset;                                                   \
 861       }                                                                      \
 862   } while (0)
 863
 864 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 865   do {                                                                  \
 866     ptrdiff_t offset;                                                   \
 867                                                                         \
 868     charset_map_loaded = 0;                                             \
 869     code = ENCODE_CHAR (charset, c);                                    \
 870     if (charset_map_loaded                                              \
 871         && (offset = coding_change_destination (coding)))               \
 872       {                                                                 \
 873         dst += offset;                                                  \
 874         dst_end += offset;                                              \
 875       }                                                                 \
 876   } while (0)
 877
 878 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 879   do {                                                                  \
 880     ptrdiff_t offset;                                                   \
 881                                                                         \
 882     charset_map_loaded = 0;                                             \
 883     charset = char_charset (c, charset_list, code_return);              \
 884     if (charset_map_loaded                                              \
 885         && (offset = coding_change_destination (coding)))               \
 886       {                                                                 \
 887         dst += offset;                                                  \
 888         dst_end += offset;                                              \
 889       }                                                                 \
 890   } while (0)
 891
 892 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 893   do {                                                                  \
 894     ptrdiff_t offset;                                                   \
 895                                                                         \
 896     charset_map_loaded = 0;                                             \
 897     result = CHAR_CHARSET_P (c, charset);                               \
 898     if (charset_map_loaded                                              \
 899         && (offset = coding_change_destination (coding)))               \
 900       {                                                                 \
 901         dst += offset;                                                  \
 902         dst_end += offset;                                              \
 903       }                                                                 \
 904   } while (0)
 905
 906
 907 /* If there are at least BYTES length of room at dst, allocate memory
 908    for coding->destination and update dst and dst_end.  We don't have
 909    to take care of coding->source which will be relocated.  It is
 910    handled by calling coding_set_source in encode_coding.  */
 911
 912 #define ASSURE_DESTINATION(bytes)                               \
 913   do {                                                          \
 914     if (dst + (bytes) >= dst_end)                               \
 915       {                                                         \
 916         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 917                                                                 \
 918         dst = alloc_destination (coding, more_bytes, dst);      \
 919         dst_end = coding->destination + coding->dst_bytes;      \
 920       }                                                         \
 921   } while (0)
 922
 923
 924 /* Store multibyte form of the character C in P, and advance P to the
 925    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 926    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 927    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 928
 929 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 930
 931 /* Return the character code of character whose multibyte form is at
 932    P, and advance P to the end of the multibyte form.  This used to be
 933    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 934    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 935
 936 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 937
 938 /* Set coding->source from coding->src_object.  */
 939
 940 static void
 941 coding_set_source (struct coding_system *coding)
 942 {
 943   if (BUFFERP (coding->src_object))
 944     {
 945       struct buffer *buf = XBUFFER (coding->src_object);
 946
 947       if (coding->src_pos < 0)
 948         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 949       else
 950         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 951     }
 952   else if (STRINGP (coding->src_object))
 953     {
 954       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 955     }
 956   else
 957     {
 958       /* Otherwise, the source is C string and is never relocated
 959          automatically.  Thus we don't have to update anything.  */
 960     }
 961 }
 962
 963
 964 /* Set coding->source from coding->src_object, and return how many
 965    bytes coding->source was changed.  */
 966
 967 static ptrdiff_t
 968 coding_change_source (struct coding_system *coding)
 969 {
 970   const unsigned char *orig = coding->source;
 971   coding_set_source (coding);
 972   return coding->source - orig;
 973 }
 974
 975
 976 /* Set coding->destination from coding->dst_object.  */
 977
 978 static void
 979 coding_set_destination (struct coding_system *coding)
 980 {
 981   if (BUFFERP (coding->dst_object))
 982     {
 983       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 984         {
 985           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 986           coding->dst_bytes = (GAP_END_ADDR
 987                                - (coding->src_bytes - coding->consumed)
 988                                - coding->destination);
 989         }
 990       else
 991         {
 992           /* We are sure that coding->dst_pos_byte is before the gap
 993              of the buffer. */
 994           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 995                                  + coding->dst_pos_byte - BEG_BYTE);
 996           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 997                                - coding->destination);
 998         }
 999     }
1000   else
1001     {
1002       /* Otherwise, the destination is C string and is never relocated
1003          automatically.  Thus we don't have to update anything.  */
1004     }
1005 }
1006
1007
1008 /* Set coding->destination from coding->dst_object, and return how
1009    many bytes coding->destination was changed.  */
1010
1011 static ptrdiff_t
1012 coding_change_destination (struct coding_system *coding)
1013 {
1014   const unsigned char *orig = coding->destination;
1015   coding_set_destination (coding);
1016   return coding->destination - orig;
1017 }
1018
1019
1020 static void
1021 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1022 {
1023   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1024     string_overflow ();
1025   coding->destination = xrealloc (coding->destination,
1026                                   coding->dst_bytes + bytes);
1027   coding->dst_bytes += bytes;
1028 }
1029
1030 static void
1031 coding_alloc_by_making_gap (struct coding_system *coding,
1032                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1033 {
1034   if (EQ (coding->src_object, coding->dst_object))
1035     {
1036       /* The gap may contain the produced data at the head and not-yet
1037          consumed data at the tail.  To preserve those data, we at
1038          first make the gap size to zero, then increase the gap
1039          size.  */
1040       ptrdiff_t add = GAP_SIZE;
1041
1042       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1043       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1044       make_gap (bytes);
1045       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1046       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1047     }
1048   else
1049     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1050 }
1051
1052
1053 static unsigned char *
1054 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1055                    unsigned char *dst)
1056 {
1057   ptrdiff_t offset = dst - coding->destination;
1058
1059   if (BUFFERP (coding->dst_object))
1060     {
1061       struct buffer *buf = XBUFFER (coding->dst_object);
1062
1063       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1064     }
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   coding_set_destination (coding);
1068   dst = coding->destination + offset;
1069   return dst;
1070 }
1071
1072 /** Macros for annotations.  */
1073
1074 /* An annotation data is stored in the array coding->charbuf in this
1075    format:
1076      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1077    LENGTH is the number of elements in the annotation.
1078    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1079    NCHARS is the number of characters in the text annotated.
1080
1081    The format of the following elements depend on ANNOTATION_MASK.
1082
1083    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1084    follows:
1085      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1086
1087    NBYTES is the number of bytes specified in the header part of
1088    old-style emacs-mule encoding, or 0 for the other kind of
1089    composition.
1090
1091    METHOD is one of enum composition_method.
1092
1093    Optional COMPOSITION-COMPONENTS are characters and composition
1094    rules.
1095
1096    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1097    follows.
1098
1099    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1100    recover from an invalid annotation, and should be skipped by
1101    produce_annotation.  */
1102
1103 /* Maximum length of the header of annotation data.  */
1104 #define MAX_ANNOTATION_LENGTH 5
1105
1106 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1107   do {                                                  \
1108     *(buf)++ = -(len);                                  \
1109     *(buf)++ = (mask);                                  \
1110     *(buf)++ = (nchars);                                \
1111     coding->annotated = 1;                              \
1112   } while (0);
1113
1114 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1115   do {                                                                      \
1116     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1117     *buf++ = nbytes;                                                        \
1118     *buf++ = method;                                                        \
1119   } while (0)
1120
1121
1122 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1123   do {                                                                  \
1124     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1125     *buf++ = id;                                                        \
1126   } while (0)
1127
1128
1129 /* Bitmasks for coding->eol_seen.  */
1130
1131 #define EOL_SEEN_NONE   0
1132 #define EOL_SEEN_LF     1
1133 #define EOL_SEEN_CR     2
1134 #define EOL_SEEN_CRLF   4
1135
1136 \f
1137 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1138
1139
1140
1141 \f
1142 /*** 3. UTF-8 ***/
1143
1144 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1145    Return true if a text is encoded in UTF-8.  */
1146
1147 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1148 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1149 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1150 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1151 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1152 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1153
1154 #define UTF_8_BOM_1 0xEF
1155 #define UTF_8_BOM_2 0xBB
1156 #define UTF_8_BOM_3 0xBF
1157
1158 /* Unlike the other detect_coding_XXX, this function counts number of
1159    characters and check EOL format.  */
1160
1161 static bool
1162 detect_coding_utf_8 (struct coding_system *coding,
1163                      struct coding_detection_info *detect_info)
1164 {
1165   const unsigned char *src = coding->source, *src_base;
1166   const unsigned char *src_end = coding->source + coding->src_bytes;
1167   bool multibytep = coding->src_multibyte;
1168   ptrdiff_t consumed_chars = 0;
1169   bool bom_found = 0;
1170   int nchars = coding->head_ascii;
1171   int eol_seen = coding->eol_seen;
1172
1173   detect_info->checked |= CATEGORY_MASK_UTF_8;
1174   /* A coding system of this category is always ASCII compatible.  */
1175   src += nchars;
1176
1177   if (src == coding->source     /* BOM should be at the head.  */
1178       && src + 3 < src_end      /* BOM is 3-byte long.  */
1179       && src[0] == UTF_8_BOM_1
1180       && src[1] == UTF_8_BOM_2
1181       && src[2] == UTF_8_BOM_3)
1182     {
1183       bom_found = 1;
1184       src += 3;
1185       nchars++;
1186     }
1187
1188   while (1)
1189     {
1190       int c, c1, c2, c3, c4;
1191
1192       src_base = src;
1193       ONE_MORE_BYTE (c);
1194       if (c < 0 || UTF_8_1_OCTET_P (c))
1195         {
1196           nchars++;
1197           if (c == '\r')
1198             {
1199               if (src < src_end && *src == '\n')
1200                 {
1201                   eol_seen |= EOL_SEEN_CRLF;
1202                   src++;
1203                   nchars++;
1204                 }
1205               else
1206                 eol_seen |= EOL_SEEN_CR;
1207             }
1208           else if (c == '\n')
1209             eol_seen |= EOL_SEEN_LF;
1210           continue;
1211         }
1212       ONE_MORE_BYTE (c1);
1213       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1214         break;
1215       if (UTF_8_2_OCTET_LEADING_P (c))
1216         {
1217           nchars++;
1218           continue;
1219         }
1220       ONE_MORE_BYTE (c2);
1221       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1222         break;
1223       if (UTF_8_3_OCTET_LEADING_P (c))
1224         {
1225           nchars++;
1226           continue;
1227         }
1228       ONE_MORE_BYTE (c3);
1229       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1230         break;
1231       if (UTF_8_4_OCTET_LEADING_P (c))
1232         {
1233           nchars++;
1234           continue;
1235         }
1236       ONE_MORE_BYTE (c4);
1237       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1238         break;
1239       if (UTF_8_5_OCTET_LEADING_P (c))
1240         {
1241           nchars++;
1242           continue;
1243         }
1244       break;
1245     }
1246   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1247   return 0;
1248
1249  no_more_source:
1250   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1251     {
1252       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1253       return 0;
1254     }
1255   if (bom_found)
1256     {
1257       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1258       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1259     }
1260   else
1261     {
1262       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1263       if (nchars < src_end - coding->source)
1264         /* The found characters are less than source bytes, which
1265            means that we found a valid non-ASCII characters.  */
1266         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1267     }
1268   coding->detected_utf8_chars = nchars;
1269   return 1;
1270 }
1271
1272
1273 static void
1274 decode_coding_utf_8 (struct coding_system *coding)
1275 {
1276   const unsigned char *src = coding->source + coding->consumed;
1277   const unsigned char *src_end = coding->source + coding->src_bytes;
1278   const unsigned char *src_base;
1279   int *charbuf = coding->charbuf + coding->charbuf_used;
1280   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1281   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1282   bool multibytep = coding->src_multibyte;
1283   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1284   bool eol_dos
1285     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1286   int byte_after_cr = -1;
1287
1288   if (bom != utf_without_bom)
1289     {
1290       int c1, c2, c3;
1291
1292       src_base = src;
1293       ONE_MORE_BYTE (c1);
1294       if (! UTF_8_3_OCTET_LEADING_P (c1))
1295         src = src_base;
1296       else
1297         {
1298           ONE_MORE_BYTE (c2);
1299           if (! UTF_8_EXTRA_OCTET_P (c2))
1300             src = src_base;
1301           else
1302             {
1303               ONE_MORE_BYTE (c3);
1304               if (! UTF_8_EXTRA_OCTET_P (c3))
1305                 src = src_base;
1306               else
1307                 {
1308                   if ((c1 != UTF_8_BOM_1)
1309                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1310                     src = src_base;
1311                   else
1312                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1313                 }
1314             }
1315         }
1316     }
1317   CODING_UTF_8_BOM (coding) = utf_without_bom;
1318
1319   while (1)
1320     {
1321       int c, c1, c2, c3, c4, c5;
1322
1323       src_base = src;
1324       consumed_chars_base = consumed_chars;
1325
1326       if (charbuf >= charbuf_end)
1327         {
1328           if (byte_after_cr >= 0)
1329             src_base--;
1330           break;
1331         }
1332
1333       if (byte_after_cr >= 0)
1334         c1 = byte_after_cr, byte_after_cr = -1;
1335       else
1336         ONE_MORE_BYTE (c1);
1337       if (c1 < 0)
1338         {
1339           c = - c1;
1340         }
1341       else if (UTF_8_1_OCTET_P (c1))
1342         {
1343           if (eol_dos && c1 == '\r')
1344             ONE_MORE_BYTE (byte_after_cr);
1345           c = c1;
1346         }
1347       else
1348         {
1349           ONE_MORE_BYTE (c2);
1350           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1351             goto invalid_code;
1352           if (UTF_8_2_OCTET_LEADING_P (c1))
1353             {
1354               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1355               /* Reject overlong sequences here and below.  Encoders
1356                  producing them are incorrect, they can be misleading,
1357                  and they mess up read/write invariance.  */
1358               if (c < 128)
1359                 goto invalid_code;
1360             }
1361           else
1362             {
1363               ONE_MORE_BYTE (c3);
1364               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1365                 goto invalid_code;
1366               if (UTF_8_3_OCTET_LEADING_P (c1))
1367                 {
1368                   c = (((c1 & 0xF) << 12)
1369                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1370                   if (c < 0x800
1371                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1372                     goto invalid_code;
1373                 }
1374               else
1375                 {
1376                   ONE_MORE_BYTE (c4);
1377                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1378                     goto invalid_code;
1379                   if (UTF_8_4_OCTET_LEADING_P (c1))
1380                     {
1381                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1382                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1383                     if (c < 0x10000)
1384                       goto invalid_code;
1385                     }
1386                   else
1387                     {
1388                       ONE_MORE_BYTE (c5);
1389                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1390                         goto invalid_code;
1391                       if (UTF_8_5_OCTET_LEADING_P (c1))
1392                         {
1393                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1394                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1395                                | (c5 & 0x3F));
1396                           if ((c > MAX_CHAR) || (c < 0x200000))
1397                             goto invalid_code;
1398                         }
1399                       else
1400                         goto invalid_code;
1401                     }
1402                 }
1403             }
1404         }
1405
1406       *charbuf++ = c;
1407       continue;
1408
1409     invalid_code:
1410       src = src_base;
1411       consumed_chars = consumed_chars_base;
1412       ONE_MORE_BYTE (c);
1413       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1414       coding->errors++;
1415     }
1416
1417  no_more_source:
1418   coding->consumed_char += consumed_chars_base;
1419   coding->consumed = src_base - coding->source;
1420   coding->charbuf_used = charbuf - coding->charbuf;
1421 }
1422
1423
1424 static bool
1425 encode_coding_utf_8 (struct coding_system *coding)
1426 {
1427   bool multibytep = coding->dst_multibyte;
1428   int *charbuf = coding->charbuf;
1429   int *charbuf_end = charbuf + coding->charbuf_used;
1430   unsigned char *dst = coding->destination + coding->produced;
1431   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1432   ptrdiff_t produced_chars = 0;
1433   int c;
1434
1435   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1436     {
1437       ASSURE_DESTINATION (3);
1438       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1439       CODING_UTF_8_BOM (coding) = utf_without_bom;
1440     }
1441
1442   if (multibytep)
1443     {
1444       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1445
1446       while (charbuf < charbuf_end)
1447         {
1448           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1449
1450           ASSURE_DESTINATION (safe_room);
1451           c = *charbuf++;
1452           if (CHAR_BYTE8_P (c))
1453             {
1454               c = CHAR_TO_BYTE8 (c);
1455               EMIT_ONE_BYTE (c);
1456             }
1457           else
1458             {
1459               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1460               for (p = str; p < pend; p++)
1461                 EMIT_ONE_BYTE (*p);
1462             }
1463         }
1464     }
1465   else
1466     {
1467       int safe_room = MAX_MULTIBYTE_LENGTH;
1468
1469       while (charbuf < charbuf_end)
1470         {
1471           ASSURE_DESTINATION (safe_room);
1472           c = *charbuf++;
1473           if (CHAR_BYTE8_P (c))
1474             *dst++ = CHAR_TO_BYTE8 (c);
1475           else
1476             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1477           produced_chars++;
1478         }
1479     }
1480   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1481   coding->produced_char += produced_chars;
1482   coding->produced = dst - coding->destination;
1483   return 0;
1484 }
1485
1486
1487 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1488    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1489
1490 #define UTF_16_HIGH_SURROGATE_P(val) \
1491   (((val) & 0xFC00) == 0xD800)
1492
1493 #define UTF_16_LOW_SURROGATE_P(val) \
1494   (((val) & 0xFC00) == 0xDC00)
1495
1496
1497 static bool
1498 detect_coding_utf_16 (struct coding_system *coding,
1499                       struct coding_detection_info *detect_info)
1500 {
1501   const unsigned char *src = coding->source;
1502   const unsigned char *src_end = coding->source + coding->src_bytes;
1503   bool multibytep = coding->src_multibyte;
1504   int c1, c2;
1505
1506   detect_info->checked |= CATEGORY_MASK_UTF_16;
1507   if (coding->mode & CODING_MODE_LAST_BLOCK
1508       && (coding->src_chars & 1))
1509     {
1510       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1511       return 0;
1512     }
1513
1514   TWO_MORE_BYTES (c1, c2);
1515   if ((c1 == 0xFF) && (c2 == 0xFE))
1516     {
1517       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1518                              | CATEGORY_MASK_UTF_16_AUTO);
1519       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1520                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1521                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1522     }
1523   else if ((c1 == 0xFE) && (c2 == 0xFF))
1524     {
1525       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1526                              | CATEGORY_MASK_UTF_16_AUTO);
1527       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1528                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1529                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1530     }
1531   else if (c2 < 0)
1532     {
1533       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1534       return 0;
1535     }
1536   else
1537     {
1538       /* We check the dispersion of Eth and Oth bytes where E is even and
1539          O is odd.  If both are high, we assume binary data.*/
1540       unsigned char e[256], o[256];
1541       unsigned e_num = 1, o_num = 1;
1542
1543       memset (e, 0, 256);
1544       memset (o, 0, 256);
1545       e[c1] = 1;
1546       o[c2] = 1;
1547
1548       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1549                                 |CATEGORY_MASK_UTF_16_BE
1550                                 | CATEGORY_MASK_UTF_16_LE);
1551
1552       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1553              != CATEGORY_MASK_UTF_16)
1554         {
1555           TWO_MORE_BYTES (c1, c2);
1556           if (c2 < 0)
1557             break;
1558           if (! e[c1])
1559             {
1560               e[c1] = 1;
1561               e_num++;
1562               if (e_num >= 128)
1563                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1564             }
1565           if (! o[c2])
1566             {
1567               o[c2] = 1;
1568               o_num++;
1569               if (o_num >= 128)
1570                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1571             }
1572         }
1573       return 0;
1574     }
1575
1576  no_more_source:
1577   return 1;
1578 }
1579
1580 static void
1581 decode_coding_utf_16 (struct coding_system *coding)
1582 {
1583   const unsigned char *src = coding->source + coding->consumed;
1584   const unsigned char *src_end = coding->source + coding->src_bytes;
1585   const unsigned char *src_base;
1586   int *charbuf = coding->charbuf + coding->charbuf_used;
1587   /* We may produces at most 3 chars in one loop.  */
1588   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1589   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1590   bool multibytep = coding->src_multibyte;
1591   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1592   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1593   int surrogate = CODING_UTF_16_SURROGATE (coding);
1594   bool eol_dos
1595     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1596   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1597
1598   if (bom == utf_with_bom)
1599     {
1600       int c, c1, c2;
1601
1602       src_base = src;
1603       ONE_MORE_BYTE (c1);
1604       ONE_MORE_BYTE (c2);
1605       c = (c1 << 8) | c2;
1606
1607       if (endian == utf_16_big_endian
1608           ? c != 0xFEFF : c != 0xFFFE)
1609         {
1610           /* The first two bytes are not BOM.  Treat them as bytes
1611              for a normal character.  */
1612           src = src_base;
1613           coding->errors++;
1614         }
1615       CODING_UTF_16_BOM (coding) = utf_without_bom;
1616     }
1617   else if (bom == utf_detect_bom)
1618     {
1619       /* We have already tried to detect BOM and failed in
1620          detect_coding.  */
1621       CODING_UTF_16_BOM (coding) = utf_without_bom;
1622     }
1623
1624   while (1)
1625     {
1626       int c, c1, c2;
1627
1628       src_base = src;
1629       consumed_chars_base = consumed_chars;
1630
1631       if (charbuf >= charbuf_end)
1632         {
1633           if (byte_after_cr1 >= 0)
1634             src_base -= 2;
1635           break;
1636         }
1637
1638       if (byte_after_cr1 >= 0)
1639         c1 = byte_after_cr1, byte_after_cr1 = -1;
1640       else
1641         ONE_MORE_BYTE (c1);
1642       if (c1 < 0)
1643         {
1644           *charbuf++ = -c1;
1645           continue;
1646         }
1647       if (byte_after_cr2 >= 0)
1648         c2 = byte_after_cr2, byte_after_cr2 = -1;
1649       else
1650         ONE_MORE_BYTE (c2);
1651       if (c2 < 0)
1652         {
1653           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1654           *charbuf++ = -c2;
1655           continue;
1656         }
1657       c = (endian == utf_16_big_endian
1658            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1659
1660       if (surrogate)
1661         {
1662           if (! UTF_16_LOW_SURROGATE_P (c))
1663             {
1664               if (endian == utf_16_big_endian)
1665                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1666               else
1667                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1668               *charbuf++ = c1;
1669               *charbuf++ = c2;
1670               coding->errors++;
1671               if (UTF_16_HIGH_SURROGATE_P (c))
1672                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1673               else
1674                 *charbuf++ = c;
1675             }
1676           else
1677             {
1678               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1679               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1680               *charbuf++ = 0x10000 + c;
1681             }
1682         }
1683       else
1684         {
1685           if (UTF_16_HIGH_SURROGATE_P (c))
1686             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1687           else
1688             {
1689               if (eol_dos && c == '\r')
1690                 {
1691                   ONE_MORE_BYTE (byte_after_cr1);
1692                   ONE_MORE_BYTE (byte_after_cr2);
1693                 }
1694               *charbuf++ = c;
1695             }
1696         }
1697     }
1698
1699  no_more_source:
1700   coding->consumed_char += consumed_chars_base;
1701   coding->consumed = src_base - coding->source;
1702   coding->charbuf_used = charbuf - coding->charbuf;
1703 }
1704
1705 static bool
1706 encode_coding_utf_16 (struct coding_system *coding)
1707 {
1708   bool multibytep = coding->dst_multibyte;
1709   int *charbuf = coding->charbuf;
1710   int *charbuf_end = charbuf + coding->charbuf_used;
1711   unsigned char *dst = coding->destination + coding->produced;
1712   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1713   int safe_room = 8;
1714   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1715   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1716   ptrdiff_t produced_chars = 0;
1717   int c;
1718
1719   if (bom != utf_without_bom)
1720     {
1721       ASSURE_DESTINATION (safe_room);
1722       if (big_endian)
1723         EMIT_TWO_BYTES (0xFE, 0xFF);
1724       else
1725         EMIT_TWO_BYTES (0xFF, 0xFE);
1726       CODING_UTF_16_BOM (coding) = utf_without_bom;
1727     }
1728
1729   while (charbuf < charbuf_end)
1730     {
1731       ASSURE_DESTINATION (safe_room);
1732       c = *charbuf++;
1733       if (c > MAX_UNICODE_CHAR)
1734         c = coding->default_char;
1735
1736       if (c < 0x10000)
1737         {
1738           if (big_endian)
1739             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1740           else
1741             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1742         }
1743       else
1744         {
1745           int c1, c2;
1746
1747           c -= 0x10000;
1748           c1 = (c >> 10) + 0xD800;
1749           c2 = (c & 0x3FF) + 0xDC00;
1750           if (big_endian)
1751             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1752           else
1753             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1754         }
1755     }
1756   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1757   coding->produced = dst - coding->destination;
1758   coding->produced_char += produced_chars;
1759   return 0;
1760 }
1761
1762 \f
1763 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1764
1765 /* Emacs' internal format for representation of multiple character
1766    sets is a kind of multi-byte encoding, i.e. characters are
1767    represented by variable-length sequences of one-byte codes.
1768
1769    ASCII characters and control characters (e.g. `tab', `newline') are
1770    represented by one-byte sequences which are their ASCII codes, in
1771    the range 0x00 through 0x7F.
1772
1773    8-bit characters of the range 0x80..0x9F are represented by
1774    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1775    code + 0x20).
1776
1777    8-bit characters of the range 0xA0..0xFF are represented by
1778    one-byte sequences which are their 8-bit code.
1779
1780    The other characters are represented by a sequence of `base
1781    leading-code', optional `extended leading-code', and one or two
1782    `position-code's.  The length of the sequence is determined by the
1783    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1784    whereas extended leading-code and position-code take the range 0xA0
1785    through 0xFF.  See `charset.h' for more details about leading-code
1786    and position-code.
1787
1788    --- CODE RANGE of Emacs' internal format ---
1789    character set        range
1790    -------------        -----
1791    ascii                0x00..0x7F
1792    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1793    eight-bit-graphic    0xA0..0xBF
1794    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1795    ---------------------------------------------
1796
1797    As this is the internal character representation, the format is
1798    usually not used externally (i.e. in a file or in a data sent to a
1799    process).  But, it is possible to have a text externally in this
1800    format (i.e. by encoding by the coding system `emacs-mule').
1801
1802    In that case, a sequence of one-byte codes has a slightly different
1803    form.
1804
1805    At first, all characters in eight-bit-control are represented by
1806    one-byte sequences which are their 8-bit code.
1807
1808    Next, character composition data are represented by the byte
1809    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1810    where,
1811         METHOD is 0xF2 plus one of composition method (enum
1812         composition_method),
1813
1814         BYTES is 0xA0 plus a byte length of this composition data,
1815
1816         CHARS is 0xA0 plus a number of characters composed by this
1817         data,
1818
1819         COMPONENTs are characters of multibyte form or composition
1820         rules encoded by two-byte of ASCII codes.
1821
1822    In addition, for backward compatibility, the following formats are
1823    also recognized as composition data on decoding.
1824
1825    0x80 MSEQ ...
1826    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1827
1828    Here,
1829         MSEQ is a multibyte form but in these special format:
1830           ASCII: 0xA0 ASCII_CODE+0x80,
1831           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1832         RULE is a one byte code of the range 0xA0..0xF0 that
1833         represents a composition rule.
1834   */
1835
1836 char emacs_mule_bytes[256];
1837
1838
1839 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1840    Return true if a text is encoded in 'emacs-mule'.  */
1841
1842 static bool
1843 detect_coding_emacs_mule (struct coding_system *coding,
1844                           struct coding_detection_info *detect_info)
1845 {
1846   const unsigned char *src = coding->source, *src_base;
1847   const unsigned char *src_end = coding->source + coding->src_bytes;
1848   bool multibytep = coding->src_multibyte;
1849   ptrdiff_t consumed_chars = 0;
1850   int c;
1851   int found = 0;
1852
1853   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1854   /* A coding system of this category is always ASCII compatible.  */
1855   src += coding->head_ascii;
1856
1857   while (1)
1858     {
1859       src_base = src;
1860       ONE_MORE_BYTE (c);
1861       if (c < 0)
1862         continue;
1863       if (c == 0x80)
1864         {
1865           /* Perhaps the start of composite character.  We simply skip
1866              it because analyzing it is too heavy for detecting.  But,
1867              at least, we check that the composite character
1868              constitutes of more than 4 bytes.  */
1869           const unsigned char *src_start;
1870
1871         repeat:
1872           src_start = src;
1873           do
1874             {
1875               ONE_MORE_BYTE (c);
1876             }
1877           while (c >= 0xA0);
1878
1879           if (src - src_start <= 4)
1880             break;
1881           found = CATEGORY_MASK_EMACS_MULE;
1882           if (c == 0x80)
1883             goto repeat;
1884         }
1885
1886       if (c < 0x80)
1887         {
1888           if (c < 0x20
1889               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1890             break;
1891         }
1892       else
1893         {
1894           int more_bytes = emacs_mule_bytes[c] - 1;
1895
1896           while (more_bytes > 0)
1897             {
1898               ONE_MORE_BYTE (c);
1899               if (c < 0xA0)
1900                 {
1901                   src--;        /* Unread the last byte.  */
1902                   break;
1903                 }
1904               more_bytes--;
1905             }
1906           if (more_bytes != 0)
1907             break;
1908           found = CATEGORY_MASK_EMACS_MULE;
1909         }
1910     }
1911   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1912   return 0;
1913
1914  no_more_source:
1915   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1916     {
1917       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1918       return 0;
1919     }
1920   detect_info->found |= found;
1921   return 1;
1922 }
1923
1924
1925 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1926    character.  If CMP_STATUS indicates that we must expect MSEQ or
1927    RULE described above, decode it and return the negative value of
1928    the decoded character or rule.  If an invalid byte is found, return
1929    -1.  If SRC is too short, return -2.  */
1930
1931 static int
1932 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1933                  int *nbytes, int *nchars, int *id,
1934                  struct composition_status *cmp_status)
1935 {
1936   const unsigned char *src_end = coding->source + coding->src_bytes;
1937   const unsigned char *src_base = src;
1938   bool multibytep = coding->src_multibyte;
1939   int charset_ID;
1940   unsigned code;
1941   int c;
1942   int consumed_chars = 0;
1943   bool mseq_found = 0;
1944
1945   ONE_MORE_BYTE (c);
1946   if (c < 0)
1947     {
1948       c = -c;
1949       charset_ID = emacs_mule_charset[0];
1950     }
1951   else
1952     {
1953       if (c >= 0xA0)
1954         {
1955           if (cmp_status->state != COMPOSING_NO
1956               && cmp_status->old_form)
1957             {
1958               if (cmp_status->state == COMPOSING_CHAR)
1959                 {
1960                   if (c == 0xA0)
1961                     {
1962                       ONE_MORE_BYTE (c);
1963                       c -= 0x80;
1964                       if (c < 0)
1965                         goto invalid_code;
1966                     }
1967                   else
1968                     c -= 0x20;
1969                   mseq_found = 1;
1970                 }
1971               else
1972                 {
1973                   *nbytes = src - src_base;
1974                   *nchars = consumed_chars;
1975                   return -c;
1976                 }
1977             }
1978           else
1979             goto invalid_code;
1980         }
1981
1982       switch (emacs_mule_bytes[c])
1983         {
1984         case 2:
1985           if ((charset_ID = emacs_mule_charset[c]) < 0)
1986             goto invalid_code;
1987           ONE_MORE_BYTE (c);
1988           if (c < 0xA0)
1989             goto invalid_code;
1990           code = c & 0x7F;
1991           break;
1992
1993         case 3:
1994           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1995               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1996             {
1997               ONE_MORE_BYTE (c);
1998               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1999                 goto invalid_code;
2000               ONE_MORE_BYTE (c);
2001               if (c < 0xA0)
2002                 goto invalid_code;
2003               code = c & 0x7F;
2004             }
2005           else
2006             {
2007               if ((charset_ID = emacs_mule_charset[c]) < 0)
2008                 goto invalid_code;
2009               ONE_MORE_BYTE (c);
2010               if (c < 0xA0)
2011                 goto invalid_code;
2012               code = (c & 0x7F) << 8;
2013               ONE_MORE_BYTE (c);
2014               if (c < 0xA0)
2015                 goto invalid_code;
2016               code |= c & 0x7F;
2017             }
2018           break;
2019
2020         case 4:
2021           ONE_MORE_BYTE (c);
2022           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2023             goto invalid_code;
2024           ONE_MORE_BYTE (c);
2025           if (c < 0xA0)
2026             goto invalid_code;
2027           code = (c & 0x7F) << 8;
2028           ONE_MORE_BYTE (c);
2029           if (c < 0xA0)
2030             goto invalid_code;
2031           code |= c & 0x7F;
2032           break;
2033
2034         case 1:
2035           code = c;
2036           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2037           break;
2038
2039         default:
2040           emacs_abort ();
2041         }
2042       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2043                           CHARSET_FROM_ID (charset_ID), code, c);
2044       if (c < 0)
2045         goto invalid_code;
2046     }
2047   *nbytes = src - src_base;
2048   *nchars = consumed_chars;
2049   if (id)
2050     *id = charset_ID;
2051   return (mseq_found ? -c : c);
2052
2053  no_more_source:
2054   return -2;
2055
2056  invalid_code:
2057   return -1;
2058 }
2059
2060
2061 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2062
2063 /* Handle these composition sequence ('|': the end of header elements,
2064    BYTES and CHARS >= 0xA0):
2065
2066    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2067    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2068    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2069
2070    and these old form:
2071
2072    (4) relative composition: 0x80 | MSEQ ... MSEQ
2073    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2074
2075    When the starter 0x80 and the following header elements are found,
2076    this annotation header is produced.
2077
2078         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2079
2080    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2081    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2082
2083    Then, upon reading the following elements, these codes are produced
2084    until the composition end is found:
2085
2086    (1) CHAR ... CHAR
2087    (2) ALT ... ALT CHAR ... CHAR
2088    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2089    (4) CHAR ... CHAR
2090    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2091
2092    When the composition end is found, LENGTH and NCHARS in the
2093    annotation header is updated as below:
2094
2095    (1) LENGTH: unchanged, NCHARS: unchanged
2096    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2097    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2098    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2099    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2100
2101    If an error is found while composing, the annotation header is
2102    changed to the original composition header (plus filler -1s) as
2103    below:
2104
2105    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2106    (5)          [ 0x80 0xFF -1 -1- -1 ]
2107
2108    and the sequence [ -2 DECODED-RULE ] is changed to the original
2109    byte sequence as below:
2110         o the original byte sequence is B: [ B -1 ]
2111         o the original byte sequence is B1 B2: [ B1 B2 ]
2112
2113    Most of the routines are implemented by macros because many
2114    variables and labels in the caller decode_coding_emacs_mule must be
2115    accessible, and they are usually called just once (thus doesn't
2116    increase the size of compiled object).  */
2117
2118 /* Decode a composition rule represented by C as a component of
2119    composition sequence of Emacs 20 style.  Set RULE to the decoded
2120    rule. */
2121
2122 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2123   do {                                                  \
2124     int gref, nref;                                     \
2125                                                         \
2126     c -= 0xA0;                                          \
2127     if (c < 0 || c >= 81)                               \
2128       goto invalid_code;                                \
2129     gref = c / 9, nref = c % 9;                         \
2130     if (gref == 4) gref = 10;                           \
2131     if (nref == 4) nref = 10;                           \
2132     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2133   } while (0)
2134
2135
2136 /* Decode a composition rule represented by C and the following byte
2137    at SRC as a component of composition sequence of Emacs 21 style.
2138    Set RULE to the decoded rule.  */
2139
2140 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2141   do {                                                  \
2142     int gref, nref;                                     \
2143                                                         \
2144     gref = c - 0x20;                                    \
2145     if (gref < 0 || gref >= 81)                         \
2146       goto invalid_code;                                \
2147     ONE_MORE_BYTE (c);                                  \
2148     nref = c - 0x20;                                    \
2149     if (nref < 0 || nref >= 81)                         \
2150       goto invalid_code;                                \
2151     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2152   } while (0)
2153
2154
2155 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2156    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2157    byte length of this composition information, CHARS is the number of
2158    characters composed by this composition.  */
2159
2160 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2161   do {                                                                  \
2162     enum composition_method method = c - 0xF2;                          \
2163     int nbytes, nchars;                                                 \
2164                                                                         \
2165     ONE_MORE_BYTE (c);                                                  \
2166     if (c < 0)                                                          \
2167       goto invalid_code;                                                \
2168     nbytes = c - 0xA0;                                                  \
2169     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2170       goto invalid_code;                                                \
2171     ONE_MORE_BYTE (c);                                                  \
2172     nchars = c - 0xA0;                                                  \
2173     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2174       goto invalid_code;                                                \
2175     cmp_status->old_form = 0;                                           \
2176     cmp_status->method = method;                                        \
2177     if (method == COMPOSITION_RELATIVE)                                 \
2178       cmp_status->state = COMPOSING_CHAR;                               \
2179     else                                                                \
2180       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2181     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2182     cmp_status->nchars = nchars;                                        \
2183     cmp_status->ncomps = nbytes - 4;                                    \
2184     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2185   } while (0)
2186
2187
2188 /* Start of Emacs 20 style format for relative composition.  */
2189
2190 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2191   do {                                                          \
2192     cmp_status->old_form = 1;                                   \
2193     cmp_status->method = COMPOSITION_RELATIVE;                  \
2194     cmp_status->state = COMPOSING_CHAR;                         \
2195     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2196     cmp_status->nchars = cmp_status->ncomps = 0;                \
2197     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2198   } while (0)
2199
2200
2201 /* Start of Emacs 20 style format for rule-base composition.  */
2202
2203 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2204   do {                                                          \
2205     cmp_status->old_form = 1;                                   \
2206     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2207     cmp_status->state = COMPOSING_CHAR;                         \
2208     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2209     cmp_status->nchars = cmp_status->ncomps = 0;                \
2210     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2211   } while (0)
2212
2213
2214 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2215   do {                                                  \
2216     const unsigned char *current_src = src;             \
2217                                                         \
2218     ONE_MORE_BYTE (c);                                  \
2219     if (c < 0)                                          \
2220       goto invalid_code;                                \
2221     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2222         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2223       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2224     else if (c < 0xA0)                                  \
2225       goto invalid_code;                                \
2226     else if (c < 0xC0)                                  \
2227       {                                                 \
2228         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2229         /* Re-read C as a composition component.  */    \
2230         src = current_src;                              \
2231       }                                                 \
2232     else if (c == 0xFF)                                 \
2233       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2234     else                                                \
2235       goto invalid_code;                                \
2236   } while (0)
2237
2238 #define EMACS_MULE_COMPOSITION_END()                            \
2239   do {                                                          \
2240     int idx = - cmp_status->length;                             \
2241                                                                 \
2242     if (cmp_status->old_form)                                   \
2243       charbuf[idx + 2] = cmp_status->nchars;                    \
2244     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2245       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2246     cmp_status->state = COMPOSING_NO;                           \
2247   } while (0)
2248
2249
2250 static int
2251 emacs_mule_finish_composition (int *charbuf,
2252                                struct composition_status *cmp_status)
2253 {
2254   int idx = - cmp_status->length;
2255   int new_chars;
2256
2257   if (cmp_status->old_form && cmp_status->nchars > 0)
2258     {
2259       charbuf[idx + 2] = cmp_status->nchars;
2260       new_chars = 0;
2261       if (cmp_status->method == COMPOSITION_WITH_RULE
2262           && cmp_status->state == COMPOSING_CHAR)
2263         {
2264           /* The last rule was invalid.  */
2265           int rule = charbuf[-1] + 0xA0;
2266
2267           charbuf[-2] = BYTE8_TO_CHAR (rule);
2268           charbuf[-1] = -1;
2269           new_chars = 1;
2270         }
2271     }
2272   else
2273     {
2274       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2275
2276       if (cmp_status->method == COMPOSITION_WITH_RULE)
2277         {
2278           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2279           charbuf[idx++] = -3;
2280           charbuf[idx++] = 0;
2281           new_chars = 1;
2282         }
2283       else
2284         {
2285           int nchars = charbuf[idx + 1] + 0xA0;
2286           int nbytes = charbuf[idx + 2] + 0xA0;
2287
2288           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2289           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2290           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2291           charbuf[idx++] = -1;
2292           new_chars = 4;
2293         }
2294     }
2295   cmp_status->state = COMPOSING_NO;
2296   return new_chars;
2297 }
2298
2299 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2300   do {                                                                    \
2301     if (cmp_status->state != COMPOSING_NO)                                \
2302       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2303   } while (0)
2304
2305
2306 static void
2307 decode_coding_emacs_mule (struct coding_system *coding)
2308 {
2309   const unsigned char *src = coding->source + coding->consumed;
2310   const unsigned char *src_end = coding->source + coding->src_bytes;
2311   const unsigned char *src_base;
2312   int *charbuf = coding->charbuf + coding->charbuf_used;
2313   /* We may produce two annotations (charset and composition) in one
2314      loop and one more charset annotation at the end.  */
2315   int *charbuf_end
2316     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2317       /* We can produce up to 2 characters in a loop.  */
2318       - 1;
2319   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2320   bool multibytep = coding->src_multibyte;
2321   ptrdiff_t char_offset = coding->produced_char;
2322   ptrdiff_t last_offset = char_offset;
2323   int last_id = charset_ascii;
2324   bool eol_dos
2325     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2326   int byte_after_cr = -1;
2327   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2328
2329   if (cmp_status->state != COMPOSING_NO)
2330     {
2331       int i;
2332
2333       if (charbuf_end - charbuf < cmp_status->length)
2334         emacs_abort ();
2335       for (i = 0; i < cmp_status->length; i++)
2336         *charbuf++ = cmp_status->carryover[i];
2337       coding->annotated = 1;
2338     }
2339
2340   while (1)
2341     {
2342       int c, id IF_LINT (= 0);
2343
2344       src_base = src;
2345       consumed_chars_base = consumed_chars;
2346
2347       if (charbuf >= charbuf_end)
2348         {
2349           if (byte_after_cr >= 0)
2350             src_base--;
2351           break;
2352         }
2353
2354       if (byte_after_cr >= 0)
2355         c = byte_after_cr, byte_after_cr = -1;
2356       else
2357         ONE_MORE_BYTE (c);
2358
2359       if (c < 0 || c == 0x80)
2360         {
2361           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2362           if (c < 0)
2363             {
2364               *charbuf++ = -c;
2365               char_offset++;
2366             }
2367           else
2368             DECODE_EMACS_MULE_COMPOSITION_START ();
2369           continue;
2370         }
2371
2372       if (c < 0x80)
2373         {
2374           if (eol_dos && c == '\r')
2375             ONE_MORE_BYTE (byte_after_cr);
2376           id = charset_ascii;
2377           if (cmp_status->state != COMPOSING_NO)
2378             {
2379               if (cmp_status->old_form)
2380                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2381               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2382                 cmp_status->ncomps--;
2383             }
2384         }
2385       else
2386         {
2387           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2388           /* emacs_mule_char can load a charset map from a file, which
2389              allocates a large structure and might cause buffer text
2390              to be relocated as result.  Thus, we need to remember the
2391              original pointer to buffer text, and fix up all related
2392              pointers after the call.  */
2393           const unsigned char *orig = coding->source;
2394           ptrdiff_t offset;
2395
2396           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2397                                cmp_status);
2398           offset = coding->source - orig;
2399           if (offset)
2400             {
2401               src += offset;
2402               src_base += offset;
2403               src_end += offset;
2404             }
2405           if (c < 0)
2406             {
2407               if (c == -1)
2408                 goto invalid_code;
2409               if (c == -2)
2410                 break;
2411             }
2412           src = src_base + nbytes;
2413           consumed_chars = consumed_chars_base + nchars;
2414           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2415             cmp_status->ncomps -= nchars;
2416         }
2417
2418       /* Now if C >= 0, we found a normally encoded character, if C <
2419          0, we found an old-style composition component character or
2420          rule.  */
2421
2422       if (cmp_status->state == COMPOSING_NO)
2423         {
2424           if (last_id != id)
2425             {
2426               if (last_id != charset_ascii)
2427                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2428                                   last_id);
2429               last_id = id;
2430               last_offset = char_offset;
2431             }
2432           *charbuf++ = c;
2433           char_offset++;
2434         }
2435       else if (cmp_status->state == COMPOSING_CHAR)
2436         {
2437           if (cmp_status->old_form)
2438             {
2439               if (c >= 0)
2440                 {
2441                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2442                   *charbuf++ = c;
2443                   char_offset++;
2444                 }
2445               else
2446                 {
2447                   *charbuf++ = -c;
2448                   cmp_status->nchars++;
2449                   cmp_status->length++;
2450                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2451                     EMACS_MULE_COMPOSITION_END ();
2452                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2453                     cmp_status->state = COMPOSING_RULE;
2454                 }
2455             }
2456           else
2457             {
2458               *charbuf++ = c;
2459               cmp_status->length++;
2460               cmp_status->nchars--;
2461               if (cmp_status->nchars == 0)
2462                 EMACS_MULE_COMPOSITION_END ();
2463             }
2464         }
2465       else if (cmp_status->state == COMPOSING_RULE)
2466         {
2467           int rule;
2468
2469           if (c >= 0)
2470             {
2471               EMACS_MULE_COMPOSITION_END ();
2472               *charbuf++ = c;
2473               char_offset++;
2474             }
2475           else
2476             {
2477               c = -c;
2478               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2479               if (rule < 0)
2480                 goto invalid_code;
2481               *charbuf++ = -2;
2482               *charbuf++ = rule;
2483               cmp_status->length += 2;
2484               cmp_status->state = COMPOSING_CHAR;
2485             }
2486         }
2487       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2488         {
2489           *charbuf++ = c;
2490           cmp_status->length++;
2491           if (cmp_status->ncomps == 0)
2492             cmp_status->state = COMPOSING_CHAR;
2493           else if (cmp_status->ncomps > 0)
2494             {
2495               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2496                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2497             }
2498           else
2499             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2500         }
2501       else                      /* COMPOSING_COMPONENT_RULE */
2502         {
2503           int rule;
2504
2505           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2506           if (rule < 0)
2507             goto invalid_code;
2508           *charbuf++ = -2;
2509           *charbuf++ = rule;
2510           cmp_status->length += 2;
2511           cmp_status->ncomps--;
2512           if (cmp_status->ncomps > 0)
2513             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2514           else
2515             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2516         }
2517       continue;
2518
2519     invalid_code:
2520       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2521       src = src_base;
2522       consumed_chars = consumed_chars_base;
2523       ONE_MORE_BYTE (c);
2524       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2525       char_offset++;
2526       coding->errors++;
2527     }
2528
2529  no_more_source:
2530   if (cmp_status->state != COMPOSING_NO)
2531     {
2532       if (coding->mode & CODING_MODE_LAST_BLOCK)
2533         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2534       else
2535         {
2536           int i;
2537
2538           charbuf -= cmp_status->length;
2539           for (i = 0; i < cmp_status->length; i++)
2540             cmp_status->carryover[i] = charbuf[i];
2541         }
2542     }
2543   if (last_id != charset_ascii)
2544     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2545   coding->consumed_char += consumed_chars_base;
2546   coding->consumed = src_base - coding->source;
2547   coding->charbuf_used = charbuf - coding->charbuf;
2548 }
2549
2550
2551 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2552   do {                                          \
2553     if (id < 0xA0)                              \
2554       codes[0] = id, codes[1] = 0;              \
2555     else if (id < 0xE0)                         \
2556       codes[0] = 0x9A, codes[1] = id;           \
2557     else if (id < 0xF0)                         \
2558       codes[0] = 0x9B, codes[1] = id;           \
2559     else if (id < 0xF5)                         \
2560       codes[0] = 0x9C, codes[1] = id;           \
2561     else                                        \
2562       codes[0] = 0x9D, codes[1] = id;           \
2563   } while (0);
2564
2565
2566 static bool
2567 encode_coding_emacs_mule (struct coding_system *coding)
2568 {
2569   bool multibytep = coding->dst_multibyte;
2570   int *charbuf = coding->charbuf;
2571   int *charbuf_end = charbuf + coding->charbuf_used;
2572   unsigned char *dst = coding->destination + coding->produced;
2573   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2574   int safe_room = 8;
2575   ptrdiff_t produced_chars = 0;
2576   Lisp_Object attrs, charset_list;
2577   int c;
2578   int preferred_charset_id = -1;
2579
2580   CODING_GET_INFO (coding, attrs, charset_list);
2581   if (! EQ (charset_list, Vemacs_mule_charset_list))
2582     {
2583       charset_list = Vemacs_mule_charset_list;
2584       ASET (attrs, coding_attr_charset_list, charset_list);
2585     }
2586
2587   while (charbuf < charbuf_end)
2588     {
2589       ASSURE_DESTINATION (safe_room);
2590       c = *charbuf++;
2591
2592       if (c < 0)
2593         {
2594           /* Handle an annotation.  */
2595           switch (*charbuf)
2596             {
2597             case CODING_ANNOTATE_COMPOSITION_MASK:
2598               /* Not yet implemented.  */
2599               break;
2600             case CODING_ANNOTATE_CHARSET_MASK:
2601               preferred_charset_id = charbuf[3];
2602               if (preferred_charset_id >= 0
2603                   && NILP (Fmemq (make_number (preferred_charset_id),
2604                                   charset_list)))
2605                 preferred_charset_id = -1;
2606               break;
2607             default:
2608               emacs_abort ();
2609             }
2610           charbuf += -c - 1;
2611           continue;
2612         }
2613
2614       if (ASCII_CHAR_P (c))
2615         EMIT_ONE_ASCII_BYTE (c);
2616       else if (CHAR_BYTE8_P (c))
2617         {
2618           c = CHAR_TO_BYTE8 (c);
2619           EMIT_ONE_BYTE (c);
2620         }
2621       else
2622         {
2623           struct charset *charset;
2624           unsigned code;
2625           int dimension;
2626           int emacs_mule_id;
2627           unsigned char leading_codes[2];
2628
2629           if (preferred_charset_id >= 0)
2630             {
2631               bool result;
2632
2633               charset = CHARSET_FROM_ID (preferred_charset_id);
2634               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2635               if (result)
2636                 code = ENCODE_CHAR (charset, c);
2637               else
2638                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2639                                      &code, charset);
2640             }
2641           else
2642             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2643                                  &code, charset);
2644           if (! charset)
2645             {
2646               c = coding->default_char;
2647               if (ASCII_CHAR_P (c))
2648                 {
2649                   EMIT_ONE_ASCII_BYTE (c);
2650                   continue;
2651                 }
2652               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2653                                    &code, charset);
2654             }
2655           dimension = CHARSET_DIMENSION (charset);
2656           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2657           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2658           EMIT_ONE_BYTE (leading_codes[0]);
2659           if (leading_codes[1])
2660             EMIT_ONE_BYTE (leading_codes[1]);
2661           if (dimension == 1)
2662             EMIT_ONE_BYTE (code | 0x80);
2663           else
2664             {
2665               code |= 0x8080;
2666               EMIT_ONE_BYTE (code >> 8);
2667               EMIT_ONE_BYTE (code & 0xFF);
2668             }
2669         }
2670     }
2671   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2672   coding->produced_char += produced_chars;
2673   coding->produced = dst - coding->destination;
2674   return 0;
2675 }
2676
2677 \f
2678 /*** 7. ISO2022 handlers ***/
2679
2680 /* The following note describes the coding system ISO2022 briefly.
2681    Since the intention of this note is to help understand the
2682    functions in this file, some parts are NOT ACCURATE or are OVERLY
2683    SIMPLIFIED.  For thorough understanding, please refer to the
2684    original document of ISO2022.  This is equivalent to the standard
2685    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2686
2687    ISO2022 provides many mechanisms to encode several character sets
2688    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2689    is encoded using bytes less than 128.  This may make the encoded
2690    text a little bit longer, but the text passes more easily through
2691    several types of gateway, some of which strip off the MSB (Most
2692    Significant Bit).
2693
2694    There are two kinds of character sets: control character sets and
2695    graphic character sets.  The former contain control characters such
2696    as `newline' and `escape' to provide control functions (control
2697    functions are also provided by escape sequences).  The latter
2698    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2699    two control character sets and many graphic character sets.
2700
2701    Graphic character sets are classified into one of the following
2702    four classes, according to the number of bytes (DIMENSION) and
2703    number of characters in one dimension (CHARS) of the set:
2704    - DIMENSION1_CHARS94
2705    - DIMENSION1_CHARS96
2706    - DIMENSION2_CHARS94
2707    - DIMENSION2_CHARS96
2708
2709    In addition, each character set is assigned an identification tag,
2710    unique for each set, called the "final character" (denoted as <F>
2711    hereafter).  The <F> of each character set is decided by ECMA(*)
2712    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2713    (0x30..0x3F are for private use only).
2714
2715    Note (*): ECMA = European Computer Manufacturers Association
2716
2717    Here are examples of graphic character sets [NAME(<F>)]:
2718         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2719         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2720         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2721         o DIMENSION2_CHARS96 -- none for the moment
2722
2723    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2724         C0 [0x00..0x1F] -- control character plane 0
2725         GL [0x20..0x7F] -- graphic character plane 0
2726         C1 [0x80..0x9F] -- control character plane 1
2727         GR [0xA0..0xFF] -- graphic character plane 1
2728
2729    A control character set is directly designated and invoked to C0 or
2730    C1 by an escape sequence.  The most common case is that:
2731    - ISO646's  control character set is designated/invoked to C0, and
2732    - ISO6429's control character set is designated/invoked to C1,
2733    and usually these designations/invocations are omitted in encoded
2734    text.  In a 7-bit environment, only C0 can be used, and a control
2735    character for C1 is encoded by an appropriate escape sequence to
2736    fit into the environment.  All control characters for C1 are
2737    defined to have corresponding escape sequences.
2738
2739    A graphic character set is at first designated to one of four
2740    graphic registers (G0 through G3), then these graphic registers are
2741    invoked to GL or GR.  These designations and invocations can be
2742    done independently.  The most common case is that G0 is invoked to
2743    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2744    these invocations and designations are omitted in encoded text.
2745    In a 7-bit environment, only GL can be used.
2746
2747    When a graphic character set of CHARS94 is invoked to GL, codes
2748    0x20 and 0x7F of the GL area work as control characters SPACE and
2749    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2750    be used.
2751
2752    There are two ways of invocation: locking-shift and single-shift.
2753    With locking-shift, the invocation lasts until the next different
2754    invocation, whereas with single-shift, the invocation affects the
2755    following character only and doesn't affect the locking-shift
2756    state.  Invocations are done by the following control characters or
2757    escape sequences:
2758
2759    ----------------------------------------------------------------------
2760    abbrev  function                  cntrl escape seq   description
2761    ----------------------------------------------------------------------
2762    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2763    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2764    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2765    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2766    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2767    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2768    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2769    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2770    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2771    ----------------------------------------------------------------------
2772    (*) These are not used by any known coding system.
2773
2774    Control characters for these functions are defined by macros
2775    ISO_CODE_XXX in `coding.h'.
2776
2777    Designations are done by the following escape sequences:
2778    ----------------------------------------------------------------------
2779    escape sequence      description
2780    ----------------------------------------------------------------------
2781    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2782    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2783    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2784    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2785    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2786    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2787    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2788    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2789    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2790    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2791    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2792    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2793    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2794    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2795    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2796    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2797    ----------------------------------------------------------------------
2798
2799    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2800    of dimension 1, chars 94, and final character <F>, etc...
2801
2802    Note (*): Although these designations are not allowed in ISO2022,
2803    Emacs accepts them on decoding, and produces them on encoding
2804    CHARS96 character sets in a coding system which is characterized as
2805    7-bit environment, non-locking-shift, and non-single-shift.
2806
2807    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2808    '(' must be omitted.  We refer to this as "short-form" hereafter.
2809
2810    Now you may notice that there are a lot of ways of encoding the
2811    same multilingual text in ISO2022.  Actually, there exist many
2812    coding systems such as Compound Text (used in X11's inter client
2813    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2814    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2815    localized platforms), and all of these are variants of ISO2022.
2816
2817    In addition to the above, Emacs handles two more kinds of escape
2818    sequences: ISO6429's direction specification and Emacs' private
2819    sequence for specifying character composition.
2820
2821    ISO6429's direction specification takes the following form:
2822         o CSI ']'      -- end of the current direction
2823         o CSI '0' ']'  -- end of the current direction
2824         o CSI '1' ']'  -- start of left-to-right text
2825         o CSI '2' ']'  -- start of right-to-left text
2826    The control character CSI (0x9B: control sequence introducer) is
2827    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2828
2829    Character composition specification takes the following form:
2830         o ESC '0' -- start relative composition
2831         o ESC '1' -- end composition
2832         o ESC '2' -- start rule-base composition (*)
2833         o ESC '3' -- start relative composition with alternate chars  (**)
2834         o ESC '4' -- start rule-base composition with alternate chars  (**)
2835   Since these are not standard escape sequences of any ISO standard,
2836   the use of them with these meanings is restricted to Emacs only.
2837
2838   (*) This form is used only in Emacs 20.7 and older versions,
2839   but newer versions can safely decode it.
2840   (**) This form is used only in Emacs 21.1 and newer versions,
2841   and older versions can't decode it.
2842
2843   Here's a list of example usages of these composition escape
2844   sequences (categorized by `enum composition_method').
2845
2846   COMPOSITION_RELATIVE:
2847         ESC 0 CHAR [ CHAR ] ESC 1
2848   COMPOSITION_WITH_RULE:
2849         ESC 2 CHAR [ RULE CHAR ] ESC 1
2850   COMPOSITION_WITH_ALTCHARS:
2851         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2852   COMPOSITION_WITH_RULE_ALTCHARS:
2853         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2854
2855 static enum iso_code_class_type iso_code_class[256];
2856
2857 #define SAFE_CHARSET_P(coding, id)      \
2858   ((id) <= (coding)->max_charset_id     \
2859    && (coding)->safe_charsets[id] != 255)
2860
2861 static void
2862 setup_iso_safe_charsets (Lisp_Object attrs)
2863 {
2864   Lisp_Object charset_list, safe_charsets;
2865   Lisp_Object request;
2866   Lisp_Object reg_usage;
2867   Lisp_Object tail;
2868   EMACS_INT reg94, reg96;
2869   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2870   int max_charset_id;
2871
2872   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2873   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2874       && ! EQ (charset_list, Viso_2022_charset_list))
2875     {
2876       charset_list = Viso_2022_charset_list;
2877       ASET (attrs, coding_attr_charset_list, charset_list);
2878       ASET (attrs, coding_attr_safe_charsets, Qnil);
2879     }
2880
2881   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2882     return;
2883
2884   max_charset_id = 0;
2885   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2886     {
2887       int id = XINT (XCAR (tail));
2888       if (max_charset_id < id)
2889         max_charset_id = id;
2890     }
2891
2892   safe_charsets = make_uninit_string (max_charset_id + 1);
2893   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2894   request = AREF (attrs, coding_attr_iso_request);
2895   reg_usage = AREF (attrs, coding_attr_iso_usage);
2896   reg94 = XINT (XCAR (reg_usage));
2897   reg96 = XINT (XCDR (reg_usage));
2898
2899   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2900     {
2901       Lisp_Object id;
2902       Lisp_Object reg;
2903       struct charset *charset;
2904
2905       id = XCAR (tail);
2906       charset = CHARSET_FROM_ID (XINT (id));
2907       reg = Fcdr (Fassq (id, request));
2908       if (! NILP (reg))
2909         SSET (safe_charsets, XINT (id), XINT (reg));
2910       else if (charset->iso_chars_96)
2911         {
2912           if (reg96 < 4)
2913             SSET (safe_charsets, XINT (id), reg96);
2914         }
2915       else
2916         {
2917           if (reg94 < 4)
2918             SSET (safe_charsets, XINT (id), reg94);
2919         }
2920     }
2921   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2922 }
2923
2924
2925 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2926    Return true if a text is encoded in one of ISO-2022 based coding
2927    systems.  */
2928
2929 static bool
2930 detect_coding_iso_2022 (struct coding_system *coding,
2931                         struct coding_detection_info *detect_info)
2932 {
2933   const unsigned char *src = coding->source, *src_base = src;
2934   const unsigned char *src_end = coding->source + coding->src_bytes;
2935   bool multibytep = coding->src_multibyte;
2936   bool single_shifting = 0;
2937   int id;
2938   int c, c1;
2939   ptrdiff_t consumed_chars = 0;
2940   int i;
2941   int rejected = 0;
2942   int found = 0;
2943   int composition_count = -1;
2944
2945   detect_info->checked |= CATEGORY_MASK_ISO;
2946
2947   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2948     {
2949       struct coding_system *this = &(coding_categories[i]);
2950       Lisp_Object attrs, val;
2951
2952       if (this->id < 0)
2953         continue;
2954       attrs = CODING_ID_ATTRS (this->id);
2955       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2956           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2957         setup_iso_safe_charsets (attrs);
2958       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2959       this->max_charset_id = SCHARS (val) - 1;
2960       this->safe_charsets = SDATA (val);
2961     }
2962
2963   /* A coding system of this category is always ASCII compatible.  */
2964   src += coding->head_ascii;
2965
2966   while (rejected != CATEGORY_MASK_ISO)
2967     {
2968       src_base = src;
2969       ONE_MORE_BYTE (c);
2970       switch (c)
2971         {
2972         case ISO_CODE_ESC:
2973           if (inhibit_iso_escape_detection)
2974             break;
2975           single_shifting = 0;
2976           ONE_MORE_BYTE (c);
2977           if (c == 'N' || c == 'O')
2978             {
2979               /* ESC <Fe> for SS2 or SS3.  */
2980               single_shifting = 1;
2981               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2982             }
2983           else if (c == '1')
2984             {
2985               /* End of composition.  */
2986               if (composition_count < 0
2987                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2988                 /* Invalid */
2989                 break;
2990               composition_count = -1;
2991               found |= CATEGORY_MASK_ISO;
2992             }
2993           else if (c >= '0' && c <= '4')
2994             {
2995               /* ESC <Fp> for start/end composition.  */
2996               composition_count = 0;
2997             }
2998           else
2999             {
3000               if (c >= '(' && c <= '/')
3001                 {
3002                   /* Designation sequence for a charset of dimension 1.  */
3003                   ONE_MORE_BYTE (c1);
3004                   if (c1 < ' ' || c1 >= 0x80
3005                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3006                     /* Invalid designation sequence.  Just ignore.  */
3007                     break;
3008                 }
3009               else if (c == '$')
3010                 {
3011                   /* Designation sequence for a charset of dimension 2.  */
3012                   ONE_MORE_BYTE (c);
3013                   if (c >= '@' && c <= 'B')
3014                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3015                     id = iso_charset_table[1][0][c];
3016                   else if (c >= '(' && c <= '/')
3017                     {
3018                       ONE_MORE_BYTE (c1);
3019                       if (c1 < ' ' || c1 >= 0x80
3020                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3021                         /* Invalid designation sequence.  Just ignore.  */
3022                         break;
3023                     }
3024                   else
3025                     /* Invalid designation sequence.  Just ignore it.  */
3026                     break;
3027                 }
3028               else
3029                 {
3030                   /* Invalid escape sequence.  Just ignore it.  */
3031                   break;
3032                 }
3033
3034               /* We found a valid designation sequence for CHARSET.  */
3035               rejected |= CATEGORY_MASK_ISO_8BIT;
3036               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3037                                   id))
3038                 found |= CATEGORY_MASK_ISO_7;
3039               else
3040                 rejected |= CATEGORY_MASK_ISO_7;
3041               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3042                                   id))
3043                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3044               else
3045                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3046               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3047                                   id))
3048                 found |= CATEGORY_MASK_ISO_7_ELSE;
3049               else
3050                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3051               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3052                                   id))
3053                 found |= CATEGORY_MASK_ISO_8_ELSE;
3054               else
3055                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3056             }
3057           break;
3058
3059         case ISO_CODE_SO:
3060         case ISO_CODE_SI:
3061           /* Locking shift out/in.  */
3062           if (inhibit_iso_escape_detection)
3063             break;
3064           single_shifting = 0;
3065           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3066           break;
3067
3068         case ISO_CODE_CSI:
3069           /* Control sequence introducer.  */
3070           single_shifting = 0;
3071           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3072           found |= CATEGORY_MASK_ISO_8_ELSE;
3073           goto check_extra_latin;
3074
3075         case ISO_CODE_SS2:
3076         case ISO_CODE_SS3:
3077           /* Single shift.   */
3078           if (inhibit_iso_escape_detection)
3079             break;
3080           single_shifting = 0;
3081           rejected |= CATEGORY_MASK_ISO_7BIT;
3082           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3083               & CODING_ISO_FLAG_SINGLE_SHIFT)
3084             {
3085               found |= CATEGORY_MASK_ISO_8_1;
3086               single_shifting = 1;
3087             }
3088           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3089               & CODING_ISO_FLAG_SINGLE_SHIFT)
3090             {
3091               found |= CATEGORY_MASK_ISO_8_2;
3092               single_shifting = 1;
3093             }
3094           if (single_shifting)
3095             break;
3096           goto check_extra_latin;
3097
3098         default:
3099           if (c < 0)
3100             continue;
3101           if (c < 0x80)
3102             {
3103               if (composition_count >= 0)
3104                 composition_count++;
3105               single_shifting = 0;
3106               break;
3107             }
3108           if (c >= 0xA0)
3109             {
3110               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3111               found |= CATEGORY_MASK_ISO_8_1;
3112               /* Check the length of succeeding codes of the range
3113                  0xA0..0FF.  If the byte length is even, we include
3114                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3115                  only when we are not single shifting.  */
3116               if (! single_shifting
3117                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3118                 {
3119                   int len = 1;
3120                   while (src < src_end)
3121                     {
3122                       src_base = src;
3123                       ONE_MORE_BYTE (c);
3124                       if (c < 0xA0)
3125                         {
3126                           src = src_base;
3127                           break;
3128                         }
3129                       len++;
3130                     }
3131
3132                   if (len & 1 && src < src_end)
3133                     {
3134                       rejected |= CATEGORY_MASK_ISO_8_2;
3135                       if (composition_count >= 0)
3136                         composition_count += len;
3137                     }
3138                   else
3139                     {
3140                       found |= CATEGORY_MASK_ISO_8_2;
3141                       if (composition_count >= 0)
3142                         composition_count += len / 2;
3143                     }
3144                 }
3145               break;
3146             }
3147         check_extra_latin:
3148           if (! VECTORP (Vlatin_extra_code_table)
3149               || NILP (AREF (Vlatin_extra_code_table, c)))
3150             {
3151               rejected = CATEGORY_MASK_ISO;
3152               break;
3153             }
3154           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3155               & CODING_ISO_FLAG_LATIN_EXTRA)
3156             found |= CATEGORY_MASK_ISO_8_1;
3157           else
3158             rejected |= CATEGORY_MASK_ISO_8_1;
3159           rejected |= CATEGORY_MASK_ISO_8_2;
3160           break;
3161         }
3162     }
3163   detect_info->rejected |= CATEGORY_MASK_ISO;
3164   return 0;
3165
3166  no_more_source:
3167   detect_info->rejected |= rejected;
3168   detect_info->found |= (found & ~rejected);
3169   return 1;
3170 }
3171
3172
3173 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3174    escape sequence should be kept.  */
3175 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3176   do {                                                                  \
3177     int id, prev;                                                       \
3178                                                                         \
3179     if (final < '0' || final >= 128                                     \
3180         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3181         || !SAFE_CHARSET_P (coding, id))                                \
3182       {                                                                 \
3183         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3184         chars_96 = -1;                                                  \
3185         break;                                                          \
3186       }                                                                 \
3187     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3188     if (id == charset_jisx0201_roman)                                   \
3189       {                                                                 \
3190         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3191           id = charset_ascii;                                           \
3192       }                                                                 \
3193     else if (id == charset_jisx0208_1978)                               \
3194       {                                                                 \
3195         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3196           id = charset_jisx0208;                                        \
3197       }                                                                 \
3198     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3199     /* If there was an invalid designation to REG previously, and this  \
3200        designation is ASCII to REG, we should keep this designation     \
3201        sequence.  */                                                    \
3202     if (prev == -2 && id == charset_ascii)                              \
3203       chars_96 = -1;                                                    \
3204   } while (0)
3205
3206
3207 /* Handle these composition sequence (ALT: alternate char):
3208
3209    (1) relative composition: ESC 0 CHAR ... ESC 1
3210    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3211    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3212    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3213
3214    When the start sequence (ESC 0/2/3/4) is found, this annotation
3215    header is produced.
3216
3217         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3218
3219    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3220    produced until the end sequence (ESC 1) is found:
3221
3222    (1) CHAR ... CHAR
3223    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3224    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3225    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3226
3227    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3228    annotation header is updated as below:
3229
3230    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3231    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3232    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3233    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3234
3235    If an error is found while composing, the annotation header is
3236    changed to:
3237
3238         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3239
3240    and the sequence [ -2 DECODED-RULE ] is changed to the original
3241    byte sequence as below:
3242         o the original byte sequence is B: [ B -1 ]
3243         o the original byte sequence is B1 B2: [ B1 B2 ]
3244    and the sequence [ -1 -1 ] is changed to the original byte
3245    sequence:
3246         [ ESC '0' ]
3247 */
3248
3249 /* Decode a composition rule C1 and maybe one more byte from the
3250    source, and set RULE to the encoded composition rule.  If the rule
3251    is invalid, goto invalid_code.  */
3252
3253 #define DECODE_COMPOSITION_RULE(rule)                                   \
3254   do {                                                                  \
3255     rule = c1 - 32;                                                     \
3256     if (rule < 0)                                                       \
3257       goto invalid_code;                                                \
3258     if (rule < 81)              /* old format (before ver.21) */        \
3259       {                                                                 \
3260         int gref = (rule) / 9;                                          \
3261         int nref = (rule) % 9;                                          \
3262         if (gref == 4) gref = 10;                                       \
3263         if (nref == 4) nref = 10;                                       \
3264         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3265       }                                                                 \
3266     else                        /* new format (after ver.21) */         \
3267       {                                                                 \
3268         int b;                                                          \
3269                                                                         \
3270         ONE_MORE_BYTE (b);                                              \
3271         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3272           goto invalid_code;                                            \
3273         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3274         rule += 0x100;   /* Distinguish it from the old format.  */     \
3275       }                                                                 \
3276   } while (0)
3277
3278 #define ENCODE_COMPOSITION_RULE(rule)                           \
3279   do {                                                          \
3280     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3281                                                                 \
3282     if (rule < 0x100)           /* old format */                \
3283       {                                                         \
3284         if (gref == 10) gref = 4;                               \
3285         if (nref == 10) nref = 4;                               \
3286         charbuf[idx] = 32 + gref * 9 + nref;                    \
3287         charbuf[idx + 1] = -1;                                  \
3288         new_chars++;                                            \
3289       }                                                         \
3290     else                                /* new format */        \
3291       {                                                         \
3292         charbuf[idx] = 32 + 81 + gref;                          \
3293         charbuf[idx + 1] = 32 + nref;                           \
3294         new_chars += 2;                                         \
3295       }                                                         \
3296   } while (0)
3297
3298 /* Finish the current composition as invalid.  */
3299
3300 static int
3301 finish_composition (int *charbuf, struct composition_status *cmp_status)
3302 {
3303   int idx = - cmp_status->length;
3304   int new_chars;
3305
3306   /* Recover the original ESC sequence */
3307   charbuf[idx++] = ISO_CODE_ESC;
3308   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3309                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3310                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3311                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3312                     : '4');
3313   charbuf[idx++] = -2;
3314   charbuf[idx++] = 0;
3315   charbuf[idx++] = -1;
3316   new_chars = cmp_status->nchars;
3317   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3318     for (; idx < 0; idx++)
3319       {
3320         int elt = charbuf[idx];
3321
3322         if (elt == -2)
3323           {
3324             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3325             idx++;
3326           }
3327         else if (elt == -1)
3328           {
3329             charbuf[idx++] = ISO_CODE_ESC;
3330             charbuf[idx] = '0';
3331             new_chars += 2;
3332           }
3333       }
3334   cmp_status->state = COMPOSING_NO;
3335   return new_chars;
3336 }
3337
3338 /* If characters are under composition, finish the composition.  */
3339 #define MAYBE_FINISH_COMPOSITION()                              \
3340   do {                                                          \
3341     if (cmp_status->state != COMPOSING_NO)                      \
3342       char_offset += finish_composition (charbuf, cmp_status);  \
3343   } while (0)
3344
3345 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3346
3347    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3348    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3349    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3350    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3351
3352    Produce this annotation sequence now:
3353
3354    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3355 */
3356
3357 #define DECODE_COMPOSITION_START(c1)                                       \
3358   do {                                                                     \
3359     if (c1 == '0'                                                          \
3360         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3361              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3362             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3363                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3364       {                                                                    \
3365         *charbuf++ = -1;                                                   \
3366         *charbuf++= -1;                                                    \
3367         cmp_status->state = COMPOSING_CHAR;                                \
3368         cmp_status->length += 2;                                           \
3369       }                                                                    \
3370     else                                                                   \
3371       {                                                                    \
3372         MAYBE_FINISH_COMPOSITION ();                                       \
3373         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3374                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3375                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3376                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3377         cmp_status->state                                                  \
3378           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3379         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3380         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3381         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3382         coding->annotated = 1;                                             \
3383       }                                                                    \
3384   } while (0)
3385
3386
3387 /* Handle composition end sequence ESC 1.  */
3388
3389 #define DECODE_COMPOSITION_END()                                        \
3390   do {                                                                  \
3391     if (cmp_status->nchars == 0                                         \
3392         || ((cmp_status->state == COMPOSING_CHAR)                       \
3393             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3394       {                                                                 \
3395         MAYBE_FINISH_COMPOSITION ();                                    \
3396         goto invalid_code;                                              \
3397       }                                                                 \
3398     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3399       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3400     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3401       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3402     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3403     char_offset += cmp_status->nchars;                                  \
3404     cmp_status->state = COMPOSING_NO;                                   \
3405   } while (0)
3406
3407 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3408
3409 #define STORE_COMPOSITION_RULE(rule)    \
3410   do {                                  \
3411     *charbuf++ = -2;                    \
3412     *charbuf++ = rule;                  \
3413     cmp_status->length += 2;            \
3414     cmp_status->state--;                \
3415   } while (0)
3416
3417 /* Store a composed char or a component char C in charbuf, and update
3418    cmp_status.  */
3419
3420 #define STORE_COMPOSITION_CHAR(c)                                       \
3421   do {                                                                  \
3422     *charbuf++ = (c);                                                   \
3423     cmp_status->length++;                                               \
3424     if (cmp_status->state == COMPOSING_CHAR)                            \
3425       cmp_status->nchars++;                                             \
3426     else                                                                \
3427       cmp_status->ncomps++;                                             \
3428     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3429         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3430             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3431       cmp_status->state++;                                              \
3432   } while (0)
3433
3434
3435 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3436
3437 static void
3438 decode_coding_iso_2022 (struct coding_system *coding)
3439 {
3440   const unsigned char *src = coding->source + coding->consumed;
3441   const unsigned char *src_end = coding->source + coding->src_bytes;
3442   const unsigned char *src_base;
3443   int *charbuf = coding->charbuf + coding->charbuf_used;
3444   /* We may produce two annotations (charset and composition) in one
3445      loop and one more charset annotation at the end.  */
3446   int *charbuf_end
3447     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3448   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3449   bool multibytep = coding->src_multibyte;
3450   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3451   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3452   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3453   int charset_id_2, charset_id_3;
3454   struct charset *charset;
3455   int c;
3456   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3457   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3458   ptrdiff_t char_offset = coding->produced_char;
3459   ptrdiff_t last_offset = char_offset;
3460   int last_id = charset_ascii;
3461   bool eol_dos
3462     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3463   int byte_after_cr = -1;
3464   int i;
3465
3466   setup_iso_safe_charsets (attrs);
3467   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3468
3469   if (cmp_status->state != COMPOSING_NO)
3470     {
3471       if (charbuf_end - charbuf < cmp_status->length)
3472         emacs_abort ();
3473       for (i = 0; i < cmp_status->length; i++)
3474         *charbuf++ = cmp_status->carryover[i];
3475       coding->annotated = 1;
3476     }
3477
3478   while (1)
3479     {
3480       int c1, c2, c3;
3481
3482       src_base = src;
3483       consumed_chars_base = consumed_chars;
3484
3485       if (charbuf >= charbuf_end)
3486         {
3487           if (byte_after_cr >= 0)
3488             src_base--;
3489           break;
3490         }
3491
3492       if (byte_after_cr >= 0)
3493         c1 = byte_after_cr, byte_after_cr = -1;
3494       else
3495         ONE_MORE_BYTE (c1);
3496       if (c1 < 0)
3497         goto invalid_code;
3498
3499       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3500         {
3501           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3502           char_offset++;
3503           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3504           continue;
3505         }
3506
3507       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3508         {
3509           if (c1 == ISO_CODE_ESC)
3510             {
3511               if (src + 1 >= src_end)
3512                 goto no_more_source;
3513               *charbuf++ = ISO_CODE_ESC;
3514               char_offset++;
3515               if (src[0] == '%' && src[1] == '@')
3516                 {
3517                   src += 2;
3518                   consumed_chars += 2;
3519                   char_offset += 2;
3520                   /* We are sure charbuf can contain two more chars. */
3521                   *charbuf++ = '%';
3522                   *charbuf++ = '@';
3523                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3524                 }
3525             }
3526           else
3527             {
3528               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3529               char_offset++;
3530             }
3531           continue;
3532         }
3533
3534       if ((cmp_status->state == COMPOSING_RULE
3535            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3536           && c1 != ISO_CODE_ESC)
3537         {
3538           int rule;
3539
3540           DECODE_COMPOSITION_RULE (rule);
3541           STORE_COMPOSITION_RULE (rule);
3542           continue;
3543         }
3544
3545       /* We produce at most one character.  */
3546       switch (iso_code_class [c1])
3547         {
3548         case ISO_0x20_or_0x7F:
3549           if (charset_id_0 < 0
3550               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3551             /* This is SPACE or DEL.  */
3552             charset = CHARSET_FROM_ID (charset_ascii);
3553           else
3554             charset = CHARSET_FROM_ID (charset_id_0);
3555           break;
3556
3557         case ISO_graphic_plane_0:
3558           if (charset_id_0 < 0)
3559             charset = CHARSET_FROM_ID (charset_ascii);
3560           else
3561             charset = CHARSET_FROM_ID (charset_id_0);
3562           break;
3563
3564         case ISO_0xA0_or_0xFF:
3565           if (charset_id_1 < 0
3566               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3567               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3568             goto invalid_code;
3569           /* This is a graphic character, we fall down ... */
3570
3571         case ISO_graphic_plane_1:
3572           if (charset_id_1 < 0)
3573             goto invalid_code;
3574           charset = CHARSET_FROM_ID (charset_id_1);
3575           break;
3576
3577         case ISO_control_0:
3578           if (eol_dos && c1 == '\r')
3579             ONE_MORE_BYTE (byte_after_cr);
3580           MAYBE_FINISH_COMPOSITION ();
3581           charset = CHARSET_FROM_ID (charset_ascii);
3582           break;
3583
3584         case ISO_control_1:
3585           goto invalid_code;
3586
3587         case ISO_shift_out:
3588           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3589               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3590             goto invalid_code;
3591           CODING_ISO_INVOCATION (coding, 0) = 1;
3592           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3593           continue;
3594
3595         case ISO_shift_in:
3596           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3597             goto invalid_code;
3598           CODING_ISO_INVOCATION (coding, 0) = 0;
3599           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3600           continue;
3601
3602         case ISO_single_shift_2_7:
3603           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3604             goto invalid_code;
3605         case ISO_single_shift_2:
3606           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3607             goto invalid_code;
3608           /* SS2 is handled as an escape sequence of ESC 'N' */
3609           c1 = 'N';
3610           goto label_escape_sequence;
3611
3612         case ISO_single_shift_3:
3613           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3614             goto invalid_code;
3615           /* SS2 is handled as an escape sequence of ESC 'O' */
3616           c1 = 'O';
3617           goto label_escape_sequence;
3618
3619         case ISO_control_sequence_introducer:
3620           /* CSI is handled as an escape sequence of ESC '[' ...  */
3621           c1 = '[';
3622           goto label_escape_sequence;
3623
3624         case ISO_escape:
3625           ONE_MORE_BYTE (c1);
3626         label_escape_sequence:
3627           /* Escape sequences handled here are invocation,
3628              designation, direction specification, and character
3629              composition specification.  */
3630           switch (c1)
3631             {
3632             case '&':           /* revision of following character set */
3633               ONE_MORE_BYTE (c1);
3634               if (!(c1 >= '@' && c1 <= '~'))
3635                 goto invalid_code;
3636               ONE_MORE_BYTE (c1);
3637               if (c1 != ISO_CODE_ESC)
3638                 goto invalid_code;
3639               ONE_MORE_BYTE (c1);
3640               goto label_escape_sequence;
3641
3642             case '$':           /* designation of 2-byte character set */
3643               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3644                 goto invalid_code;
3645               {
3646                 int reg, chars96;
3647
3648                 ONE_MORE_BYTE (c1);
3649                 if (c1 >= '@' && c1 <= 'B')
3650                   {     /* designation of JISX0208.1978, GB2312.1980,
3651                            or JISX0208.1980 */
3652                     reg = 0, chars96 = 0;
3653                   }
3654                 else if (c1 >= 0x28 && c1 <= 0x2B)
3655                   { /* designation of DIMENSION2_CHARS94 character set */
3656                     reg = c1 - 0x28, chars96 = 0;
3657                     ONE_MORE_BYTE (c1);
3658                   }
3659                 else if (c1 >= 0x2C && c1 <= 0x2F)
3660                   { /* designation of DIMENSION2_CHARS96 character set */
3661                     reg = c1 - 0x2C, chars96 = 1;
3662                     ONE_MORE_BYTE (c1);
3663                   }
3664                 else
3665                   goto invalid_code;
3666                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3667                 /* We must update these variables now.  */
3668                 if (reg == 0)
3669                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3670                 else if (reg == 1)
3671                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3672                 if (chars96 < 0)
3673                   goto invalid_code;
3674               }
3675               continue;
3676
3677             case 'n':           /* invocation of locking-shift-2 */
3678               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3679                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3680                 goto invalid_code;
3681               CODING_ISO_INVOCATION (coding, 0) = 2;
3682               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3683               continue;
3684
3685             case 'o':           /* invocation of locking-shift-3 */
3686               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3687                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3688                 goto invalid_code;
3689               CODING_ISO_INVOCATION (coding, 0) = 3;
3690               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3691               continue;
3692
3693             case 'N':           /* invocation of single-shift-2 */
3694               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3695                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3696                 goto invalid_code;
3697               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3698               if (charset_id_2 < 0)
3699                 charset = CHARSET_FROM_ID (charset_ascii);
3700               else
3701                 charset = CHARSET_FROM_ID (charset_id_2);
3702               ONE_MORE_BYTE (c1);
3703               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3704                 goto invalid_code;
3705               break;
3706
3707             case 'O':           /* invocation of single-shift-3 */
3708               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3709                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3710                 goto invalid_code;
3711               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3712               if (charset_id_3 < 0)
3713                 charset = CHARSET_FROM_ID (charset_ascii);
3714               else
3715                 charset = CHARSET_FROM_ID (charset_id_3);
3716               ONE_MORE_BYTE (c1);
3717               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3718                 goto invalid_code;
3719               break;
3720
3721             case '0': case '2': case '3': case '4': /* start composition */
3722               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3723                 goto invalid_code;
3724               if (last_id != charset_ascii)
3725                 {
3726                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3727                   last_id = charset_ascii;
3728                   last_offset = char_offset;
3729                 }
3730               DECODE_COMPOSITION_START (c1);
3731               continue;
3732
3733             case '1':           /* end composition */
3734               if (cmp_status->state == COMPOSING_NO)
3735                 goto invalid_code;
3736               DECODE_COMPOSITION_END ();
3737               continue;
3738
3739             case '[':           /* specification of direction */
3740               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3741                 goto invalid_code;
3742               /* For the moment, nested direction is not supported.
3743                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3744                  left-to-right, and nonzero means right-to-left.  */
3745               ONE_MORE_BYTE (c1);
3746               switch (c1)
3747                 {
3748                 case ']':       /* end of the current direction */
3749                   coding->mode &= ~CODING_MODE_DIRECTION;
3750
3751                 case '0':       /* end of the current direction */
3752                 case '1':       /* start of left-to-right direction */
3753                   ONE_MORE_BYTE (c1);
3754                   if (c1 == ']')
3755                     coding->mode &= ~CODING_MODE_DIRECTION;
3756                   else
3757                     goto invalid_code;
3758                   break;
3759
3760                 case '2':       /* start of right-to-left direction */
3761                   ONE_MORE_BYTE (c1);
3762                   if (c1 == ']')
3763                     coding->mode |= CODING_MODE_DIRECTION;
3764                   else
3765                     goto invalid_code;
3766                   break;
3767
3768                 default:
3769                   goto invalid_code;
3770                 }
3771               continue;
3772
3773             case '%':
3774               ONE_MORE_BYTE (c1);
3775               if (c1 == '/')
3776                 {
3777                   /* CTEXT extended segment:
3778                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3779                      We keep these bytes as is for the moment.
3780                      They may be decoded by post-read-conversion.  */
3781                   int dim, M, L;
3782                   int size;
3783
3784                   ONE_MORE_BYTE (dim);
3785                   if (dim < '0' || dim > '4')
3786                     goto invalid_code;
3787                   ONE_MORE_BYTE (M);
3788                   if (M < 128)
3789                     goto invalid_code;
3790                   ONE_MORE_BYTE (L);
3791                   if (L < 128)
3792                     goto invalid_code;
3793                   size = ((M - 128) * 128) + (L - 128);
3794                   if (charbuf + 6 > charbuf_end)
3795                     goto break_loop;
3796                   *charbuf++ = ISO_CODE_ESC;
3797                   *charbuf++ = '%';
3798                   *charbuf++ = '/';
3799                   *charbuf++ = dim;
3800                   *charbuf++ = BYTE8_TO_CHAR (M);
3801                   *charbuf++ = BYTE8_TO_CHAR (L);
3802                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3803                 }
3804               else if (c1 == 'G')
3805                 {
3806                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3807                      ESC % G --UTF-8-BYTES-- ESC % @
3808                      We keep these bytes as is for the moment.
3809                      They may be decoded by post-read-conversion.  */
3810                   if (charbuf + 3 > charbuf_end)
3811                     goto break_loop;
3812                   *charbuf++ = ISO_CODE_ESC;
3813                   *charbuf++ = '%';
3814                   *charbuf++ = 'G';
3815                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3816                 }
3817               else
3818                 goto invalid_code;
3819               continue;
3820               break;
3821
3822             default:
3823               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3824                 goto invalid_code;
3825               {
3826                 int reg, chars96;
3827
3828                 if (c1 >= 0x28 && c1 <= 0x2B)
3829                   { /* designation of DIMENSION1_CHARS94 character set */
3830                     reg = c1 - 0x28, chars96 = 0;
3831                     ONE_MORE_BYTE (c1);
3832                   }
3833                 else if (c1 >= 0x2C && c1 <= 0x2F)
3834                   { /* designation of DIMENSION1_CHARS96 character set */
3835                     reg = c1 - 0x2C, chars96 = 1;
3836                     ONE_MORE_BYTE (c1);
3837                   }
3838                 else
3839                   goto invalid_code;
3840                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3841                 /* We must update these variables now.  */
3842                 if (reg == 0)
3843                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3844                 else if (reg == 1)
3845                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3846                 if (chars96 < 0)
3847                   goto invalid_code;
3848               }
3849               continue;
3850             }
3851           break;
3852
3853         default:
3854           emacs_abort ();
3855         }
3856
3857       if (cmp_status->state == COMPOSING_NO
3858           && charset->id != charset_ascii
3859           && last_id != charset->id)
3860         {
3861           if (last_id != charset_ascii)
3862             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3863           last_id = charset->id;
3864           last_offset = char_offset;
3865         }
3866
3867       /* Now we know CHARSET and 1st position code C1 of a character.
3868          Produce a decoded character while getting 2nd and 3rd
3869          position codes C2, C3 if necessary.  */
3870       if (CHARSET_DIMENSION (charset) > 1)
3871         {
3872           ONE_MORE_BYTE (c2);
3873           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3874               || ((c1 & 0x80) != (c2 & 0x80)))
3875             /* C2 is not in a valid range.  */
3876             goto invalid_code;
3877           if (CHARSET_DIMENSION (charset) == 2)
3878             c1 = (c1 << 8) | c2;
3879           else
3880             {
3881               ONE_MORE_BYTE (c3);
3882               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3883                   || ((c1 & 0x80) != (c3 & 0x80)))
3884                 /* C3 is not in a valid range.  */
3885                 goto invalid_code;
3886               c1 = (c1 << 16) | (c2 << 8) | c2;
3887             }
3888         }
3889       c1 &= 0x7F7F7F;
3890       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3891       if (c < 0)
3892         {
3893           MAYBE_FINISH_COMPOSITION ();
3894           for (; src_base < src; src_base++, char_offset++)
3895             {
3896               if (ASCII_BYTE_P (*src_base))
3897                 *charbuf++ = *src_base;
3898               else
3899                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3900             }
3901         }
3902       else if (cmp_status->state == COMPOSING_NO)
3903         {
3904           *charbuf++ = c;
3905           char_offset++;
3906         }
3907       else if ((cmp_status->state == COMPOSING_CHAR
3908                 ? cmp_status->nchars
3909                 : cmp_status->ncomps)
3910                >= MAX_COMPOSITION_COMPONENTS)
3911         {
3912           /* Too long composition.  */
3913           MAYBE_FINISH_COMPOSITION ();
3914           *charbuf++ = c;
3915           char_offset++;
3916         }
3917       else
3918         STORE_COMPOSITION_CHAR (c);
3919       continue;
3920
3921     invalid_code:
3922       MAYBE_FINISH_COMPOSITION ();
3923       src = src_base;
3924       consumed_chars = consumed_chars_base;
3925       ONE_MORE_BYTE (c);
3926       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3927       char_offset++;
3928       coding->errors++;
3929       /* Reset the invocation and designation status to the safest
3930          one; i.e. designate ASCII to the graphic register 0, and
3931          invoke that register to the graphic plane 0.  This typically
3932          helps the case that an designation sequence for ASCII "ESC (
3933          B" is somehow broken (e.g. broken by a newline).  */
3934       CODING_ISO_INVOCATION (coding, 0) = 0;
3935       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3936       charset_id_0 = charset_ascii;
3937       continue;
3938
3939     break_loop:
3940       break;
3941     }
3942
3943  no_more_source:
3944   if (cmp_status->state != COMPOSING_NO)
3945     {
3946       if (coding->mode & CODING_MODE_LAST_BLOCK)
3947         MAYBE_FINISH_COMPOSITION ();
3948       else
3949         {
3950           charbuf -= cmp_status->length;
3951           for (i = 0; i < cmp_status->length; i++)
3952             cmp_status->carryover[i] = charbuf[i];
3953         }
3954     }
3955   else if (last_id != charset_ascii)
3956     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3957   coding->consumed_char += consumed_chars_base;
3958   coding->consumed = src_base - coding->source;
3959   coding->charbuf_used = charbuf - coding->charbuf;
3960 }
3961
3962
3963 /* ISO2022 encoding stuff.  */
3964
3965 /*
3966    It is not enough to say just "ISO2022" on encoding, we have to
3967    specify more details.  In Emacs, each coding system of ISO2022
3968    variant has the following specifications:
3969         1. Initial designation to G0 thru G3.
3970         2. Allows short-form designation?
3971         3. ASCII should be designated to G0 before control characters?
3972         4. ASCII should be designated to G0 at end of line?
3973         5. 7-bit environment or 8-bit environment?
3974         6. Use locking-shift?
3975         7. Use Single-shift?
3976    And the following two are only for Japanese:
3977         8. Use ASCII in place of JIS0201-1976-Roman?
3978         9. Use JISX0208-1983 in place of JISX0208-1978?
3979    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3980    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3981    details.
3982 */
3983
3984 /* Produce codes (escape sequence) for designating CHARSET to graphic
3985    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3986    '@', 'A', or 'B' and the coding system CODING allows, produce
3987    designation sequence of short-form.  */
3988
3989 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3990   do {                                                                  \
3991     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3992     const char *intermediate_char_94 = "()*+";                          \
3993     const char *intermediate_char_96 = ",-./";                          \
3994     int revision = -1;                                                  \
3995                                                                         \
3996     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3997       revision = CHARSET_ISO_REVISION (charset);                        \
3998                                                                         \
3999     if (revision >= 0)                                                  \
4000       {                                                                 \
4001         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4002         EMIT_ONE_BYTE ('@' + revision);                                 \
4003       }                                                                 \
4004     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4005     if (CHARSET_DIMENSION (charset) == 1)                               \
4006       {                                                                 \
4007         int b;                                                          \
4008         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4009           b = intermediate_char_94[reg];                                \
4010         else                                                            \
4011           b = intermediate_char_96[reg];                                \
4012         EMIT_ONE_ASCII_BYTE (b);                                        \
4013       }                                                                 \
4014     else                                                                \
4015       {                                                                 \
4016         EMIT_ONE_ASCII_BYTE ('$');                                      \
4017         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4018           {                                                             \
4019             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4020                 || reg != 0                                             \
4021                 || final_char < '@' || final_char > 'B')                \
4022               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4023           }                                                             \
4024         else                                                            \
4025           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4026       }                                                                 \
4027     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4028                                                                         \
4029     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4030   } while (0)
4031
4032
4033 /* The following two macros produce codes (control character or escape
4034    sequence) for ISO2022 single-shift functions (single-shift-2 and
4035    single-shift-3).  */
4036
4037 #define ENCODE_SINGLE_SHIFT_2                                           \
4038   do {                                                                  \
4039     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4040       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4041     else                                                                \
4042       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4043     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4044   } while (0)
4045
4046
4047 #define ENCODE_SINGLE_SHIFT_3                                           \
4048   do {                                                                  \
4049     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4050       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4051     else                                                                \
4052       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4053     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4054   } while (0)
4055
4056
4057 /* The following four macros produce codes (control character or
4058    escape sequence) for ISO2022 locking-shift functions (shift-in,
4059    shift-out, locking-shift-2, and locking-shift-3).  */
4060
4061 #define ENCODE_SHIFT_IN                                 \
4062   do {                                                  \
4063     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4064     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4065   } while (0)
4066
4067
4068 #define ENCODE_SHIFT_OUT                                \
4069   do {                                                  \
4070     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4071     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4072   } while (0)
4073
4074
4075 #define ENCODE_LOCKING_SHIFT_2                          \
4076   do {                                                  \
4077     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4078     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4079   } while (0)
4080
4081
4082 #define ENCODE_LOCKING_SHIFT_3                          \
4083   do {                                                  \
4084     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4085     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4086   } while (0)
4087
4088
4089 /* Produce codes for a DIMENSION1 character whose character set is
4090    CHARSET and whose position-code is C1.  Designation and invocation
4091    sequences are also produced in advance if necessary.  */
4092
4093 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4094   do {                                                                  \
4095     int id = CHARSET_ID (charset);                                      \
4096                                                                         \
4097     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4098         && id == charset_ascii)                                         \
4099       {                                                                 \
4100         id = charset_jisx0201_roman;                                    \
4101         charset = CHARSET_FROM_ID (id);                                 \
4102       }                                                                 \
4103                                                                         \
4104     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4105       {                                                                 \
4106         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4107           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4108         else                                                            \
4109           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4110         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4111         break;                                                          \
4112       }                                                                 \
4113     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4114       {                                                                 \
4115         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4116         break;                                                          \
4117       }                                                                 \
4118     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4119       {                                                                 \
4120         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4121         break;                                                          \
4122       }                                                                 \
4123     else                                                                \
4124       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4125          must invoke it, or, at first, designate it to some graphic     \
4126          register.  Then repeat the loop to actually produce the        \
4127          character.  */                                                 \
4128       dst = encode_invocation_designation (charset, coding, dst,        \
4129                                            &produced_chars);            \
4130   } while (1)
4131
4132
4133 /* Produce codes for a DIMENSION2 character whose character set is
4134    CHARSET and whose position-codes are C1 and C2.  Designation and
4135    invocation codes are also produced in advance if necessary.  */
4136
4137 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4138   do {                                                                  \
4139     int id = CHARSET_ID (charset);                                      \
4140                                                                         \
4141     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4142         && id == charset_jisx0208)                                      \
4143       {                                                                 \
4144         id = charset_jisx0208_1978;                                     \
4145         charset = CHARSET_FROM_ID (id);                                 \
4146       }                                                                 \
4147                                                                         \
4148     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4149       {                                                                 \
4150         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4151           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4152         else                                                            \
4153           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4154         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4155         break;                                                          \
4156       }                                                                 \
4157     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4158       {                                                                 \
4159         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4160         break;                                                          \
4161       }                                                                 \
4162     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4163       {                                                                 \
4164         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4165         break;                                                          \
4166       }                                                                 \
4167     else                                                                \
4168       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4169          must invoke it, or, at first, designate it to some graphic     \
4170          register.  Then repeat the loop to actually produce the        \
4171          character.  */                                                 \
4172       dst = encode_invocation_designation (charset, coding, dst,        \
4173                                            &produced_chars);            \
4174   } while (1)
4175
4176
4177 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4178   do {                                                                     \
4179     unsigned code;                                                         \
4180     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4181                                                                            \
4182     if (CHARSET_DIMENSION (charset) == 1)                                  \
4183       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4184     else                                                                   \
4185       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4186   } while (0)
4187
4188
4189 /* Produce designation and invocation codes at a place pointed by DST
4190    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4191    Return new DST.  */
4192
4193 static unsigned char *
4194 encode_invocation_designation (struct charset *charset,
4195                                struct coding_system *coding,
4196                                unsigned char *dst, ptrdiff_t *p_nchars)
4197 {
4198   bool multibytep = coding->dst_multibyte;
4199   ptrdiff_t produced_chars = *p_nchars;
4200   int reg;                      /* graphic register number */
4201   int id = CHARSET_ID (charset);
4202
4203   /* At first, check designations.  */
4204   for (reg = 0; reg < 4; reg++)
4205     if (id == CODING_ISO_DESIGNATION (coding, reg))
4206       break;
4207
4208   if (reg >= 4)
4209     {
4210       /* CHARSET is not yet designated to any graphic registers.  */
4211       /* At first check the requested designation.  */
4212       reg = CODING_ISO_REQUEST (coding, id);
4213       if (reg < 0)
4214         /* Since CHARSET requests no special designation, designate it
4215            to graphic register 0.  */
4216         reg = 0;
4217
4218       ENCODE_DESIGNATION (charset, reg, coding);
4219     }
4220
4221   if (CODING_ISO_INVOCATION (coding, 0) != reg
4222       && CODING_ISO_INVOCATION (coding, 1) != reg)
4223     {
4224       /* Since the graphic register REG is not invoked to any graphic
4225          planes, invoke it to graphic plane 0.  */
4226       switch (reg)
4227         {
4228         case 0:                 /* graphic register 0 */
4229           ENCODE_SHIFT_IN;
4230           break;
4231
4232         case 1:                 /* graphic register 1 */
4233           ENCODE_SHIFT_OUT;
4234           break;
4235
4236         case 2:                 /* graphic register 2 */
4237           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4238             ENCODE_SINGLE_SHIFT_2;
4239           else
4240             ENCODE_LOCKING_SHIFT_2;
4241           break;
4242
4243         case 3:                 /* graphic register 3 */
4244           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4245             ENCODE_SINGLE_SHIFT_3;
4246           else
4247             ENCODE_LOCKING_SHIFT_3;
4248           break;
4249         }
4250     }
4251
4252   *p_nchars = produced_chars;
4253   return dst;
4254 }
4255
4256
4257 /* Produce codes for designation and invocation to reset the graphic
4258    planes and registers to initial state.  */
4259 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4260   do {                                                                  \
4261     int reg;                                                            \
4262     struct charset *charset;                                            \
4263                                                                         \
4264     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4265       ENCODE_SHIFT_IN;                                                  \
4266     for (reg = 0; reg < 4; reg++)                                       \
4267       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4268           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4269               != CODING_ISO_INITIAL (coding, reg)))                     \
4270         {                                                               \
4271           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4272           ENCODE_DESIGNATION (charset, reg, coding);                    \
4273         }                                                               \
4274   } while (0)
4275
4276
4277 /* Produce designation sequences of charsets in the line started from
4278    CHARBUF to a place pointed by DST, and return the number of
4279    produced bytes.  DST should not directly point a buffer text area
4280    which may be relocated by char_charset call.
4281
4282    If the current block ends before any end-of-line, we may fail to
4283    find all the necessary designations.  */
4284
4285 static ptrdiff_t
4286 encode_designation_at_bol (struct coding_system *coding,
4287                            int *charbuf, int *charbuf_end,
4288                            unsigned char *dst)
4289 {
4290   unsigned char *orig = dst;
4291   struct charset *charset;
4292   /* Table of charsets to be designated to each graphic register.  */
4293   int r[4];
4294   int c, found = 0, reg;
4295   ptrdiff_t produced_chars = 0;
4296   bool multibytep = coding->dst_multibyte;
4297   Lisp_Object attrs;
4298   Lisp_Object charset_list;
4299
4300   attrs = CODING_ID_ATTRS (coding->id);
4301   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4302   if (EQ (charset_list, Qiso_2022))
4303     charset_list = Viso_2022_charset_list;
4304
4305   for (reg = 0; reg < 4; reg++)
4306     r[reg] = -1;
4307
4308   while (charbuf < charbuf_end && found < 4)
4309     {
4310       int id;
4311
4312       c = *charbuf++;
4313       if (c == '\n')
4314         break;
4315       charset = char_charset (c, charset_list, NULL);
4316       id = CHARSET_ID (charset);
4317       reg = CODING_ISO_REQUEST (coding, id);
4318       if (reg >= 0 && r[reg] < 0)
4319         {
4320           found++;
4321           r[reg] = id;
4322         }
4323     }
4324
4325   if (found)
4326     {
4327       for (reg = 0; reg < 4; reg++)
4328         if (r[reg] >= 0
4329             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4330           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4331     }
4332
4333   return dst - orig;
4334 }
4335
4336 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4337
4338 static bool
4339 encode_coding_iso_2022 (struct coding_system *coding)
4340 {
4341   bool multibytep = coding->dst_multibyte;
4342   int *charbuf = coding->charbuf;
4343   int *charbuf_end = charbuf + coding->charbuf_used;
4344   unsigned char *dst = coding->destination + coding->produced;
4345   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4346   int safe_room = 16;
4347   bool bol_designation
4348     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4349        && CODING_ISO_BOL (coding));
4350   ptrdiff_t produced_chars = 0;
4351   Lisp_Object attrs, eol_type, charset_list;
4352   bool ascii_compatible;
4353   int c;
4354   int preferred_charset_id = -1;
4355
4356   CODING_GET_INFO (coding, attrs, charset_list);
4357   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4358   if (VECTORP (eol_type))
4359     eol_type = Qunix;
4360
4361   setup_iso_safe_charsets (attrs);
4362   /* Charset list may have been changed.  */
4363   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4364   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4365
4366   ascii_compatible
4367     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4368        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4369                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4370
4371   while (charbuf < charbuf_end)
4372     {
4373       ASSURE_DESTINATION (safe_room);
4374
4375       if (bol_designation)
4376         {
4377           /* We have to produce designation sequences if any now.  */
4378           unsigned char desig_buf[16];
4379           int nbytes;
4380           ptrdiff_t offset;
4381
4382           charset_map_loaded = 0;
4383           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4384                                               desig_buf);
4385           if (charset_map_loaded
4386               && (offset = coding_change_destination (coding)))
4387             {
4388               dst += offset;
4389               dst_end += offset;
4390             }
4391           memcpy (dst, desig_buf, nbytes);
4392           dst += nbytes;
4393           /* We are sure that designation sequences are all ASCII bytes.  */
4394           produced_chars += nbytes;
4395           bol_designation = 0;
4396           ASSURE_DESTINATION (safe_room);
4397         }
4398
4399       c = *charbuf++;
4400
4401       if (c < 0)
4402         {
4403           /* Handle an annotation.  */
4404           switch (*charbuf)
4405             {
4406             case CODING_ANNOTATE_COMPOSITION_MASK:
4407               /* Not yet implemented.  */
4408               break;
4409             case CODING_ANNOTATE_CHARSET_MASK:
4410               preferred_charset_id = charbuf[2];
4411               if (preferred_charset_id >= 0
4412                   && NILP (Fmemq (make_number (preferred_charset_id),
4413                                   charset_list)))
4414                 preferred_charset_id = -1;
4415               break;
4416             default:
4417               emacs_abort ();
4418             }
4419           charbuf += -c - 1;
4420           continue;
4421         }
4422
4423       /* Now encode the character C.  */
4424       if (c < 0x20 || c == 0x7F)
4425         {
4426           if (c == '\n'
4427               || (c == '\r' && EQ (eol_type, Qmac)))
4428             {
4429               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4430                 ENCODE_RESET_PLANE_AND_REGISTER ();
4431               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4432                 {
4433                   int i;
4434
4435                   for (i = 0; i < 4; i++)
4436                     CODING_ISO_DESIGNATION (coding, i)
4437                       = CODING_ISO_INITIAL (coding, i);
4438                 }
4439               bol_designation = ((CODING_ISO_FLAGS (coding)
4440                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4441                                  != 0);
4442             }
4443           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4444             ENCODE_RESET_PLANE_AND_REGISTER ();
4445           EMIT_ONE_ASCII_BYTE (c);
4446         }
4447       else if (ASCII_CHAR_P (c))
4448         {
4449           if (ascii_compatible)
4450             EMIT_ONE_ASCII_BYTE (c);
4451           else
4452             {
4453               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4454               ENCODE_ISO_CHARACTER (charset, c);
4455             }
4456         }
4457       else if (CHAR_BYTE8_P (c))
4458         {
4459           c = CHAR_TO_BYTE8 (c);
4460           EMIT_ONE_BYTE (c);
4461         }
4462       else
4463         {
4464           struct charset *charset;
4465
4466           if (preferred_charset_id >= 0)
4467             {
4468               bool result;
4469
4470               charset = CHARSET_FROM_ID (preferred_charset_id);
4471               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4472               if (! result)
4473                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4474                                      NULL, charset);
4475             }
4476           else
4477             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4478                                  NULL, charset);
4479           if (!charset)
4480             {
4481               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4482                 {
4483                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4484                   charset = CHARSET_FROM_ID (charset_ascii);
4485                 }
4486               else
4487                 {
4488                   c = coding->default_char;
4489                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4490                                        charset_list, NULL, charset);
4491                 }
4492             }
4493           ENCODE_ISO_CHARACTER (charset, c);
4494         }
4495     }
4496
4497   if (coding->mode & CODING_MODE_LAST_BLOCK
4498       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4499     {
4500       ASSURE_DESTINATION (safe_room);
4501       ENCODE_RESET_PLANE_AND_REGISTER ();
4502     }
4503   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4504   CODING_ISO_BOL (coding) = bol_designation;
4505   coding->produced_char += produced_chars;
4506   coding->produced = dst - coding->destination;
4507   return 0;
4508 }
4509
4510 \f
4511 /*** 8,9. SJIS and BIG5 handlers ***/
4512
4513 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4514    quite widely.  So, for the moment, Emacs supports them in the bare
4515    C code.  But, in the future, they may be supported only by CCL.  */
4516
4517 /* SJIS is a coding system encoding three character sets: ASCII, right
4518    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4519    as is.  A character of charset katakana-jisx0201 is encoded by
4520    "position-code + 0x80".  A character of charset japanese-jisx0208
4521    is encoded in 2-byte but two position-codes are divided and shifted
4522    so that it fit in the range below.
4523
4524    --- CODE RANGE of SJIS ---
4525    (character set)      (range)
4526    ASCII                0x00 .. 0x7F
4527    KATAKANA-JISX0201    0xA0 .. 0xDF
4528    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4529             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4530    -------------------------------
4531
4532 */
4533
4534 /* BIG5 is a coding system encoding two character sets: ASCII and
4535    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4536    character set and is encoded in two-byte.
4537
4538    --- CODE RANGE of BIG5 ---
4539    (character set)      (range)
4540    ASCII                0x00 .. 0x7F
4541    Big5 (1st byte)      0xA1 .. 0xFE
4542         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4543    --------------------------
4544
4545   */
4546
4547 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4548    Return true if a text is encoded in SJIS.  */
4549
4550 static bool
4551 detect_coding_sjis (struct coding_system *coding,
4552                     struct coding_detection_info *detect_info)
4553 {
4554   const unsigned char *src = coding->source, *src_base;
4555   const unsigned char *src_end = coding->source + coding->src_bytes;
4556   bool multibytep = coding->src_multibyte;
4557   ptrdiff_t consumed_chars = 0;
4558   int found = 0;
4559   int c;
4560   Lisp_Object attrs, charset_list;
4561   int max_first_byte_of_2_byte_code;
4562
4563   CODING_GET_INFO (coding, attrs, charset_list);
4564   max_first_byte_of_2_byte_code
4565     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4566
4567   detect_info->checked |= CATEGORY_MASK_SJIS;
4568   /* A coding system of this category is always ASCII compatible.  */
4569   src += coding->head_ascii;
4570
4571   while (1)
4572     {
4573       src_base = src;
4574       ONE_MORE_BYTE (c);
4575       if (c < 0x80)
4576         continue;
4577       if ((c >= 0x81 && c <= 0x9F)
4578           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4579         {
4580           ONE_MORE_BYTE (c);
4581           if (c < 0x40 || c == 0x7F || c > 0xFC)
4582             break;
4583           found = CATEGORY_MASK_SJIS;
4584         }
4585       else if (c >= 0xA0 && c < 0xE0)
4586         found = CATEGORY_MASK_SJIS;
4587       else
4588         break;
4589     }
4590   detect_info->rejected |= CATEGORY_MASK_SJIS;
4591   return 0;
4592
4593  no_more_source:
4594   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4595     {
4596       detect_info->rejected |= CATEGORY_MASK_SJIS;
4597       return 0;
4598     }
4599   detect_info->found |= found;
4600   return 1;
4601 }
4602
4603 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4604    Return true if a text is encoded in BIG5.  */
4605
4606 static bool
4607 detect_coding_big5 (struct coding_system *coding,
4608                     struct coding_detection_info *detect_info)
4609 {
4610   const unsigned char *src = coding->source, *src_base;
4611   const unsigned char *src_end = coding->source + coding->src_bytes;
4612   bool multibytep = coding->src_multibyte;
4613   ptrdiff_t consumed_chars = 0;
4614   int found = 0;
4615   int c;
4616
4617   detect_info->checked |= CATEGORY_MASK_BIG5;
4618   /* A coding system of this category is always ASCII compatible.  */
4619   src += coding->head_ascii;
4620
4621   while (1)
4622     {
4623       src_base = src;
4624       ONE_MORE_BYTE (c);
4625       if (c < 0x80)
4626         continue;
4627       if (c >= 0xA1)
4628         {
4629           ONE_MORE_BYTE (c);
4630           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4631             return 0;
4632           found = CATEGORY_MASK_BIG5;
4633         }
4634       else
4635         break;
4636     }
4637   detect_info->rejected |= CATEGORY_MASK_BIG5;
4638   return 0;
4639
4640  no_more_source:
4641   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4642     {
4643       detect_info->rejected |= CATEGORY_MASK_BIG5;
4644       return 0;
4645     }
4646   detect_info->found |= found;
4647   return 1;
4648 }
4649
4650 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4651
4652 static void
4653 decode_coding_sjis (struct coding_system *coding)
4654 {
4655   const unsigned char *src = coding->source + coding->consumed;
4656   const unsigned char *src_end = coding->source + coding->src_bytes;
4657   const unsigned char *src_base;
4658   int *charbuf = coding->charbuf + coding->charbuf_used;
4659   /* We may produce one charset annotation in one loop and one more at
4660      the end.  */
4661   int *charbuf_end
4662     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4663   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4664   bool multibytep = coding->src_multibyte;
4665   struct charset *charset_roman, *charset_kanji, *charset_kana;
4666   struct charset *charset_kanji2;
4667   Lisp_Object attrs, charset_list, val;
4668   ptrdiff_t char_offset = coding->produced_char;
4669   ptrdiff_t last_offset = char_offset;
4670   int last_id = charset_ascii;
4671   bool eol_dos
4672     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4673   int byte_after_cr = -1;
4674
4675   CODING_GET_INFO (coding, attrs, charset_list);
4676
4677   val = charset_list;
4678   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4679   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4680   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4681   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4682
4683   while (1)
4684     {
4685       int c, c1;
4686       struct charset *charset;
4687
4688       src_base = src;
4689       consumed_chars_base = consumed_chars;
4690
4691       if (charbuf >= charbuf_end)
4692         {
4693           if (byte_after_cr >= 0)
4694             src_base--;
4695           break;
4696         }
4697
4698       if (byte_after_cr >= 0)
4699         c = byte_after_cr, byte_after_cr = -1;
4700       else
4701         ONE_MORE_BYTE (c);
4702       if (c < 0)
4703         goto invalid_code;
4704       if (c < 0x80)
4705         {
4706           if (eol_dos && c == '\r')
4707             ONE_MORE_BYTE (byte_after_cr);
4708           charset = charset_roman;
4709         }
4710       else if (c == 0x80 || c == 0xA0)
4711         goto invalid_code;
4712       else if (c >= 0xA1 && c <= 0xDF)
4713         {
4714           /* SJIS -> JISX0201-Kana */
4715           c &= 0x7F;
4716           charset = charset_kana;
4717         }
4718       else if (c <= 0xEF)
4719         {
4720           /* SJIS -> JISX0208 */
4721           ONE_MORE_BYTE (c1);
4722           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4723             goto invalid_code;
4724           c = (c << 8) | c1;
4725           SJIS_TO_JIS (c);
4726           charset = charset_kanji;
4727         }
4728       else if (c <= 0xFC && charset_kanji2)
4729         {
4730           /* SJIS -> JISX0213-2 */
4731           ONE_MORE_BYTE (c1);
4732           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4733             goto invalid_code;
4734           c = (c << 8) | c1;
4735           SJIS_TO_JIS2 (c);
4736           charset = charset_kanji2;
4737         }
4738       else
4739         goto invalid_code;
4740       if (charset->id != charset_ascii
4741           && last_id != charset->id)
4742         {
4743           if (last_id != charset_ascii)
4744             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4745           last_id = charset->id;
4746           last_offset = char_offset;
4747         }
4748       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4749       *charbuf++ = c;
4750       char_offset++;
4751       continue;
4752
4753     invalid_code:
4754       src = src_base;
4755       consumed_chars = consumed_chars_base;
4756       ONE_MORE_BYTE (c);
4757       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4758       char_offset++;
4759       coding->errors++;
4760     }
4761
4762  no_more_source:
4763   if (last_id != charset_ascii)
4764     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4765   coding->consumed_char += consumed_chars_base;
4766   coding->consumed = src_base - coding->source;
4767   coding->charbuf_used = charbuf - coding->charbuf;
4768 }
4769
4770 static void
4771 decode_coding_big5 (struct coding_system *coding)
4772 {
4773   const unsigned char *src = coding->source + coding->consumed;
4774   const unsigned char *src_end = coding->source + coding->src_bytes;
4775   const unsigned char *src_base;
4776   int *charbuf = coding->charbuf + coding->charbuf_used;
4777   /* We may produce one charset annotation in one loop and one more at
4778      the end.  */
4779   int *charbuf_end
4780     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4781   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4782   bool multibytep = coding->src_multibyte;
4783   struct charset *charset_roman, *charset_big5;
4784   Lisp_Object attrs, charset_list, val;
4785   ptrdiff_t char_offset = coding->produced_char;
4786   ptrdiff_t last_offset = char_offset;
4787   int last_id = charset_ascii;
4788   bool eol_dos
4789     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4790   int byte_after_cr = -1;
4791
4792   CODING_GET_INFO (coding, attrs, charset_list);
4793   val = charset_list;
4794   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4795   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4796
4797   while (1)
4798     {
4799       int c, c1;
4800       struct charset *charset;
4801
4802       src_base = src;
4803       consumed_chars_base = consumed_chars;
4804
4805       if (charbuf >= charbuf_end)
4806         {
4807           if (byte_after_cr >= 0)
4808             src_base--;
4809           break;
4810         }
4811
4812       if (byte_after_cr >= 0)
4813         c = byte_after_cr, byte_after_cr = -1;
4814       else
4815         ONE_MORE_BYTE (c);
4816
4817       if (c < 0)
4818         goto invalid_code;
4819       if (c < 0x80)
4820         {
4821           if (eol_dos && c == '\r')
4822             ONE_MORE_BYTE (byte_after_cr);
4823           charset = charset_roman;
4824         }
4825       else
4826         {
4827           /* BIG5 -> Big5 */
4828           if (c < 0xA1 || c > 0xFE)
4829             goto invalid_code;
4830           ONE_MORE_BYTE (c1);
4831           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4832             goto invalid_code;
4833           c = c << 8 | c1;
4834           charset = charset_big5;
4835         }
4836       if (charset->id != charset_ascii
4837           && last_id != charset->id)
4838         {
4839           if (last_id != charset_ascii)
4840             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4841           last_id = charset->id;
4842           last_offset = char_offset;
4843         }
4844       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4845       *charbuf++ = c;
4846       char_offset++;
4847       continue;
4848
4849     invalid_code:
4850       src = src_base;
4851       consumed_chars = consumed_chars_base;
4852       ONE_MORE_BYTE (c);
4853       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4854       char_offset++;
4855       coding->errors++;
4856     }
4857
4858  no_more_source:
4859   if (last_id != charset_ascii)
4860     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4861   coding->consumed_char += consumed_chars_base;
4862   coding->consumed = src_base - coding->source;
4863   coding->charbuf_used = charbuf - coding->charbuf;
4864 }
4865
4866 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4867    This function can encode charsets `ascii', `katakana-jisx0201',
4868    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4869    are sure that all these charsets are registered as official charset
4870    (i.e. do not have extended leading-codes).  Characters of other
4871    charsets are produced without any encoding.  */
4872
4873 static bool
4874 encode_coding_sjis (struct coding_system *coding)
4875 {
4876   bool multibytep = coding->dst_multibyte;
4877   int *charbuf = coding->charbuf;
4878   int *charbuf_end = charbuf + coding->charbuf_used;
4879   unsigned char *dst = coding->destination + coding->produced;
4880   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4881   int safe_room = 4;
4882   ptrdiff_t produced_chars = 0;
4883   Lisp_Object attrs, charset_list, val;
4884   bool ascii_compatible;
4885   struct charset *charset_kanji, *charset_kana;
4886   struct charset *charset_kanji2;
4887   int c;
4888
4889   CODING_GET_INFO (coding, attrs, charset_list);
4890   val = XCDR (charset_list);
4891   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4892   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4893   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4894
4895   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4896
4897   while (charbuf < charbuf_end)
4898     {
4899       ASSURE_DESTINATION (safe_room);
4900       c = *charbuf++;
4901       /* Now encode the character C.  */
4902       if (ASCII_CHAR_P (c) && ascii_compatible)
4903         EMIT_ONE_ASCII_BYTE (c);
4904       else if (CHAR_BYTE8_P (c))
4905         {
4906           c = CHAR_TO_BYTE8 (c);
4907           EMIT_ONE_BYTE (c);
4908         }
4909       else
4910         {
4911           unsigned code;
4912           struct charset *charset;
4913           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4914                                &code, charset);
4915
4916           if (!charset)
4917             {
4918               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4919                 {
4920                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4921                   charset = CHARSET_FROM_ID (charset_ascii);
4922                 }
4923               else
4924                 {
4925                   c = coding->default_char;
4926                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4927                                        charset_list, &code, charset);
4928                 }
4929             }
4930           if (code == CHARSET_INVALID_CODE (charset))
4931             emacs_abort ();
4932           if (charset == charset_kanji)
4933             {
4934               int c1, c2;
4935               JIS_TO_SJIS (code);
4936               c1 = code >> 8, c2 = code & 0xFF;
4937               EMIT_TWO_BYTES (c1, c2);
4938             }
4939           else if (charset == charset_kana)
4940             EMIT_ONE_BYTE (code | 0x80);
4941           else if (charset_kanji2 && charset == charset_kanji2)
4942             {
4943               int c1, c2;
4944
4945               c1 = code >> 8;
4946               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4947                   || c1 == 0x28
4948                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4949                 {
4950                   JIS_TO_SJIS2 (code);
4951                   c1 = code >> 8, c2 = code & 0xFF;
4952                   EMIT_TWO_BYTES (c1, c2);
4953                 }
4954               else
4955                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4956             }
4957           else
4958             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4959         }
4960     }
4961   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4962   coding->produced_char += produced_chars;
4963   coding->produced = dst - coding->destination;
4964   return 0;
4965 }
4966
4967 static bool
4968 encode_coding_big5 (struct coding_system *coding)
4969 {
4970   bool multibytep = coding->dst_multibyte;
4971   int *charbuf = coding->charbuf;
4972   int *charbuf_end = charbuf + coding->charbuf_used;
4973   unsigned char *dst = coding->destination + coding->produced;
4974   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4975   int safe_room = 4;
4976   ptrdiff_t produced_chars = 0;
4977   Lisp_Object attrs, charset_list, val;
4978   bool ascii_compatible;
4979   struct charset *charset_big5;
4980   int c;
4981
4982   CODING_GET_INFO (coding, attrs, charset_list);
4983   val = XCDR (charset_list);
4984   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4985   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4986
4987   while (charbuf < charbuf_end)
4988     {
4989       ASSURE_DESTINATION (safe_room);
4990       c = *charbuf++;
4991       /* Now encode the character C.  */
4992       if (ASCII_CHAR_P (c) && ascii_compatible)
4993         EMIT_ONE_ASCII_BYTE (c);
4994       else if (CHAR_BYTE8_P (c))
4995         {
4996           c = CHAR_TO_BYTE8 (c);
4997           EMIT_ONE_BYTE (c);
4998         }
4999       else
5000         {
5001           unsigned code;
5002           struct charset *charset;
5003           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5004                                &code, charset);
5005
5006           if (! charset)
5007             {
5008               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5009                 {
5010                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5011                   charset = CHARSET_FROM_ID (charset_ascii);
5012                 }
5013               else
5014                 {
5015                   c = coding->default_char;
5016                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5017                                        charset_list, &code, charset);
5018                 }
5019             }
5020           if (code == CHARSET_INVALID_CODE (charset))
5021             emacs_abort ();
5022           if (charset == charset_big5)
5023             {
5024               int c1, c2;
5025
5026               c1 = code >> 8, c2 = code & 0xFF;
5027               EMIT_TWO_BYTES (c1, c2);
5028             }
5029           else
5030             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5031         }
5032     }
5033   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5034   coding->produced_char += produced_chars;
5035   coding->produced = dst - coding->destination;
5036   return 0;
5037 }
5038
5039 \f
5040 /*** 10. CCL handlers ***/
5041
5042 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5043    Return true if a text is encoded in a coding system of which
5044    encoder/decoder are written in CCL program.  */
5045
5046 static bool
5047 detect_coding_ccl (struct coding_system *coding,
5048                    struct coding_detection_info *detect_info)
5049 {
5050   const unsigned char *src = coding->source, *src_base;
5051   const unsigned char *src_end = coding->source + coding->src_bytes;
5052   bool multibytep = coding->src_multibyte;
5053   ptrdiff_t consumed_chars = 0;
5054   int found = 0;
5055   unsigned char *valids;
5056   ptrdiff_t head_ascii = coding->head_ascii;
5057   Lisp_Object attrs;
5058
5059   detect_info->checked |= CATEGORY_MASK_CCL;
5060
5061   coding = &coding_categories[coding_category_ccl];
5062   valids = CODING_CCL_VALIDS (coding);
5063   attrs = CODING_ID_ATTRS (coding->id);
5064   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5065     src += head_ascii;
5066
5067   while (1)
5068     {
5069       int c;
5070
5071       src_base = src;
5072       ONE_MORE_BYTE (c);
5073       if (c < 0 || ! valids[c])
5074         break;
5075       if ((valids[c] > 1))
5076         found = CATEGORY_MASK_CCL;
5077     }
5078   detect_info->rejected |= CATEGORY_MASK_CCL;
5079   return 0;
5080
5081  no_more_source:
5082   detect_info->found |= found;
5083   return 1;
5084 }
5085
5086 static void
5087 decode_coding_ccl (struct coding_system *coding)
5088 {
5089   const unsigned char *src = coding->source + coding->consumed;
5090   const unsigned char *src_end = coding->source + coding->src_bytes;
5091   int *charbuf = coding->charbuf + coding->charbuf_used;
5092   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5093   ptrdiff_t consumed_chars = 0;
5094   bool multibytep = coding->src_multibyte;
5095   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5096   int source_charbuf[1024];
5097   int source_byteidx[1025];
5098   Lisp_Object attrs, charset_list;
5099
5100   CODING_GET_INFO (coding, attrs, charset_list);
5101
5102   while (1)
5103     {
5104       const unsigned char *p = src;
5105       ptrdiff_t offset;
5106       int i = 0;
5107
5108       if (multibytep)
5109         {
5110           while (i < 1024 && p < src_end)
5111             {
5112               source_byteidx[i] = p - src;
5113               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5114             }
5115           source_byteidx[i] = p - src;
5116         }
5117       else
5118         while (i < 1024 && p < src_end)
5119           source_charbuf[i++] = *p++;
5120
5121       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5122         ccl->last_block = 1;
5123       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5124       charset_map_loaded = 0;
5125       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5126                   charset_list);
5127       if (charset_map_loaded
5128           && (offset = coding_change_source (coding)))
5129         {
5130           p += offset;
5131           src += offset;
5132           src_end += offset;
5133         }
5134       charbuf += ccl->produced;
5135       if (multibytep)
5136         src += source_byteidx[ccl->consumed];
5137       else
5138         src += ccl->consumed;
5139       consumed_chars += ccl->consumed;
5140       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5141         break;
5142     }
5143
5144   switch (ccl->status)
5145     {
5146     case CCL_STAT_SUSPEND_BY_SRC:
5147       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5148       break;
5149     case CCL_STAT_SUSPEND_BY_DST:
5150       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5151       break;
5152     case CCL_STAT_QUIT:
5153     case CCL_STAT_INVALID_CMD:
5154       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5155       break;
5156     default:
5157       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5158       break;
5159     }
5160   coding->consumed_char += consumed_chars;
5161   coding->consumed = src - coding->source;
5162   coding->charbuf_used = charbuf - coding->charbuf;
5163 }
5164
5165 static bool
5166 encode_coding_ccl (struct coding_system *coding)
5167 {
5168   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5169   bool multibytep = coding->dst_multibyte;
5170   int *charbuf = coding->charbuf;
5171   int *charbuf_end = charbuf + coding->charbuf_used;
5172   unsigned char *dst = coding->destination + coding->produced;
5173   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5174   int destination_charbuf[1024];
5175   ptrdiff_t produced_chars = 0;
5176   int i;
5177   Lisp_Object attrs, charset_list;
5178
5179   CODING_GET_INFO (coding, attrs, charset_list);
5180   if (coding->consumed_char == coding->src_chars
5181       && coding->mode & CODING_MODE_LAST_BLOCK)
5182     ccl->last_block = 1;
5183
5184   do
5185     {
5186       ptrdiff_t offset;
5187
5188       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5189       charset_map_loaded = 0;
5190       ccl_driver (ccl, charbuf, destination_charbuf,
5191                   charbuf_end - charbuf, 1024, charset_list);
5192       if (charset_map_loaded
5193           && (offset = coding_change_destination (coding)))
5194         dst += offset;
5195       if (multibytep)
5196         {
5197           ASSURE_DESTINATION (ccl->produced * 2);
5198           for (i = 0; i < ccl->produced; i++)
5199             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5200         }
5201       else
5202         {
5203           ASSURE_DESTINATION (ccl->produced);
5204           for (i = 0; i < ccl->produced; i++)
5205             *dst++ = destination_charbuf[i] & 0xFF;
5206           produced_chars += ccl->produced;
5207         }
5208       charbuf += ccl->consumed;
5209       if (ccl->status == CCL_STAT_QUIT
5210           || ccl->status == CCL_STAT_INVALID_CMD)
5211         break;
5212     }
5213   while (charbuf < charbuf_end);
5214
5215   switch (ccl->status)
5216     {
5217     case CCL_STAT_SUSPEND_BY_SRC:
5218       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5219       break;
5220     case CCL_STAT_SUSPEND_BY_DST:
5221       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5222       break;
5223     case CCL_STAT_QUIT:
5224     case CCL_STAT_INVALID_CMD:
5225       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5226       break;
5227     default:
5228       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5229       break;
5230     }
5231
5232   coding->produced_char += produced_chars;
5233   coding->produced = dst - coding->destination;
5234   return 0;
5235 }
5236
5237 \f
5238 /*** 10, 11. no-conversion handlers ***/
5239
5240 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5241
5242 static void
5243 decode_coding_raw_text (struct coding_system *coding)
5244 {
5245   bool eol_dos
5246     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5247
5248   coding->chars_at_source = 1;
5249   coding->consumed_char = coding->src_chars;
5250   coding->consumed = coding->src_bytes;
5251   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5252     {
5253       coding->consumed_char--;
5254       coding->consumed--;
5255       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5256     }
5257   else
5258     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5259 }
5260
5261 static bool
5262 encode_coding_raw_text (struct coding_system *coding)
5263 {
5264   bool multibytep = coding->dst_multibyte;
5265   int *charbuf = coding->charbuf;
5266   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5267   unsigned char *dst = coding->destination + coding->produced;
5268   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5269   ptrdiff_t produced_chars = 0;
5270   int c;
5271
5272   if (multibytep)
5273     {
5274       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5275
5276       if (coding->src_multibyte)
5277         while (charbuf < charbuf_end)
5278           {
5279             ASSURE_DESTINATION (safe_room);
5280             c = *charbuf++;
5281             if (ASCII_CHAR_P (c))
5282               EMIT_ONE_ASCII_BYTE (c);
5283             else if (CHAR_BYTE8_P (c))
5284               {
5285                 c = CHAR_TO_BYTE8 (c);
5286                 EMIT_ONE_BYTE (c);
5287               }
5288             else
5289               {
5290                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5291
5292                 CHAR_STRING_ADVANCE (c, p1);
5293                 do
5294                   {
5295                     EMIT_ONE_BYTE (*p0);
5296                     p0++;
5297                   }
5298                 while (p0 < p1);
5299               }
5300           }
5301       else
5302         while (charbuf < charbuf_end)
5303           {
5304             ASSURE_DESTINATION (safe_room);
5305             c = *charbuf++;
5306             EMIT_ONE_BYTE (c);
5307           }
5308     }
5309   else
5310     {
5311       if (coding->src_multibyte)
5312         {
5313           int safe_room = MAX_MULTIBYTE_LENGTH;
5314
5315           while (charbuf < charbuf_end)
5316             {
5317               ASSURE_DESTINATION (safe_room);
5318               c = *charbuf++;
5319               if (ASCII_CHAR_P (c))
5320                 *dst++ = c;
5321               else if (CHAR_BYTE8_P (c))
5322                 *dst++ = CHAR_TO_BYTE8 (c);
5323               else
5324                 CHAR_STRING_ADVANCE (c, dst);
5325             }
5326         }
5327       else
5328         {
5329           ASSURE_DESTINATION (charbuf_end - charbuf);
5330           while (charbuf < charbuf_end && dst < dst_end)
5331             *dst++ = *charbuf++;
5332         }
5333       produced_chars = dst - (coding->destination + coding->produced);
5334     }
5335   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5336   coding->produced_char += produced_chars;
5337   coding->produced = dst - coding->destination;
5338   return 0;
5339 }
5340
5341 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5342    Return true if a text is encoded in a charset-based coding system.  */
5343
5344 static bool
5345 detect_coding_charset (struct coding_system *coding,
5346                        struct coding_detection_info *detect_info)
5347 {
5348   const unsigned char *src = coding->source, *src_base;
5349   const unsigned char *src_end = coding->source + coding->src_bytes;
5350   bool multibytep = coding->src_multibyte;
5351   ptrdiff_t consumed_chars = 0;
5352   Lisp_Object attrs, valids, name;
5353   int found = 0;
5354   ptrdiff_t head_ascii = coding->head_ascii;
5355   bool check_latin_extra = 0;
5356
5357   detect_info->checked |= CATEGORY_MASK_CHARSET;
5358
5359   coding = &coding_categories[coding_category_charset];
5360   attrs = CODING_ID_ATTRS (coding->id);
5361   valids = AREF (attrs, coding_attr_charset_valids);
5362   name = CODING_ID_NAME (coding->id);
5363   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5364                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5365       || strncmp (SSDATA (SYMBOL_NAME (name)),
5366                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5367     check_latin_extra = 1;
5368
5369   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5370     src += head_ascii;
5371
5372   while (1)
5373     {
5374       int c;
5375       Lisp_Object val;
5376       struct charset *charset;
5377       int dim, idx;
5378
5379       src_base = src;
5380       ONE_MORE_BYTE (c);
5381       if (c < 0)
5382         continue;
5383       val = AREF (valids, c);
5384       if (NILP (val))
5385         break;
5386       if (c >= 0x80)
5387         {
5388           if (c < 0xA0
5389               && check_latin_extra
5390               && (!VECTORP (Vlatin_extra_code_table)
5391                   || NILP (AREF (Vlatin_extra_code_table, c))))
5392             break;
5393           found = CATEGORY_MASK_CHARSET;
5394         }
5395       if (INTEGERP (val))
5396         {
5397           charset = CHARSET_FROM_ID (XFASTINT (val));
5398           dim = CHARSET_DIMENSION (charset);
5399           for (idx = 1; idx < dim; idx++)
5400             {
5401               if (src == src_end)
5402                 goto too_short;
5403               ONE_MORE_BYTE (c);
5404               if (c < charset->code_space[(dim - 1 - idx) * 4]
5405                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5406                 break;
5407             }
5408           if (idx < dim)
5409             break;
5410         }
5411       else
5412         {
5413           idx = 1;
5414           for (; CONSP (val); val = XCDR (val))
5415             {
5416               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5417               dim = CHARSET_DIMENSION (charset);
5418               while (idx < dim)
5419                 {
5420                   if (src == src_end)
5421                     goto too_short;
5422                   ONE_MORE_BYTE (c);
5423                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5424                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5425                     break;
5426                   idx++;
5427                 }
5428               if (idx == dim)
5429                 {
5430                   val = Qnil;
5431                   break;
5432                 }
5433             }
5434           if (CONSP (val))
5435             break;
5436         }
5437     }
5438  too_short:
5439   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5440   return 0;
5441
5442  no_more_source:
5443   detect_info->found |= found;
5444   return 1;
5445 }
5446
5447 static void
5448 decode_coding_charset (struct coding_system *coding)
5449 {
5450   const unsigned char *src = coding->source + coding->consumed;
5451   const unsigned char *src_end = coding->source + coding->src_bytes;
5452   const unsigned char *src_base;
5453   int *charbuf = coding->charbuf + coding->charbuf_used;
5454   /* We may produce one charset annotation in one loop and one more at
5455      the end.  */
5456   int *charbuf_end
5457     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5458   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5459   bool multibytep = coding->src_multibyte;
5460   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5461   Lisp_Object valids;
5462   ptrdiff_t char_offset = coding->produced_char;
5463   ptrdiff_t last_offset = char_offset;
5464   int last_id = charset_ascii;
5465   bool eol_dos
5466     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5467   int byte_after_cr = -1;
5468
5469   valids = AREF (attrs, coding_attr_charset_valids);
5470
5471   while (1)
5472     {
5473       int c;
5474       Lisp_Object val;
5475       struct charset *charset;
5476       int dim;
5477       int len = 1;
5478       unsigned code;
5479
5480       src_base = src;
5481       consumed_chars_base = consumed_chars;
5482
5483       if (charbuf >= charbuf_end)
5484         {
5485           if (byte_after_cr >= 0)
5486             src_base--;
5487           break;
5488         }
5489
5490       if (byte_after_cr >= 0)
5491         {
5492           c = byte_after_cr;
5493           byte_after_cr = -1;
5494         }
5495       else
5496         {
5497           ONE_MORE_BYTE (c);
5498           if (eol_dos && c == '\r')
5499             ONE_MORE_BYTE (byte_after_cr);
5500         }
5501       if (c < 0)
5502         goto invalid_code;
5503       code = c;
5504
5505       val = AREF (valids, c);
5506       if (! INTEGERP (val) && ! CONSP (val))
5507         goto invalid_code;
5508       if (INTEGERP (val))
5509         {
5510           charset = CHARSET_FROM_ID (XFASTINT (val));
5511           dim = CHARSET_DIMENSION (charset);
5512           while (len < dim)
5513             {
5514               ONE_MORE_BYTE (c);
5515               code = (code << 8) | c;
5516               len++;
5517             }
5518           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5519                               charset, code, c);
5520         }
5521       else
5522         {
5523           /* VAL is a list of charset IDs.  It is assured that the
5524              list is sorted by charset dimensions (smaller one
5525              comes first).  */
5526           while (CONSP (val))
5527             {
5528               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5529               dim = CHARSET_DIMENSION (charset);
5530               while (len < dim)
5531                 {
5532                   ONE_MORE_BYTE (c);
5533                   code = (code << 8) | c;
5534                   len++;
5535                 }
5536               CODING_DECODE_CHAR (coding, src, src_base,
5537                                   src_end, charset, code, c);
5538               if (c >= 0)
5539                 break;
5540               val = XCDR (val);
5541             }
5542         }
5543       if (c < 0)
5544         goto invalid_code;
5545       if (charset->id != charset_ascii
5546           && last_id != charset->id)
5547         {
5548           if (last_id != charset_ascii)
5549             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5550           last_id = charset->id;
5551           last_offset = char_offset;
5552         }
5553
5554       *charbuf++ = c;
5555       char_offset++;
5556       continue;
5557
5558     invalid_code:
5559       src = src_base;
5560       consumed_chars = consumed_chars_base;
5561       ONE_MORE_BYTE (c);
5562       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5563       char_offset++;
5564       coding->errors++;
5565     }
5566
5567  no_more_source:
5568   if (last_id != charset_ascii)
5569     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5570   coding->consumed_char += consumed_chars_base;
5571   coding->consumed = src_base - coding->source;
5572   coding->charbuf_used = charbuf - coding->charbuf;
5573 }
5574
5575 static bool
5576 encode_coding_charset (struct coding_system *coding)
5577 {
5578   bool multibytep = coding->dst_multibyte;
5579   int *charbuf = coding->charbuf;
5580   int *charbuf_end = charbuf + coding->charbuf_used;
5581   unsigned char *dst = coding->destination + coding->produced;
5582   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5583   int safe_room = MAX_MULTIBYTE_LENGTH;
5584   ptrdiff_t produced_chars = 0;
5585   Lisp_Object attrs, charset_list;
5586   bool ascii_compatible;
5587   int c;
5588
5589   CODING_GET_INFO (coding, attrs, charset_list);
5590   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5591
5592   while (charbuf < charbuf_end)
5593     {
5594       struct charset *charset;
5595       unsigned code;
5596
5597       ASSURE_DESTINATION (safe_room);
5598       c = *charbuf++;
5599       if (ascii_compatible && ASCII_CHAR_P (c))
5600         EMIT_ONE_ASCII_BYTE (c);
5601       else if (CHAR_BYTE8_P (c))
5602         {
5603           c = CHAR_TO_BYTE8 (c);
5604           EMIT_ONE_BYTE (c);
5605         }
5606       else
5607         {
5608           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5609                                &code, charset);
5610
5611           if (charset)
5612             {
5613               if (CHARSET_DIMENSION (charset) == 1)
5614                 EMIT_ONE_BYTE (code);
5615               else if (CHARSET_DIMENSION (charset) == 2)
5616                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5617               else if (CHARSET_DIMENSION (charset) == 3)
5618                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5619               else
5620                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5621                                  (code >> 8) & 0xFF, code & 0xFF);
5622             }
5623           else
5624             {
5625               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5626                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5627               else
5628                 c = coding->default_char;
5629               EMIT_ONE_BYTE (c);
5630             }
5631         }
5632     }
5633
5634   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5635   coding->produced_char += produced_chars;
5636   coding->produced = dst - coding->destination;
5637   return 0;
5638 }
5639
5640 \f
5641 /*** 7. C library functions ***/
5642
5643 /* Setup coding context CODING from information about CODING_SYSTEM.
5644    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5645    CODING_SYSTEM is invalid, signal an error.  */
5646
5647 void
5648 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5649 {
5650   Lisp_Object attrs;
5651   Lisp_Object eol_type;
5652   Lisp_Object coding_type;
5653   Lisp_Object val;
5654
5655   if (NILP (coding_system))
5656     coding_system = Qundecided;
5657
5658   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5659
5660   attrs = CODING_ID_ATTRS (coding->id);
5661   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5662
5663   coding->mode = 0;
5664   if (VECTORP (eol_type))
5665     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5666                             | CODING_REQUIRE_DETECTION_MASK);
5667   else if (! EQ (eol_type, Qunix))
5668     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5669                             | CODING_REQUIRE_ENCODING_MASK);
5670   else
5671     coding->common_flags = 0;
5672   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5673     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5674   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5675     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5676   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5677     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5678
5679   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5680   coding->max_charset_id = SCHARS (val) - 1;
5681   coding->safe_charsets = SDATA (val);
5682   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5683   coding->carryover_bytes = 0;
5684
5685   coding_type = CODING_ATTR_TYPE (attrs);
5686   if (EQ (coding_type, Qundecided))
5687     {
5688       coding->detector = NULL;
5689       coding->decoder = decode_coding_raw_text;
5690       coding->encoder = encode_coding_raw_text;
5691       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5692     }
5693   else if (EQ (coding_type, Qiso_2022))
5694     {
5695       int i;
5696       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5697
5698       /* Invoke graphic register 0 to plane 0.  */
5699       CODING_ISO_INVOCATION (coding, 0) = 0;
5700       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5701       CODING_ISO_INVOCATION (coding, 1)
5702         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5703       /* Setup the initial status of designation.  */
5704       for (i = 0; i < 4; i++)
5705         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5706       /* Not single shifting initially.  */
5707       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5708       /* Beginning of buffer should also be regarded as bol. */
5709       CODING_ISO_BOL (coding) = 1;
5710       coding->detector = detect_coding_iso_2022;
5711       coding->decoder = decode_coding_iso_2022;
5712       coding->encoder = encode_coding_iso_2022;
5713       if (flags & CODING_ISO_FLAG_SAFE)
5714         coding->mode |= CODING_MODE_SAFE_ENCODING;
5715       coding->common_flags
5716         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5717             | CODING_REQUIRE_FLUSHING_MASK);
5718       if (flags & CODING_ISO_FLAG_COMPOSITION)
5719         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5720       if (flags & CODING_ISO_FLAG_DESIGNATION)
5721         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5722       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5723         {
5724           setup_iso_safe_charsets (attrs);
5725           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5726           coding->max_charset_id = SCHARS (val) - 1;
5727           coding->safe_charsets = SDATA (val);
5728         }
5729       CODING_ISO_FLAGS (coding) = flags;
5730       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5731       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5732       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5733       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5734     }
5735   else if (EQ (coding_type, Qcharset))
5736     {
5737       coding->detector = detect_coding_charset;
5738       coding->decoder = decode_coding_charset;
5739       coding->encoder = encode_coding_charset;
5740       coding->common_flags
5741         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5742     }
5743   else if (EQ (coding_type, Qutf_8))
5744     {
5745       val = AREF (attrs, coding_attr_utf_bom);
5746       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5747                                    : EQ (val, Qt) ? utf_with_bom
5748                                    : utf_without_bom);
5749       coding->detector = detect_coding_utf_8;
5750       coding->decoder = decode_coding_utf_8;
5751       coding->encoder = encode_coding_utf_8;
5752       coding->common_flags
5753         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5754       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5755         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5756     }
5757   else if (EQ (coding_type, Qutf_16))
5758     {
5759       val = AREF (attrs, coding_attr_utf_bom);
5760       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5761                                     : EQ (val, Qt) ? utf_with_bom
5762                                     : utf_without_bom);
5763       val = AREF (attrs, coding_attr_utf_16_endian);
5764       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5765                                        : utf_16_little_endian);
5766       CODING_UTF_16_SURROGATE (coding) = 0;
5767       coding->detector = detect_coding_utf_16;
5768       coding->decoder = decode_coding_utf_16;
5769       coding->encoder = encode_coding_utf_16;
5770       coding->common_flags
5771         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5772       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5773         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5774     }
5775   else if (EQ (coding_type, Qccl))
5776     {
5777       coding->detector = detect_coding_ccl;
5778       coding->decoder = decode_coding_ccl;
5779       coding->encoder = encode_coding_ccl;
5780       coding->common_flags
5781         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5782             | CODING_REQUIRE_FLUSHING_MASK);
5783     }
5784   else if (EQ (coding_type, Qemacs_mule))
5785     {
5786       coding->detector = detect_coding_emacs_mule;
5787       coding->decoder = decode_coding_emacs_mule;
5788       coding->encoder = encode_coding_emacs_mule;
5789       coding->common_flags
5790         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5791       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5792           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5793         {
5794           Lisp_Object tail, safe_charsets;
5795           int max_charset_id = 0;
5796
5797           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5798                tail = XCDR (tail))
5799             if (max_charset_id < XFASTINT (XCAR (tail)))
5800               max_charset_id = XFASTINT (XCAR (tail));
5801           safe_charsets = make_uninit_string (max_charset_id + 1);
5802           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5803           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5804                tail = XCDR (tail))
5805             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5806           coding->max_charset_id = max_charset_id;
5807           coding->safe_charsets = SDATA (safe_charsets);
5808         }
5809       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5810       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5811     }
5812   else if (EQ (coding_type, Qshift_jis))
5813     {
5814       coding->detector = detect_coding_sjis;
5815       coding->decoder = decode_coding_sjis;
5816       coding->encoder = encode_coding_sjis;
5817       coding->common_flags
5818         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5819     }
5820   else if (EQ (coding_type, Qbig5))
5821     {
5822       coding->detector = detect_coding_big5;
5823       coding->decoder = decode_coding_big5;
5824       coding->encoder = encode_coding_big5;
5825       coding->common_flags
5826         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5827     }
5828   else                          /* EQ (coding_type, Qraw_text) */
5829     {
5830       coding->detector = NULL;
5831       coding->decoder = decode_coding_raw_text;
5832       coding->encoder = encode_coding_raw_text;
5833       if (! EQ (eol_type, Qunix))
5834         {
5835           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5836           if (! VECTORP (eol_type))
5837             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5838         }
5839
5840     }
5841
5842   return;
5843 }
5844
5845 /* Return a list of charsets supported by CODING.  */
5846
5847 Lisp_Object
5848 coding_charset_list (struct coding_system *coding)
5849 {
5850   Lisp_Object attrs, charset_list;
5851
5852   CODING_GET_INFO (coding, attrs, charset_list);
5853   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5854     {
5855       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5856
5857       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5858         charset_list = Viso_2022_charset_list;
5859     }
5860   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5861     {
5862       charset_list = Vemacs_mule_charset_list;
5863     }
5864   return charset_list;
5865 }
5866
5867
5868 /* Return a list of charsets supported by CODING-SYSTEM.  */
5869
5870 Lisp_Object
5871 coding_system_charset_list (Lisp_Object coding_system)
5872 {
5873   ptrdiff_t id;
5874   Lisp_Object attrs, charset_list;
5875
5876   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5877   attrs = CODING_ID_ATTRS (id);
5878
5879   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5880     {
5881       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5882
5883       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5884         charset_list = Viso_2022_charset_list;
5885       else
5886         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5887     }
5888   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5889     {
5890       charset_list = Vemacs_mule_charset_list;
5891     }
5892   else
5893     {
5894       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5895     }
5896   return charset_list;
5897 }
5898
5899
5900 /* Return raw-text or one of its subsidiaries that has the same
5901    eol_type as CODING-SYSTEM.  */
5902
5903 Lisp_Object
5904 raw_text_coding_system (Lisp_Object coding_system)
5905 {
5906   Lisp_Object spec, attrs;
5907   Lisp_Object eol_type, raw_text_eol_type;
5908
5909   if (NILP (coding_system))
5910     return Qraw_text;
5911   spec = CODING_SYSTEM_SPEC (coding_system);
5912   attrs = AREF (spec, 0);
5913
5914   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5915     return coding_system;
5916
5917   eol_type = AREF (spec, 2);
5918   if (VECTORP (eol_type))
5919     return Qraw_text;
5920   spec = CODING_SYSTEM_SPEC (Qraw_text);
5921   raw_text_eol_type = AREF (spec, 2);
5922   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5923           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5924           : AREF (raw_text_eol_type, 2));
5925 }
5926
5927
5928 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5929    the subsidiary that has the same eol-spec as PARENT (if it is not
5930    nil and specifies end-of-line format) or the system's setting
5931    (system_eol_type).  */
5932
5933 Lisp_Object
5934 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5935 {
5936   Lisp_Object spec, eol_type;
5937
5938   if (NILP (coding_system))
5939     coding_system = Qraw_text;
5940   spec = CODING_SYSTEM_SPEC (coding_system);
5941   eol_type = AREF (spec, 2);
5942   if (VECTORP (eol_type))
5943     {
5944       Lisp_Object parent_eol_type;
5945
5946       if (! NILP (parent))
5947         {
5948           Lisp_Object parent_spec;
5949
5950           parent_spec = CODING_SYSTEM_SPEC (parent);
5951           parent_eol_type = AREF (parent_spec, 2);
5952           if (VECTORP (parent_eol_type))
5953             parent_eol_type = system_eol_type;
5954         }
5955       else
5956         parent_eol_type = system_eol_type;
5957       if (EQ (parent_eol_type, Qunix))
5958         coding_system = AREF (eol_type, 0);
5959       else if (EQ (parent_eol_type, Qdos))
5960         coding_system = AREF (eol_type, 1);
5961       else if (EQ (parent_eol_type, Qmac))
5962         coding_system = AREF (eol_type, 2);
5963     }
5964   return coding_system;
5965 }
5966
5967
5968 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5969    decided for writing to a process.  If not, complement them, and
5970    return a new coding system.  */
5971
5972 Lisp_Object
5973 complement_process_encoding_system (Lisp_Object coding_system)
5974 {
5975   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5976   Lisp_Object spec, attrs;
5977   int i;
5978
5979   for (i = 0; i < 3; i++)
5980     {
5981       if (i == 1)
5982         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5983       else if (i == 2)
5984         coding_system = preferred_coding_system ();
5985       spec = CODING_SYSTEM_SPEC (coding_system);
5986       if (NILP (spec))
5987         continue;
5988       attrs = AREF (spec, 0);
5989       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5990         coding_base = CODING_ATTR_BASE_NAME (attrs);
5991       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5992         eol_base = coding_system;
5993       if (! NILP (coding_base) && ! NILP (eol_base))
5994         break;
5995     }
5996
5997   if (i > 0)
5998     /* The original CODING_SYSTEM didn't specify text-conversion or
5999        eol-conversion.  Be sure that we return a fully complemented
6000        coding system.  */
6001     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6002   return coding_system;
6003 }
6004
6005
6006 /* Emacs has a mechanism to automatically detect a coding system if it
6007    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6008    it's impossible to distinguish some coding systems accurately
6009    because they use the same range of codes.  So, at first, coding
6010    systems are categorized into 7, those are:
6011
6012    o coding-category-emacs-mule
6013
6014         The category for a coding system which has the same code range
6015         as Emacs' internal format.  Assigned the coding-system (Lisp
6016         symbol) `emacs-mule' by default.
6017
6018    o coding-category-sjis
6019
6020         The category for a coding system which has the same code range
6021         as SJIS.  Assigned the coding-system (Lisp
6022         symbol) `japanese-shift-jis' by default.
6023
6024    o coding-category-iso-7
6025
6026         The category for a coding system which has the same code range
6027         as ISO2022 of 7-bit environment.  This doesn't use any locking
6028         shift and single shift functions.  This can encode/decode all
6029         charsets.  Assigned the coding-system (Lisp symbol)
6030         `iso-2022-7bit' by default.
6031
6032    o coding-category-iso-7-tight
6033
6034         Same as coding-category-iso-7 except that this can
6035         encode/decode only the specified charsets.
6036
6037    o coding-category-iso-8-1
6038
6039         The category for a coding system which has the same code range
6040         as ISO2022 of 8-bit environment and graphic plane 1 used only
6041         for DIMENSION1 charset.  This doesn't use any locking shift
6042         and single shift functions.  Assigned the coding-system (Lisp
6043         symbol) `iso-latin-1' by default.
6044
6045    o coding-category-iso-8-2
6046
6047         The category for a coding system which has the same code range
6048         as ISO2022 of 8-bit environment and graphic plane 1 used only
6049         for DIMENSION2 charset.  This doesn't use any locking shift
6050         and single shift functions.  Assigned the coding-system (Lisp
6051         symbol) `japanese-iso-8bit' by default.
6052
6053    o coding-category-iso-7-else
6054
6055         The category for a coding system which has the same code range
6056         as ISO2022 of 7-bit environment but uses locking shift or
6057         single shift functions.  Assigned the coding-system (Lisp
6058         symbol) `iso-2022-7bit-lock' by default.
6059
6060    o coding-category-iso-8-else
6061
6062         The category for a coding system which has the same code range
6063         as ISO2022 of 8-bit environment but uses locking shift or
6064         single shift functions.  Assigned the coding-system (Lisp
6065         symbol) `iso-2022-8bit-ss2' by default.
6066
6067    o coding-category-big5
6068
6069         The category for a coding system which has the same code range
6070         as BIG5.  Assigned the coding-system (Lisp symbol)
6071         `cn-big5' by default.
6072
6073    o coding-category-utf-8
6074
6075         The category for a coding system which has the same code range
6076         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6077         symbol) `utf-8' by default.
6078
6079    o coding-category-utf-16-be
6080
6081         The category for a coding system in which a text has an
6082         Unicode signature (cf. Unicode Standard) in the order of BIG
6083         endian at the head.  Assigned the coding-system (Lisp symbol)
6084         `utf-16-be' by default.
6085
6086    o coding-category-utf-16-le
6087
6088         The category for a coding system in which a text has an
6089         Unicode signature (cf. Unicode Standard) in the order of
6090         LITTLE endian at the head.  Assigned the coding-system (Lisp
6091         symbol) `utf-16-le' by default.
6092
6093    o coding-category-ccl
6094
6095         The category for a coding system of which encoder/decoder is
6096         written in CCL programs.  The default value is nil, i.e., no
6097         coding system is assigned.
6098
6099    o coding-category-binary
6100
6101         The category for a coding system not categorized in any of the
6102         above.  Assigned the coding-system (Lisp symbol)
6103         `no-conversion' by default.
6104
6105    Each of them is a Lisp symbol and the value is an actual
6106    `coding-system's (this is also a Lisp symbol) assigned by a user.
6107    What Emacs does actually is to detect a category of coding system.
6108    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6109    decide only one possible category, it selects a category of the
6110    highest priority.  Priorities of categories are also specified by a
6111    user in a Lisp variable `coding-category-list'.
6112
6113 */
6114
6115 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6116                                            int eol_seen);
6117
6118
6119 /* Return the number of ASCII characters at the head of the source.
6120    By side effects, set coding->head_ascii and update
6121    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6122    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6123    reliable only when all the source bytes are ASCII.  */
6124
6125 static int
6126 check_ascii (struct coding_system *coding)
6127 {
6128   const unsigned char *src, *end;
6129   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6130   int eol_seen = coding->eol_seen;
6131
6132   coding_set_source (coding);
6133   src = coding->source;
6134   end = src + coding->src_bytes;
6135
6136   if (inhibit_eol_conversion
6137       || SYMBOLP (eol_type))
6138     {
6139       /* We don't have to check EOL format.  */
6140       while (src < end && !( *src & 0x80))
6141         {
6142           if (*src++ == '\n')
6143             eol_seen |= EOL_SEEN_LF;
6144         }
6145     }
6146   else
6147     {
6148       end--;                /* We look ahead one byte for "CR LF".  */
6149       while (src < end)
6150         {
6151           int c = *src;
6152
6153           if (c & 0x80)
6154             break;
6155           src++;
6156           if (c == '\r')
6157             {
6158               if (*src == '\n')
6159                 {
6160                   eol_seen |= EOL_SEEN_CRLF;
6161                   src++;
6162                 }
6163               else
6164                 eol_seen |= EOL_SEEN_CR;
6165             }
6166           else if (c == '\n')
6167             eol_seen |= EOL_SEEN_LF;
6168         }
6169       if (src == end)
6170         {
6171           int c = *src;
6172
6173           /* All bytes but the last one C are ASCII.  */
6174           if (! (c & 0x80))
6175             {
6176               if (c == '\r')
6177                 eol_seen |= EOL_SEEN_CR;
6178               else if (c  == '\n')
6179                 eol_seen |= EOL_SEEN_LF;
6180               src++;
6181             }
6182         }
6183     }
6184   coding->head_ascii = src - coding->source;
6185   coding->eol_seen = eol_seen;
6186   return (coding->head_ascii);
6187 }
6188
6189
6190 /* Return the number of characters at the source if all the bytes are
6191    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6192    effects, update coding->eol_seen.  The value of coding->eol_seen is
6193    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6194    the value is reliable only when all the source bytes are valid
6195    UTF-8.  */
6196
6197 static int
6198 check_utf_8 (struct coding_system *coding)
6199 {
6200   const unsigned char *src, *end;
6201   int eol_seen;
6202   int nchars = coding->head_ascii;
6203
6204   if (coding->head_ascii < 0)
6205     check_ascii (coding);
6206   else
6207     coding_set_source (coding);
6208   src = coding->source + coding->head_ascii;
6209   /* We look ahead one byte for CR LF.  */
6210   end = coding->source + coding->src_bytes - 1;
6211   eol_seen = coding->eol_seen;
6212   while (src < end)
6213     {
6214       int c = *src;
6215
6216       if (UTF_8_1_OCTET_P (*src))
6217         {
6218           src++;
6219           if (c < 0x20)
6220             {
6221               if (c == '\r')
6222                 {
6223                   if (*src == '\n')
6224                     {
6225                       eol_seen |= EOL_SEEN_CRLF;
6226                       src++;
6227                       nchars++;
6228                     }
6229                   else
6230                     eol_seen |= EOL_SEEN_CR;
6231                 }
6232               else if (c == '\n')
6233                 eol_seen |= EOL_SEEN_LF;
6234             }
6235         }
6236       else if (UTF_8_2_OCTET_LEADING_P (c))
6237         {
6238           if (c < 0xC2          /* overlong sequence */
6239               || src + 1 >= end
6240               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6241             return -1;
6242           src += 2;
6243         }
6244       else if (UTF_8_3_OCTET_LEADING_P (c))
6245         {
6246           if (src + 2 >= end
6247               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6248                     && UTF_8_EXTRA_OCTET_P (src[2])))
6249             return -1;
6250           c = (((c & 0xF) << 12)
6251                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6252           if (c < 0x800                       /* overlong sequence */
6253               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6254             return -1;
6255           src += 3;
6256         }
6257       else if (UTF_8_4_OCTET_LEADING_P (c))
6258         {
6259           if (src + 3 >= end
6260               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6261                     && UTF_8_EXTRA_OCTET_P (src[2])
6262                     && UTF_8_EXTRA_OCTET_P (src[3])))
6263             return -1;
6264           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6265                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6266           if (c < 0x10000       /* overlong sequence */
6267               || c >= 0x110000) /* non-Unicode character  */
6268             return -1;
6269           src += 4;
6270         }
6271       else
6272         return -1;
6273       nchars++;
6274     }
6275
6276   if (src == end)
6277     {
6278       if (! UTF_8_1_OCTET_P (*src))
6279         return -1;
6280       nchars++;
6281       if (*src == '\r')
6282         eol_seen |= EOL_SEEN_CR;
6283       else if (*src  == '\n')
6284         eol_seen |= EOL_SEEN_LF;
6285     }
6286   coding->eol_seen = eol_seen;
6287   return nchars;
6288 }
6289
6290
6291 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6292    SOURCE is encoded.  If CATEGORY is one of
6293    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6294    two-byte, else they are encoded by one-byte.
6295
6296    Return one of EOL_SEEN_XXX.  */
6297
6298 #define MAX_EOL_CHECK_COUNT 3
6299
6300 static int
6301 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6302             enum coding_category category)
6303 {
6304   const unsigned char *src = source, *src_end = src + src_bytes;
6305   unsigned char c;
6306   int total  = 0;
6307   int eol_seen = EOL_SEEN_NONE;
6308
6309   if ((1 << category) & CATEGORY_MASK_UTF_16)
6310     {
6311       bool msb = category == (coding_category_utf_16_le
6312                               | coding_category_utf_16_le_nosig);
6313       bool lsb = !msb;
6314
6315       while (src + 1 < src_end)
6316         {
6317           c = src[lsb];
6318           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6319             {
6320               int this_eol;
6321
6322               if (c == '\n')
6323                 this_eol = EOL_SEEN_LF;
6324               else if (src + 3 >= src_end
6325                        || src[msb + 2] != 0
6326                        || src[lsb + 2] != '\n')
6327                 this_eol = EOL_SEEN_CR;
6328               else
6329                 {
6330                   this_eol = EOL_SEEN_CRLF;
6331                   src += 2;
6332                 }
6333
6334               if (eol_seen == EOL_SEEN_NONE)
6335                 /* This is the first end-of-line.  */
6336                 eol_seen = this_eol;
6337               else if (eol_seen != this_eol)
6338                 {
6339                   /* The found type is different from what found before.
6340                      Allow for stray ^M characters in DOS EOL files.  */
6341                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6342                       || (eol_seen == EOL_SEEN_CRLF
6343                           && this_eol == EOL_SEEN_CR))
6344                     eol_seen = EOL_SEEN_CRLF;
6345                   else
6346                     {
6347                       eol_seen = EOL_SEEN_LF;
6348                       break;
6349                     }
6350                 }
6351               if (++total == MAX_EOL_CHECK_COUNT)
6352                 break;
6353             }
6354           src += 2;
6355         }
6356     }
6357   else
6358     while (src < src_end)
6359       {
6360         c = *src++;
6361         if (c == '\n' || c == '\r')
6362           {
6363             int this_eol;
6364
6365             if (c == '\n')
6366               this_eol = EOL_SEEN_LF;
6367             else if (src >= src_end || *src != '\n')
6368               this_eol = EOL_SEEN_CR;
6369             else
6370               this_eol = EOL_SEEN_CRLF, src++;
6371
6372             if (eol_seen == EOL_SEEN_NONE)
6373               /* This is the first end-of-line.  */
6374               eol_seen = this_eol;
6375             else if (eol_seen != this_eol)
6376               {
6377                 /* The found type is different from what found before.
6378                    Allow for stray ^M characters in DOS EOL files.  */
6379                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6380                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6381                   eol_seen = EOL_SEEN_CRLF;
6382                 else
6383                   {
6384                     eol_seen = EOL_SEEN_LF;
6385                     break;
6386                   }
6387               }
6388             if (++total == MAX_EOL_CHECK_COUNT)
6389               break;
6390           }
6391       }
6392   return eol_seen;
6393 }
6394
6395
6396 static Lisp_Object
6397 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6398 {
6399   Lisp_Object eol_type;
6400
6401   eol_type = CODING_ID_EOL_TYPE (coding->id);
6402   if (! VECTORP (eol_type))
6403     /* Already adjusted.  */
6404     return eol_type;
6405   if (eol_seen & EOL_SEEN_LF)
6406     {
6407       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6408       eol_type = Qunix;
6409     }
6410   else if (eol_seen & EOL_SEEN_CRLF)
6411     {
6412       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6413       eol_type = Qdos;
6414     }
6415   else if (eol_seen & EOL_SEEN_CR)
6416     {
6417       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6418       eol_type = Qmac;
6419     }
6420   return eol_type;
6421 }
6422
6423 /* Detect how a text specified in CODING is encoded.  If a coding
6424    system is detected, update fields of CODING by the detected coding
6425    system.  */
6426
6427 static void
6428 detect_coding (struct coding_system *coding)
6429 {
6430   const unsigned char *src, *src_end;
6431   unsigned int saved_mode = coding->mode;
6432   Lisp_Object found = Qnil;
6433   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6434
6435   coding->consumed = coding->consumed_char = 0;
6436   coding->produced = coding->produced_char = 0;
6437   coding_set_source (coding);
6438
6439   src_end = coding->source + coding->src_bytes;
6440
6441   coding->eol_seen = EOL_SEEN_NONE;
6442   /* If we have not yet decided the text encoding type, detect it
6443      now.  */
6444   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6445     {
6446       int c, i;
6447       struct coding_detection_info detect_info;
6448       bool null_byte_found = 0, eight_bit_found = 0;
6449
6450       coding->head_ascii = 0;
6451       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6452       for (src = coding->source; src < src_end; src++)
6453         {
6454           c = *src;
6455           if (c & 0x80)
6456             {
6457               eight_bit_found = 1;
6458               if (null_byte_found)
6459                 break;
6460             }
6461           else if (c < 0x20)
6462             {
6463               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6464                   && ! inhibit_iso_escape_detection
6465                   && ! detect_info.checked)
6466                 {
6467                   if (detect_coding_iso_2022 (coding, &detect_info))
6468                     {
6469                       /* We have scanned the whole data.  */
6470                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6471                         {
6472                           /* We didn't find an 8-bit code.  We may
6473                              have found a null-byte, but it's very
6474                              rare that a binary file conforms to
6475                              ISO-2022.  */
6476                           src = src_end;
6477                           coding->head_ascii = src - coding->source;
6478                         }
6479                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6480                       break;
6481                     }
6482                 }
6483               else if (! c && !inhibit_null_byte_detection)
6484                 {
6485                   null_byte_found = 1;
6486                   if (eight_bit_found)
6487                     break;
6488                 }
6489               else if (! disable_ascii_optimization
6490                        && ! inhibit_eol_conversion)
6491                 {
6492                   if (c == '\r')
6493                     {
6494                       if (src < src_end && src[1] == '\n')
6495                         {
6496                           coding->eol_seen |= EOL_SEEN_CRLF;
6497                           src++;
6498                           if (! eight_bit_found)
6499                             coding->head_ascii++;
6500                         }
6501                       else
6502                         coding->eol_seen |= EOL_SEEN_CR;
6503                     }
6504                   else if (c == '\n')
6505                     {
6506                       coding->eol_seen |= EOL_SEEN_LF;
6507                     }
6508                 }
6509
6510               if (! eight_bit_found)
6511                 coding->head_ascii++;
6512             }
6513           else if (! eight_bit_found)
6514             coding->head_ascii++;
6515         }
6516
6517       if (null_byte_found || eight_bit_found
6518           || coding->head_ascii < coding->src_bytes
6519           || detect_info.found)
6520         {
6521           enum coding_category category;
6522           struct coding_system *this;
6523
6524           if (coding->head_ascii == coding->src_bytes)
6525             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6526             for (i = 0; i < coding_category_raw_text; i++)
6527               {
6528                 category = coding_priorities[i];
6529                 this = coding_categories + category;
6530                 if (detect_info.found & (1 << category))
6531                   break;
6532               }
6533           else
6534             {
6535               if (null_byte_found)
6536                 {
6537                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6538                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6539                 }
6540               for (i = 0; i < coding_category_raw_text; i++)
6541                 {
6542                   category = coding_priorities[i];
6543                   this = coding_categories + category;
6544                   /* Some of this->detector (e.g. detect_coding_sjis)
6545                      require this information.  */
6546                   coding->id = this->id;
6547                   if (this->id < 0)
6548                     {
6549                       /* No coding system of this category is defined.  */
6550                       detect_info.rejected |= (1 << category);
6551                     }
6552                   else if (category >= coding_category_raw_text)
6553                     continue;
6554                   else if (detect_info.checked & (1 << category))
6555                     {
6556                       if (detect_info.found & (1 << category))
6557                         break;
6558                     }
6559                   else if ((*(this->detector)) (coding, &detect_info)
6560                            && detect_info.found & (1 << category))
6561                     break;
6562                 }
6563             }
6564
6565           if (i < coding_category_raw_text)
6566             {
6567               if (category == coding_category_utf_8_auto)
6568                 {
6569                   Lisp_Object coding_systems;
6570
6571                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6572                                          coding_attr_utf_bom);
6573                   if (CONSP (coding_systems))
6574                     {
6575                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6576                         found = XCAR (coding_systems);
6577                       else
6578                         found = XCDR (coding_systems);
6579                     }
6580                   else
6581                     found = CODING_ID_NAME (this->id);
6582                 }
6583               else if (category == coding_category_utf_16_auto)
6584                 {
6585                   Lisp_Object coding_systems;
6586
6587                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6588                                          coding_attr_utf_bom);
6589                   if (CONSP (coding_systems))
6590                     {
6591                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6592                         found = XCAR (coding_systems);
6593                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6594                         found = XCDR (coding_systems);
6595                     }
6596                   else
6597                     found = CODING_ID_NAME (this->id);
6598                 }
6599               else
6600                 found = CODING_ID_NAME (this->id);
6601             }
6602           else if (null_byte_found)
6603             found = Qno_conversion;
6604           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6605                    == CATEGORY_MASK_ANY)
6606             found = Qraw_text;
6607           else if (detect_info.rejected)
6608             for (i = 0; i < coding_category_raw_text; i++)
6609               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6610                 {
6611                   this = coding_categories + coding_priorities[i];
6612                   found = CODING_ID_NAME (this->id);
6613                   break;
6614                 }
6615         }
6616     }
6617   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6618            == coding_category_utf_8_auto)
6619     {
6620       Lisp_Object coding_systems;
6621       struct coding_detection_info detect_info;
6622
6623       coding_systems
6624         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6625       detect_info.found = detect_info.rejected = 0;
6626       if (check_ascii (coding) == coding->src_bytes)
6627         {
6628           if (CONSP (coding_systems))
6629             found = XCDR (coding_systems);
6630         }
6631       else
6632         {
6633           if (CONSP (coding_systems)
6634               && detect_coding_utf_8 (coding, &detect_info))
6635             {
6636               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6637                 found = XCAR (coding_systems);
6638               else
6639                 found = XCDR (coding_systems);
6640             }
6641         }
6642     }
6643   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6644            == coding_category_utf_16_auto)
6645     {
6646       Lisp_Object coding_systems;
6647       struct coding_detection_info detect_info;
6648
6649       coding_systems
6650         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6651       detect_info.found = detect_info.rejected = 0;
6652       coding->head_ascii = 0;
6653       if (CONSP (coding_systems)
6654           && detect_coding_utf_16 (coding, &detect_info))
6655         {
6656           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6657             found = XCAR (coding_systems);
6658           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6659             found = XCDR (coding_systems);
6660         }
6661     }
6662
6663   if (! NILP (found))
6664     {
6665       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6666                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6667                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6668                            : EOL_SEEN_LF);
6669
6670       setup_coding_system (found, coding);
6671       if (specified_eol != EOL_SEEN_NONE)
6672         adjust_coding_eol_type (coding, specified_eol);
6673     }
6674
6675   coding->mode = saved_mode;
6676 }
6677
6678
6679 static void
6680 decode_eol (struct coding_system *coding)
6681 {
6682   Lisp_Object eol_type;
6683   unsigned char *p, *pbeg, *pend;
6684
6685   eol_type = CODING_ID_EOL_TYPE (coding->id);
6686   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6687     return;
6688
6689   if (NILP (coding->dst_object))
6690     pbeg = coding->destination;
6691   else
6692     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6693   pend = pbeg + coding->produced;
6694
6695   if (VECTORP (eol_type))
6696     {
6697       int eol_seen = EOL_SEEN_NONE;
6698
6699       for (p = pbeg; p < pend; p++)
6700         {
6701           if (*p == '\n')
6702             eol_seen |= EOL_SEEN_LF;
6703           else if (*p == '\r')
6704             {
6705               if (p + 1 < pend && *(p + 1) == '\n')
6706                 {
6707                   eol_seen |= EOL_SEEN_CRLF;
6708                   p++;
6709                 }
6710               else
6711                 eol_seen |= EOL_SEEN_CR;
6712             }
6713         }
6714       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6715       if ((eol_seen & EOL_SEEN_CRLF) != 0
6716           && (eol_seen & EOL_SEEN_CR) != 0
6717           && (eol_seen & EOL_SEEN_LF) == 0)
6718         eol_seen = EOL_SEEN_CRLF;
6719       else if (eol_seen != EOL_SEEN_NONE
6720           && eol_seen != EOL_SEEN_LF
6721           && eol_seen != EOL_SEEN_CRLF
6722           && eol_seen != EOL_SEEN_CR)
6723         eol_seen = EOL_SEEN_LF;
6724       if (eol_seen != EOL_SEEN_NONE)
6725         eol_type = adjust_coding_eol_type (coding, eol_seen);
6726     }
6727
6728   if (EQ (eol_type, Qmac))
6729     {
6730       for (p = pbeg; p < pend; p++)
6731         if (*p == '\r')
6732           *p = '\n';
6733     }
6734   else if (EQ (eol_type, Qdos))
6735     {
6736       ptrdiff_t n = 0;
6737
6738       if (NILP (coding->dst_object))
6739         {
6740           /* Start deleting '\r' from the tail to minimize the memory
6741              movement.  */
6742           for (p = pend - 2; p >= pbeg; p--)
6743             if (*p == '\r')
6744               {
6745                 memmove (p, p + 1, pend-- - p - 1);
6746                 n++;
6747               }
6748         }
6749       else
6750         {
6751           ptrdiff_t pos_byte = coding->dst_pos_byte;
6752           ptrdiff_t pos = coding->dst_pos;
6753           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6754
6755           while (pos < pos_end)
6756             {
6757               p = BYTE_POS_ADDR (pos_byte);
6758               if (*p == '\r' && p[1] == '\n')
6759                 {
6760                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6761                   n++;
6762                   pos_end--;
6763                 }
6764               pos++;
6765               if (coding->dst_multibyte)
6766                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6767               else
6768                 pos_byte++;
6769             }
6770         }
6771       coding->produced -= n;
6772       coding->produced_char -= n;
6773     }
6774 }
6775
6776
6777 /* Return a translation table (or list of them) from coding system
6778    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6779    not ENCODEP). */
6780
6781 static Lisp_Object
6782 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6783 {
6784   Lisp_Object standard, translation_table;
6785   Lisp_Object val;
6786
6787   if (NILP (Venable_character_translation))
6788     {
6789       if (max_lookup)
6790         *max_lookup = 0;
6791       return Qnil;
6792     }
6793   if (encodep)
6794     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6795       standard = Vstandard_translation_table_for_encode;
6796   else
6797     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6798       standard = Vstandard_translation_table_for_decode;
6799   if (NILP (translation_table))
6800     translation_table = standard;
6801   else
6802     {
6803       if (SYMBOLP (translation_table))
6804         translation_table = Fget (translation_table, Qtranslation_table);
6805       else if (CONSP (translation_table))
6806         {
6807           translation_table = Fcopy_sequence (translation_table);
6808           for (val = translation_table; CONSP (val); val = XCDR (val))
6809             if (SYMBOLP (XCAR (val)))
6810               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6811         }
6812       if (CHAR_TABLE_P (standard))
6813         {
6814           if (CONSP (translation_table))
6815             translation_table = nconc2 (translation_table,
6816                                         Fcons (standard, Qnil));
6817           else
6818             translation_table = Fcons (translation_table,
6819                                        Fcons (standard, Qnil));
6820         }
6821     }
6822
6823   if (max_lookup)
6824     {
6825       *max_lookup = 1;
6826       if (CHAR_TABLE_P (translation_table)
6827           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6828         {
6829           val = XCHAR_TABLE (translation_table)->extras[1];
6830           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6831             *max_lookup = XFASTINT (val);
6832         }
6833       else if (CONSP (translation_table))
6834         {
6835           Lisp_Object tail;
6836
6837           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6838             if (CHAR_TABLE_P (XCAR (tail))
6839                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6840               {
6841                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6842                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6843                   *max_lookup = XFASTINT (tailval);
6844               }
6845         }
6846     }
6847   return translation_table;
6848 }
6849
6850 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6851   do {                                                          \
6852     trans = Qnil;                                               \
6853     if (CHAR_TABLE_P (table))                                   \
6854       {                                                         \
6855         trans = CHAR_TABLE_REF (table, c);                      \
6856         if (CHARACTERP (trans))                                 \
6857           c = XFASTINT (trans), trans = Qnil;                   \
6858       }                                                         \
6859     else if (CONSP (table))                                     \
6860       {                                                         \
6861         Lisp_Object tail;                                       \
6862                                                                 \
6863         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6864           if (CHAR_TABLE_P (XCAR (tail)))                       \
6865             {                                                   \
6866               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6867               if (CHARACTERP (trans))                           \
6868                 c = XFASTINT (trans), trans = Qnil;             \
6869               else if (! NILP (trans))                          \
6870                 break;                                          \
6871             }                                                   \
6872       }                                                         \
6873   } while (0)
6874
6875
6876 /* Return a translation of character(s) at BUF according to TRANS.
6877    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6878    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6879    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6880    translation is found, and Qnil if not found..
6881    If BUF is too short to lookup characters in FROM, return Qt.  */
6882
6883 static Lisp_Object
6884 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6885 {
6886
6887   if (INTEGERP (trans))
6888     return trans;
6889   for (; CONSP (trans); trans = XCDR (trans))
6890     {
6891       Lisp_Object val = XCAR (trans);
6892       Lisp_Object from = XCAR (val);
6893       ptrdiff_t len = ASIZE (from);
6894       ptrdiff_t i;
6895
6896       for (i = 0; i < len; i++)
6897         {
6898           if (buf + i == buf_end)
6899             return Qt;
6900           if (XINT (AREF (from, i)) != buf[i])
6901             break;
6902         }
6903       if (i == len)
6904         return val;
6905     }
6906   return Qnil;
6907 }
6908
6909
6910 static int
6911 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6912                bool last_block)
6913 {
6914   unsigned char *dst = coding->destination + coding->produced;
6915   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6916   ptrdiff_t produced;
6917   ptrdiff_t produced_chars = 0;
6918   int carryover = 0;
6919
6920   if (! coding->chars_at_source)
6921     {
6922       /* Source characters are in coding->charbuf.  */
6923       int *buf = coding->charbuf;
6924       int *buf_end = buf + coding->charbuf_used;
6925
6926       if (EQ (coding->src_object, coding->dst_object))
6927         {
6928           coding_set_source (coding);
6929           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6930         }
6931
6932       while (buf < buf_end)
6933         {
6934           int c = *buf;
6935           ptrdiff_t i;
6936
6937           if (c >= 0)
6938             {
6939               ptrdiff_t from_nchars = 1, to_nchars = 1;
6940               Lisp_Object trans = Qnil;
6941
6942               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6943               if (! NILP (trans))
6944                 {
6945                   trans = get_translation (trans, buf, buf_end);
6946                   if (INTEGERP (trans))
6947                     c = XINT (trans);
6948                   else if (CONSP (trans))
6949                     {
6950                       from_nchars = ASIZE (XCAR (trans));
6951                       trans = XCDR (trans);
6952                       if (INTEGERP (trans))
6953                         c = XINT (trans);
6954                       else
6955                         {
6956                           to_nchars = ASIZE (trans);
6957                           c = XINT (AREF (trans, 0));
6958                         }
6959                     }
6960                   else if (EQ (trans, Qt) && ! last_block)
6961                     break;
6962                 }
6963
6964               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6965                 {
6966                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6967                        / MAX_MULTIBYTE_LENGTH)
6968                       < to_nchars)
6969                     memory_full (SIZE_MAX);
6970                   dst = alloc_destination (coding,
6971                                            buf_end - buf
6972                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6973                                            dst);
6974                   if (EQ (coding->src_object, coding->dst_object))
6975                     {
6976                       coding_set_source (coding);
6977                       dst_end = (((unsigned char *) coding->source)
6978                                  + coding->consumed);
6979                     }
6980                   else
6981                     dst_end = coding->destination + coding->dst_bytes;
6982                 }
6983
6984               for (i = 0; i < to_nchars; i++)
6985                 {
6986                   if (i > 0)
6987                     c = XINT (AREF (trans, i));
6988                   if (coding->dst_multibyte
6989                       || ! CHAR_BYTE8_P (c))
6990                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6991                   else
6992                     *dst++ = CHAR_TO_BYTE8 (c);
6993                 }
6994               produced_chars += to_nchars;
6995               buf += from_nchars;
6996             }
6997           else
6998             /* This is an annotation datum.  (-C) is the length.  */
6999             buf += -c;
7000         }
7001       carryover = buf_end - buf;
7002     }
7003   else
7004     {
7005       /* Source characters are at coding->source.  */
7006       const unsigned char *src = coding->source;
7007       const unsigned char *src_end = src + coding->consumed;
7008
7009       if (EQ (coding->dst_object, coding->src_object))
7010         dst_end = (unsigned char *) src;
7011       if (coding->src_multibyte != coding->dst_multibyte)
7012         {
7013           if (coding->src_multibyte)
7014             {
7015               bool multibytep = 1;
7016               ptrdiff_t consumed_chars = 0;
7017
7018               while (1)
7019                 {
7020                   const unsigned char *src_base = src;
7021                   int c;
7022
7023                   ONE_MORE_BYTE (c);
7024                   if (dst == dst_end)
7025                     {
7026                       if (EQ (coding->src_object, coding->dst_object))
7027                         dst_end = (unsigned char *) src;
7028                       if (dst == dst_end)
7029                         {
7030                           ptrdiff_t offset = src - coding->source;
7031
7032                           dst = alloc_destination (coding, src_end - src + 1,
7033                                                    dst);
7034                           dst_end = coding->destination + coding->dst_bytes;
7035                           coding_set_source (coding);
7036                           src = coding->source + offset;
7037                           src_end = coding->source + coding->consumed;
7038                           if (EQ (coding->src_object, coding->dst_object))
7039                             dst_end = (unsigned char *) src;
7040                         }
7041                     }
7042                   *dst++ = c;
7043                   produced_chars++;
7044                 }
7045             no_more_source:
7046               ;
7047             }
7048           else
7049             while (src < src_end)
7050               {
7051                 bool multibytep = 1;
7052                 int c = *src++;
7053
7054                 if (dst >= dst_end - 1)
7055                   {
7056                     if (EQ (coding->src_object, coding->dst_object))
7057                       dst_end = (unsigned char *) src;
7058                     if (dst >= dst_end - 1)
7059                       {
7060                         ptrdiff_t offset = src - coding->source;
7061                         ptrdiff_t more_bytes;
7062
7063                         if (EQ (coding->src_object, coding->dst_object))
7064                           more_bytes = ((src_end - src) / 2) + 2;
7065                         else
7066                           more_bytes = src_end - src + 2;
7067                         dst = alloc_destination (coding, more_bytes, dst);
7068                         dst_end = coding->destination + coding->dst_bytes;
7069                         coding_set_source (coding);
7070                         src = coding->source + offset;
7071                         src_end = coding->source + coding->consumed;
7072                         if (EQ (coding->src_object, coding->dst_object))
7073                           dst_end = (unsigned char *) src;
7074                       }
7075                   }
7076                 EMIT_ONE_BYTE (c);
7077               }
7078         }
7079       else
7080         {
7081           if (!EQ (coding->src_object, coding->dst_object))
7082             {
7083               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7084
7085               if (require > 0)
7086                 {
7087                   ptrdiff_t offset = src - coding->source;
7088
7089                   dst = alloc_destination (coding, require, dst);
7090                   coding_set_source (coding);
7091                   src = coding->source + offset;
7092                   src_end = coding->source + coding->consumed;
7093                 }
7094             }
7095           produced_chars = coding->consumed_char;
7096           while (src < src_end)
7097             *dst++ = *src++;
7098         }
7099     }
7100
7101   produced = dst - (coding->destination + coding->produced);
7102   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7103     insert_from_gap (produced_chars, produced, 0);
7104   coding->produced += produced;
7105   coding->produced_char += produced_chars;
7106   return carryover;
7107 }
7108
7109 /* Compose text in CODING->object according to the annotation data at
7110    CHARBUF.  CHARBUF is an array:
7111      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7112  */
7113
7114 static void
7115 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7116 {
7117   int len;
7118   ptrdiff_t to;
7119   enum composition_method method;
7120   Lisp_Object components;
7121
7122   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7123   to = pos + charbuf[2];
7124   method = (enum composition_method) (charbuf[4]);
7125
7126   if (method == COMPOSITION_RELATIVE)
7127     components = Qnil;
7128   else
7129     {
7130       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7131       int i, j;
7132
7133       if (method == COMPOSITION_WITH_RULE)
7134         len = charbuf[2] * 3 - 2;
7135       charbuf += MAX_ANNOTATION_LENGTH;
7136       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7137       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7138         {
7139           if (charbuf[i] >= 0)
7140             args[j] = make_number (charbuf[i]);
7141           else
7142             {
7143               i++;
7144               args[j] = make_number (charbuf[i] % 0x100);
7145             }
7146         }
7147       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7148     }
7149   compose_text (pos, to, components, Qnil, coding->dst_object);
7150 }
7151
7152
7153 /* Put `charset' property on text in CODING->object according to
7154    the annotation data at CHARBUF.  CHARBUF is an array:
7155      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7156  */
7157
7158 static void
7159 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7160 {
7161   ptrdiff_t from = pos - charbuf[2];
7162   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7163
7164   Fput_text_property (make_number (from), make_number (pos),
7165                       Qcharset, CHARSET_NAME (charset),
7166                       coding->dst_object);
7167 }
7168
7169
7170 #define CHARBUF_SIZE 0x4000
7171
7172 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7173   do {                                                                  \
7174     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
7175     coding->charbuf_size = CHARBUF_SIZE;                                \
7176   } while (0)
7177
7178
7179 static void
7180 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7181 {
7182   int *charbuf = coding->charbuf;
7183   int *charbuf_end = charbuf + coding->charbuf_used;
7184
7185   if (NILP (coding->dst_object))
7186     return;
7187
7188   while (charbuf < charbuf_end)
7189     {
7190       if (*charbuf >= 0)
7191         pos++, charbuf++;
7192       else
7193         {
7194           int len = -*charbuf;
7195
7196           if (len > 2)
7197             switch (charbuf[1])
7198               {
7199               case CODING_ANNOTATE_COMPOSITION_MASK:
7200                 produce_composition (coding, charbuf, pos);
7201                 break;
7202               case CODING_ANNOTATE_CHARSET_MASK:
7203                 produce_charset (coding, charbuf, pos);
7204                 break;
7205               }
7206           charbuf += len;
7207         }
7208     }
7209 }
7210
7211 /* Decode the data at CODING->src_object into CODING->dst_object.
7212    CODING->src_object is a buffer, a string, or nil.
7213    CODING->dst_object is a buffer.
7214
7215    If CODING->src_object is a buffer, it must be the current buffer.
7216    In this case, if CODING->src_pos is positive, it is a position of
7217    the source text in the buffer, otherwise, the source text is in the
7218    gap area of the buffer, and CODING->src_pos specifies the offset of
7219    the text from GPT (which must be the same as PT).  If this is the
7220    same buffer as CODING->dst_object, CODING->src_pos must be
7221    negative.
7222
7223    If CODING->src_object is a string, CODING->src_pos is an index to
7224    that string.
7225
7226    If CODING->src_object is nil, CODING->source must already point to
7227    the non-relocatable memory area.  In this case, CODING->src_pos is
7228    an offset from CODING->source.
7229
7230    The decoded data is inserted at the current point of the buffer
7231    CODING->dst_object.
7232 */
7233
7234 static void
7235 decode_coding (struct coding_system *coding)
7236 {
7237   Lisp_Object attrs;
7238   Lisp_Object undo_list;
7239   Lisp_Object translation_table;
7240   struct ccl_spec cclspec;
7241   int carryover;
7242   int i;
7243
7244   USE_SAFE_ALLOCA;
7245
7246   if (BUFFERP (coding->src_object)
7247       && coding->src_pos > 0
7248       && coding->src_pos < GPT
7249       && coding->src_pos + coding->src_chars > GPT)
7250     move_gap_both (coding->src_pos, coding->src_pos_byte);
7251
7252   undo_list = Qt;
7253   if (BUFFERP (coding->dst_object))
7254     {
7255       set_buffer_internal (XBUFFER (coding->dst_object));
7256       if (GPT != PT)
7257         move_gap_both (PT, PT_BYTE);
7258
7259       /* We must disable undo_list in order to record the whole insert
7260          transaction via record_insert at the end.  But doing so also
7261          disables the recording of the first change to the undo_list.
7262          Therefore we check for first change here and record it via
7263          record_first_change if needed.  */
7264       if (MODIFF <= SAVE_MODIFF)
7265         record_first_change ();
7266
7267       undo_list = BVAR (current_buffer, undo_list);
7268       bset_undo_list (current_buffer, Qt);
7269     }
7270
7271   coding->consumed = coding->consumed_char = 0;
7272   coding->produced = coding->produced_char = 0;
7273   coding->chars_at_source = 0;
7274   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7275   coding->errors = 0;
7276
7277   ALLOC_CONVERSION_WORK_AREA (coding);
7278
7279   attrs = CODING_ID_ATTRS (coding->id);
7280   translation_table = get_translation_table (attrs, 0, NULL);
7281
7282   carryover = 0;
7283   if (coding->decoder == decode_coding_ccl)
7284     {
7285       coding->spec.ccl = &cclspec;
7286       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7287     }
7288   do
7289     {
7290       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7291
7292       coding_set_source (coding);
7293       coding->annotated = 0;
7294       coding->charbuf_used = carryover;
7295       (*(coding->decoder)) (coding);
7296       coding_set_destination (coding);
7297       carryover = produce_chars (coding, translation_table, 0);
7298       if (coding->annotated)
7299         produce_annotation (coding, pos);
7300       for (i = 0; i < carryover; i++)
7301         coding->charbuf[i]
7302           = coding->charbuf[coding->charbuf_used - carryover + i];
7303     }
7304   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7305          || (coding->consumed < coding->src_bytes
7306              && (coding->result == CODING_RESULT_SUCCESS
7307                  || coding->result == CODING_RESULT_INVALID_SRC)));
7308
7309   if (carryover > 0)
7310     {
7311       coding_set_destination (coding);
7312       coding->charbuf_used = carryover;
7313       produce_chars (coding, translation_table, 1);
7314     }
7315
7316   coding->carryover_bytes = 0;
7317   if (coding->consumed < coding->src_bytes)
7318     {
7319       int nbytes = coding->src_bytes - coding->consumed;
7320       const unsigned char *src;
7321
7322       coding_set_source (coding);
7323       coding_set_destination (coding);
7324       src = coding->source + coding->consumed;
7325
7326       if (coding->mode & CODING_MODE_LAST_BLOCK)
7327         {
7328           /* Flush out unprocessed data as binary chars.  We are sure
7329              that the number of data is less than the size of
7330              coding->charbuf.  */
7331           coding->charbuf_used = 0;
7332           coding->chars_at_source = 0;
7333
7334           while (nbytes-- > 0)
7335             {
7336               int c = *src++;
7337
7338               if (c & 0x80)
7339                 c = BYTE8_TO_CHAR (c);
7340               coding->charbuf[coding->charbuf_used++] = c;
7341             }
7342           produce_chars (coding, Qnil, 1);
7343         }
7344       else
7345         {
7346           /* Record unprocessed bytes in coding->carryover.  We are
7347              sure that the number of data is less than the size of
7348              coding->carryover.  */
7349           unsigned char *p = coding->carryover;
7350
7351           if (nbytes > sizeof coding->carryover)
7352             nbytes = sizeof coding->carryover;
7353           coding->carryover_bytes = nbytes;
7354           while (nbytes-- > 0)
7355             *p++ = *src++;
7356         }
7357       coding->consumed = coding->src_bytes;
7358     }
7359
7360   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7361       && !inhibit_eol_conversion)
7362     decode_eol (coding);
7363   if (BUFFERP (coding->dst_object))
7364     {
7365       bset_undo_list (current_buffer, undo_list);
7366       record_insert (coding->dst_pos, coding->produced_char);
7367     }
7368
7369   SAFE_FREE ();
7370 }
7371
7372
7373 /* Extract an annotation datum from a composition starting at POS and
7374    ending before LIMIT of CODING->src_object (buffer or string), store
7375    the data in BUF, set *STOP to a starting position of the next
7376    composition (if any) or to LIMIT, and return the address of the
7377    next element of BUF.
7378
7379    If such an annotation is not found, set *STOP to a starting
7380    position of a composition after POS (if any) or to LIMIT, and
7381    return BUF.  */
7382
7383 static int *
7384 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7385                                struct coding_system *coding, int *buf,
7386                                ptrdiff_t *stop)
7387 {
7388   ptrdiff_t start, end;
7389   Lisp_Object prop;
7390
7391   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7392       || end > limit)
7393     *stop = limit;
7394   else if (start > pos)
7395     *stop = start;
7396   else
7397     {
7398       if (start == pos)
7399         {
7400           /* We found a composition.  Store the corresponding
7401              annotation data in BUF.  */
7402           int *head = buf;
7403           enum composition_method method = COMPOSITION_METHOD (prop);
7404           int nchars = COMPOSITION_LENGTH (prop);
7405
7406           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7407           if (method != COMPOSITION_RELATIVE)
7408             {
7409               Lisp_Object components;
7410               ptrdiff_t i, len, i_byte;
7411
7412               components = COMPOSITION_COMPONENTS (prop);
7413               if (VECTORP (components))
7414                 {
7415                   len = ASIZE (components);
7416                   for (i = 0; i < len; i++)
7417                     *buf++ = XINT (AREF (components, i));
7418                 }
7419               else if (STRINGP (components))
7420                 {
7421                   len = SCHARS (components);
7422                   i = i_byte = 0;
7423                   while (i < len)
7424                     {
7425                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7426                       buf++;
7427                     }
7428                 }
7429               else if (INTEGERP (components))
7430                 {
7431                   len = 1;
7432                   *buf++ = XINT (components);
7433                 }
7434               else if (CONSP (components))
7435                 {
7436                   for (len = 0; CONSP (components);
7437                        len++, components = XCDR (components))
7438                     *buf++ = XINT (XCAR (components));
7439                 }
7440               else
7441                 emacs_abort ();
7442               *head -= len;
7443             }
7444         }
7445
7446       if (find_composition (end, limit, &start, &end, &prop,
7447                             coding->src_object)
7448           && end <= limit)
7449         *stop = start;
7450       else
7451         *stop = limit;
7452     }
7453   return buf;
7454 }
7455
7456
7457 /* Extract an annotation datum from a text property `charset' at POS of
7458    CODING->src_object (buffer of string), store the data in BUF, set
7459    *STOP to the position where the value of `charset' property changes
7460    (limiting by LIMIT), and return the address of the next element of
7461    BUF.
7462
7463    If the property value is nil, set *STOP to the position where the
7464    property value is non-nil (limiting by LIMIT), and return BUF.  */
7465
7466 static int *
7467 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7468                            struct coding_system *coding, int *buf,
7469                            ptrdiff_t *stop)
7470 {
7471   Lisp_Object val, next;
7472   int id;
7473
7474   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7475   if (! NILP (val) && CHARSETP (val))
7476     id = XINT (CHARSET_SYMBOL_ID (val));
7477   else
7478     id = -1;
7479   ADD_CHARSET_DATA (buf, 0, id);
7480   next = Fnext_single_property_change (make_number (pos), Qcharset,
7481                                        coding->src_object,
7482                                        make_number (limit));
7483   *stop = XINT (next);
7484   return buf;
7485 }
7486
7487
7488 static void
7489 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7490                int max_lookup)
7491 {
7492   int *buf = coding->charbuf;
7493   int *buf_end = coding->charbuf + coding->charbuf_size;
7494   const unsigned char *src = coding->source + coding->consumed;
7495   const unsigned char *src_end = coding->source + coding->src_bytes;
7496   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7497   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7498   bool multibytep = coding->src_multibyte;
7499   Lisp_Object eol_type;
7500   int c;
7501   ptrdiff_t stop, stop_composition, stop_charset;
7502   int *lookup_buf = NULL;
7503
7504   if (! NILP (translation_table))
7505     lookup_buf = alloca (sizeof (int) * max_lookup);
7506
7507   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7508   if (VECTORP (eol_type))
7509     eol_type = Qunix;
7510
7511   /* Note: composition handling is not yet implemented.  */
7512   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7513
7514   if (NILP (coding->src_object))
7515     stop = stop_composition = stop_charset = end_pos;
7516   else
7517     {
7518       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7519         stop = stop_composition = pos;
7520       else
7521         stop = stop_composition = end_pos;
7522       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7523         stop = stop_charset = pos;
7524       else
7525         stop_charset = end_pos;
7526     }
7527
7528   /* Compensate for CRLF and conversion.  */
7529   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7530   while (buf < buf_end)
7531     {
7532       Lisp_Object trans;
7533
7534       if (pos == stop)
7535         {
7536           if (pos == end_pos)
7537             break;
7538           if (pos == stop_composition)
7539             buf = handle_composition_annotation (pos, end_pos, coding,
7540                                                  buf, &stop_composition);
7541           if (pos == stop_charset)
7542             buf = handle_charset_annotation (pos, end_pos, coding,
7543                                              buf, &stop_charset);
7544           stop = (stop_composition < stop_charset
7545                   ? stop_composition : stop_charset);
7546         }
7547
7548       if (! multibytep)
7549         {
7550           int bytes;
7551
7552           if (coding->encoder == encode_coding_raw_text
7553               || coding->encoder == encode_coding_ccl)
7554             c = *src++, pos++;
7555           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7556             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7557           else
7558             c = BYTE8_TO_CHAR (*src), src++, pos++;
7559         }
7560       else
7561         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7562       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7563         c = '\n';
7564       if (! EQ (eol_type, Qunix))
7565         {
7566           if (c == '\n')
7567             {
7568               if (EQ (eol_type, Qdos))
7569                 *buf++ = '\r';
7570               else
7571                 c = '\r';
7572             }
7573         }
7574
7575       trans = Qnil;
7576       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7577       if (NILP (trans))
7578         *buf++ = c;
7579       else
7580         {
7581           ptrdiff_t from_nchars = 1, to_nchars = 1;
7582           int *lookup_buf_end;
7583           const unsigned char *p = src;
7584           int i;
7585
7586           lookup_buf[0] = c;
7587           for (i = 1; i < max_lookup && p < src_end; i++)
7588             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7589           lookup_buf_end = lookup_buf + i;
7590           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7591           if (INTEGERP (trans))
7592             c = XINT (trans);
7593           else if (CONSP (trans))
7594             {
7595               from_nchars = ASIZE (XCAR (trans));
7596               trans = XCDR (trans);
7597               if (INTEGERP (trans))
7598                 c = XINT (trans);
7599               else
7600                 {
7601                   to_nchars = ASIZE (trans);
7602                   if (buf_end - buf < to_nchars)
7603                     break;
7604                   c = XINT (AREF (trans, 0));
7605                 }
7606             }
7607           else
7608             break;
7609           *buf++ = c;
7610           for (i = 1; i < to_nchars; i++)
7611             *buf++ = XINT (AREF (trans, i));
7612           for (i = 1; i < from_nchars; i++, pos++)
7613             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7614         }
7615     }
7616
7617   coding->consumed = src - coding->source;
7618   coding->consumed_char = pos - coding->src_pos;
7619   coding->charbuf_used = buf - coding->charbuf;
7620   coding->chars_at_source = 0;
7621 }
7622
7623
7624 /* Encode the text at CODING->src_object into CODING->dst_object.
7625    CODING->src_object is a buffer or a string.
7626    CODING->dst_object is a buffer or nil.
7627
7628    If CODING->src_object is a buffer, it must be the current buffer.
7629    In this case, if CODING->src_pos is positive, it is a position of
7630    the source text in the buffer, otherwise. the source text is in the
7631    gap area of the buffer, and coding->src_pos specifies the offset of
7632    the text from GPT (which must be the same as PT).  If this is the
7633    same buffer as CODING->dst_object, CODING->src_pos must be
7634    negative and CODING should not have `pre-write-conversion'.
7635
7636    If CODING->src_object is a string, CODING should not have
7637    `pre-write-conversion'.
7638
7639    If CODING->dst_object is a buffer, the encoded data is inserted at
7640    the current point of that buffer.
7641
7642    If CODING->dst_object is nil, the encoded data is placed at the
7643    memory area specified by CODING->destination.  */
7644
7645 static void
7646 encode_coding (struct coding_system *coding)
7647 {
7648   Lisp_Object attrs;
7649   Lisp_Object translation_table;
7650   int max_lookup;
7651   struct ccl_spec cclspec;
7652
7653   USE_SAFE_ALLOCA;
7654
7655   attrs = CODING_ID_ATTRS (coding->id);
7656   if (coding->encoder == encode_coding_raw_text)
7657     translation_table = Qnil, max_lookup = 0;
7658   else
7659     translation_table = get_translation_table (attrs, 1, &max_lookup);
7660
7661   if (BUFFERP (coding->dst_object))
7662     {
7663       set_buffer_internal (XBUFFER (coding->dst_object));
7664       coding->dst_multibyte
7665         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7666     }
7667
7668   coding->consumed = coding->consumed_char = 0;
7669   coding->produced = coding->produced_char = 0;
7670   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7671   coding->errors = 0;
7672
7673   ALLOC_CONVERSION_WORK_AREA (coding);
7674
7675   if (coding->encoder == encode_coding_ccl)
7676     {
7677       coding->spec.ccl = &cclspec;
7678       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7679     }
7680   do {
7681     coding_set_source (coding);
7682     consume_chars (coding, translation_table, max_lookup);
7683     coding_set_destination (coding);
7684     (*(coding->encoder)) (coding);
7685   } while (coding->consumed_char < coding->src_chars);
7686
7687   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7688     insert_from_gap (coding->produced_char, coding->produced, 0);
7689
7690   SAFE_FREE ();
7691 }
7692
7693
7694 /* Name (or base name) of work buffer for code conversion.  */
7695 static Lisp_Object Vcode_conversion_workbuf_name;
7696
7697 /* A working buffer used by the top level conversion.  Once it is
7698    created, it is never destroyed.  It has the name
7699    Vcode_conversion_workbuf_name.  The other working buffers are
7700    destroyed after the use is finished, and their names are modified
7701    versions of Vcode_conversion_workbuf_name.  */
7702 static Lisp_Object Vcode_conversion_reused_workbuf;
7703
7704 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7705 static bool reused_workbuf_in_use;
7706
7707
7708 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7709    multibyteness of returning buffer.  */
7710
7711 static Lisp_Object
7712 make_conversion_work_buffer (bool multibyte)
7713 {
7714   Lisp_Object name, workbuf;
7715   struct buffer *current;
7716
7717   if (reused_workbuf_in_use)
7718     {
7719       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7720       workbuf = Fget_buffer_create (name);
7721     }
7722   else
7723     {
7724       reused_workbuf_in_use = 1;
7725       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7726         Vcode_conversion_reused_workbuf
7727           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7728       workbuf = Vcode_conversion_reused_workbuf;
7729     }
7730   current = current_buffer;
7731   set_buffer_internal (XBUFFER (workbuf));
7732   /* We can't allow modification hooks to run in the work buffer.  For
7733      instance, directory_files_internal assumes that file decoding
7734      doesn't compile new regexps.  */
7735   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7736   Ferase_buffer ();
7737   bset_undo_list (current_buffer, Qt);
7738   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7739   set_buffer_internal (current);
7740   return workbuf;
7741 }
7742
7743
7744 static Lisp_Object
7745 code_conversion_restore (Lisp_Object arg)
7746 {
7747   Lisp_Object current, workbuf;
7748   struct gcpro gcpro1;
7749
7750   GCPRO1 (arg);
7751   current = XCAR (arg);
7752   workbuf = XCDR (arg);
7753   if (! NILP (workbuf))
7754     {
7755       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7756         reused_workbuf_in_use = 0;
7757       else
7758         Fkill_buffer (workbuf);
7759     }
7760   set_buffer_internal (XBUFFER (current));
7761   UNGCPRO;
7762   return Qnil;
7763 }
7764
7765 Lisp_Object
7766 code_conversion_save (bool with_work_buf, bool multibyte)
7767 {
7768   Lisp_Object workbuf = Qnil;
7769
7770   if (with_work_buf)
7771     workbuf = make_conversion_work_buffer (multibyte);
7772   record_unwind_protect (code_conversion_restore,
7773                          Fcons (Fcurrent_buffer (), workbuf));
7774   return workbuf;
7775 }
7776
7777 void
7778 decode_coding_gap (struct coding_system *coding,
7779                    ptrdiff_t chars, ptrdiff_t bytes)
7780 {
7781   ptrdiff_t count = SPECPDL_INDEX ();
7782   Lisp_Object attrs;
7783
7784   coding->src_object = Fcurrent_buffer ();
7785   coding->src_chars = chars;
7786   coding->src_bytes = bytes;
7787   coding->src_pos = -chars;
7788   coding->src_pos_byte = -bytes;
7789   coding->src_multibyte = chars < bytes;
7790   coding->dst_object = coding->src_object;
7791   coding->dst_pos = PT;
7792   coding->dst_pos_byte = PT_BYTE;
7793   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7794
7795   coding->head_ascii = -1;
7796   coding->detected_utf8_chars = -1;
7797   coding->eol_seen = EOL_SEEN_NONE;
7798   if (CODING_REQUIRE_DETECTION (coding))
7799     detect_coding (coding);
7800   attrs = CODING_ID_ATTRS (coding->id);
7801   if (! disable_ascii_optimization
7802       && ! coding->src_multibyte
7803       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7804       && NILP (CODING_ATTR_POST_READ (attrs))
7805       && NILP (get_translation_table (attrs, 0, NULL)))
7806     {
7807       chars = coding->head_ascii;
7808       if (chars < 0)
7809         chars = check_ascii (coding);
7810       if (chars != bytes)
7811         {
7812           /* There exists a non-ASCII byte.  */
7813           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7814             {
7815               if (coding->detected_utf8_chars >= 0)
7816                 chars = coding->detected_utf8_chars;
7817               else
7818                 chars = check_utf_8 (coding);
7819               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7820                   && coding->head_ascii == 0
7821                   && coding->source[0] == UTF_8_BOM_1
7822                   && coding->source[1] == UTF_8_BOM_2
7823                   && coding->source[2] == UTF_8_BOM_3)
7824                 {
7825                   chars--;
7826                   bytes -= 3;
7827                   coding->src_bytes -= 3;
7828                 }
7829             }
7830           else
7831             chars = -1;
7832         }
7833       if (chars >= 0)
7834         {
7835           Lisp_Object eol_type;
7836
7837           eol_type = CODING_ID_EOL_TYPE (coding->id);
7838           if (VECTORP (eol_type))
7839             {
7840               if (coding->eol_seen != EOL_SEEN_NONE)
7841                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7842             }
7843           if (EQ (eol_type, Qmac))
7844             {
7845               unsigned char *src_end = GAP_END_ADDR;
7846               unsigned char *src = src_end - coding->src_bytes;
7847
7848               while (src < src_end)
7849                 {
7850                   if (*src++ == '\r')
7851                     src[-1] = '\n';
7852                 }
7853             }
7854           else if (EQ (eol_type, Qdos))
7855             {
7856               unsigned char *src = GAP_END_ADDR;
7857               unsigned char *src_beg = src - coding->src_bytes;
7858               unsigned char *dst = src;
7859               ptrdiff_t diff;
7860
7861               while (src_beg < src)
7862                 {
7863                   *--dst = *--src;
7864                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7865                     src--;
7866                 }
7867               diff = dst - src;
7868               bytes -= diff;
7869               chars -= diff;
7870             }
7871           coding->produced = bytes;
7872           coding->produced_char = chars;
7873           insert_from_gap (chars, bytes, 1);
7874           return;
7875         }
7876     }
7877   code_conversion_save (0, 0);
7878
7879   coding->mode |= CODING_MODE_LAST_BLOCK;
7880   current_buffer->text->inhibit_shrinking = 1;
7881   decode_coding (coding);
7882   current_buffer->text->inhibit_shrinking = 0;
7883
7884   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7885     {
7886       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7887       Lisp_Object val;
7888
7889       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7890       val = call1 (CODING_ATTR_POST_READ (attrs),
7891                    make_number (coding->produced_char));
7892       CHECK_NATNUM (val);
7893       coding->produced_char += Z - prev_Z;
7894       coding->produced += Z_BYTE - prev_Z_BYTE;
7895     }
7896
7897   unbind_to (count, Qnil);
7898 }
7899
7900
7901 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7902    SRC_OBJECT into DST_OBJECT by coding context CODING.
7903
7904    SRC_OBJECT is a buffer, a string, or Qnil.
7905
7906    If it is a buffer, the text is at point of the buffer.  FROM and TO
7907    are positions in the buffer.
7908
7909    If it is a string, the text is at the beginning of the string.
7910    FROM and TO are indices to the string.
7911
7912    If it is nil, the text is at coding->source.  FROM and TO are
7913    indices to coding->source.
7914
7915    DST_OBJECT is a buffer, Qt, or Qnil.
7916
7917    If it is a buffer, the decoded text is inserted at point of the
7918    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7919    is deleted.
7920
7921    If it is Qt, a string is made from the decoded text, and
7922    set in CODING->dst_object.
7923
7924    If it is Qnil, the decoded text is stored at CODING->destination.
7925    The caller must allocate CODING->dst_bytes bytes at
7926    CODING->destination by xmalloc.  If the decoded text is longer than
7927    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7928  */
7929
7930 void
7931 decode_coding_object (struct coding_system *coding,
7932                       Lisp_Object src_object,
7933                       ptrdiff_t from, ptrdiff_t from_byte,
7934                       ptrdiff_t to, ptrdiff_t to_byte,
7935                       Lisp_Object dst_object)
7936 {
7937   ptrdiff_t count = SPECPDL_INDEX ();
7938   unsigned char *destination IF_LINT (= NULL);
7939   ptrdiff_t dst_bytes IF_LINT (= 0);
7940   ptrdiff_t chars = to - from;
7941   ptrdiff_t bytes = to_byte - from_byte;
7942   Lisp_Object attrs;
7943   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7944   bool need_marker_adjustment = 0;
7945   Lisp_Object old_deactivate_mark;
7946
7947   old_deactivate_mark = Vdeactivate_mark;
7948
7949   if (NILP (dst_object))
7950     {
7951       destination = coding->destination;
7952       dst_bytes = coding->dst_bytes;
7953     }
7954
7955   coding->src_object = src_object;
7956   coding->src_chars = chars;
7957   coding->src_bytes = bytes;
7958   coding->src_multibyte = chars < bytes;
7959
7960   if (STRINGP (src_object))
7961     {
7962       coding->src_pos = from;
7963       coding->src_pos_byte = from_byte;
7964     }
7965   else if (BUFFERP (src_object))
7966     {
7967       set_buffer_internal (XBUFFER (src_object));
7968       if (from != GPT)
7969         move_gap_both (from, from_byte);
7970       if (EQ (src_object, dst_object))
7971         {
7972           struct Lisp_Marker *tail;
7973
7974           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7975             {
7976               tail->need_adjustment
7977                 = tail->charpos == (tail->insertion_type ? from : to);
7978               need_marker_adjustment |= tail->need_adjustment;
7979             }
7980           saved_pt = PT, saved_pt_byte = PT_BYTE;
7981           TEMP_SET_PT_BOTH (from, from_byte);
7982           current_buffer->text->inhibit_shrinking = 1;
7983           del_range_both (from, from_byte, to, to_byte, 1);
7984           coding->src_pos = -chars;
7985           coding->src_pos_byte = -bytes;
7986         }
7987       else
7988         {
7989           coding->src_pos = from;
7990           coding->src_pos_byte = from_byte;
7991         }
7992     }
7993
7994   if (CODING_REQUIRE_DETECTION (coding))
7995     detect_coding (coding);
7996   attrs = CODING_ID_ATTRS (coding->id);
7997
7998   if (EQ (dst_object, Qt)
7999       || (! NILP (CODING_ATTR_POST_READ (attrs))
8000           && NILP (dst_object)))
8001     {
8002       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8003       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8004       coding->dst_pos = BEG;
8005       coding->dst_pos_byte = BEG_BYTE;
8006     }
8007   else if (BUFFERP (dst_object))
8008     {
8009       code_conversion_save (0, 0);
8010       coding->dst_object = dst_object;
8011       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8012       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8013       coding->dst_multibyte
8014         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8015     }
8016   else
8017     {
8018       code_conversion_save (0, 0);
8019       coding->dst_object = Qnil;
8020       /* Most callers presume this will return a multibyte result, and they
8021          won't use `binary' or `raw-text' anyway, so let's not worry about
8022          CODING_FOR_UNIBYTE.  */
8023       coding->dst_multibyte = 1;
8024     }
8025
8026   decode_coding (coding);
8027
8028   if (BUFFERP (coding->dst_object))
8029     set_buffer_internal (XBUFFER (coding->dst_object));
8030
8031   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8032     {
8033       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8034       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8035       Lisp_Object val;
8036
8037       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8038       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8039               old_deactivate_mark);
8040       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8041                         make_number (coding->produced_char));
8042       UNGCPRO;
8043       CHECK_NATNUM (val);
8044       coding->produced_char += Z - prev_Z;
8045       coding->produced += Z_BYTE - prev_Z_BYTE;
8046     }
8047
8048   if (EQ (dst_object, Qt))
8049     {
8050       coding->dst_object = Fbuffer_string ();
8051     }
8052   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8053     {
8054       set_buffer_internal (XBUFFER (coding->dst_object));
8055       if (dst_bytes < coding->produced)
8056         {
8057           eassert (coding->produced > 0);
8058           destination = xrealloc (destination, coding->produced);
8059           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8060             move_gap_both (BEGV, BEGV_BYTE);
8061           memcpy (destination, BEGV_ADDR, coding->produced);
8062           coding->destination = destination;
8063         }
8064     }
8065
8066   if (saved_pt >= 0)
8067     {
8068       /* This is the case of:
8069          (BUFFERP (src_object) && EQ (src_object, dst_object))
8070          As we have moved PT while replacing the original buffer
8071          contents, we must recover it now.  */
8072       set_buffer_internal (XBUFFER (src_object));
8073       current_buffer->text->inhibit_shrinking = 0;
8074       if (saved_pt < from)
8075         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8076       else if (saved_pt < from + chars)
8077         TEMP_SET_PT_BOTH (from, from_byte);
8078       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8079         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8080                           saved_pt_byte + (coding->produced - bytes));
8081       else
8082         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8083                           saved_pt_byte + (coding->produced - bytes));
8084
8085       if (need_marker_adjustment)
8086         {
8087           struct Lisp_Marker *tail;
8088
8089           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8090             if (tail->need_adjustment)
8091               {
8092                 tail->need_adjustment = 0;
8093                 if (tail->insertion_type)
8094                   {
8095                     tail->bytepos = from_byte;
8096                     tail->charpos = from;
8097                   }
8098                 else
8099                   {
8100                     tail->bytepos = from_byte + coding->produced;
8101                     tail->charpos
8102                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8103                          ? tail->bytepos : from + coding->produced_char);
8104                   }
8105               }
8106         }
8107     }
8108
8109   Vdeactivate_mark = old_deactivate_mark;
8110   unbind_to (count, coding->dst_object);
8111 }
8112
8113
8114 void
8115 encode_coding_object (struct coding_system *coding,
8116                       Lisp_Object src_object,
8117                       ptrdiff_t from, ptrdiff_t from_byte,
8118                       ptrdiff_t to, ptrdiff_t to_byte,
8119                       Lisp_Object dst_object)
8120 {
8121   ptrdiff_t count = SPECPDL_INDEX ();
8122   ptrdiff_t chars = to - from;
8123   ptrdiff_t bytes = to_byte - from_byte;
8124   Lisp_Object attrs;
8125   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8126   bool need_marker_adjustment = 0;
8127   bool kill_src_buffer = 0;
8128   Lisp_Object old_deactivate_mark;
8129
8130   old_deactivate_mark = Vdeactivate_mark;
8131
8132   coding->src_object = src_object;
8133   coding->src_chars = chars;
8134   coding->src_bytes = bytes;
8135   coding->src_multibyte = chars < bytes;
8136
8137   attrs = CODING_ID_ATTRS (coding->id);
8138
8139   if (EQ (src_object, dst_object))
8140     {
8141       struct Lisp_Marker *tail;
8142
8143       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8144         {
8145           tail->need_adjustment
8146             = tail->charpos == (tail->insertion_type ? from : to);
8147           need_marker_adjustment |= tail->need_adjustment;
8148         }
8149     }
8150
8151   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8152     {
8153       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8154       set_buffer_internal (XBUFFER (coding->src_object));
8155       if (STRINGP (src_object))
8156         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8157       else if (BUFFERP (src_object))
8158         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8159       else
8160         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8161
8162       if (EQ (src_object, dst_object))
8163         {
8164           set_buffer_internal (XBUFFER (src_object));
8165           saved_pt = PT, saved_pt_byte = PT_BYTE;
8166           del_range_both (from, from_byte, to, to_byte, 1);
8167           set_buffer_internal (XBUFFER (coding->src_object));
8168         }
8169
8170       {
8171         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8172
8173         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8174                 old_deactivate_mark);
8175         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8176                     make_number (BEG), make_number (Z));
8177         UNGCPRO;
8178       }
8179       if (XBUFFER (coding->src_object) != current_buffer)
8180         kill_src_buffer = 1;
8181       coding->src_object = Fcurrent_buffer ();
8182       if (BEG != GPT)
8183         move_gap_both (BEG, BEG_BYTE);
8184       coding->src_chars = Z - BEG;
8185       coding->src_bytes = Z_BYTE - BEG_BYTE;
8186       coding->src_pos = BEG;
8187       coding->src_pos_byte = BEG_BYTE;
8188       coding->src_multibyte = Z < Z_BYTE;
8189     }
8190   else if (STRINGP (src_object))
8191     {
8192       code_conversion_save (0, 0);
8193       coding->src_pos = from;
8194       coding->src_pos_byte = from_byte;
8195     }
8196   else if (BUFFERP (src_object))
8197     {
8198       code_conversion_save (0, 0);
8199       set_buffer_internal (XBUFFER (src_object));
8200       if (EQ (src_object, dst_object))
8201         {
8202           saved_pt = PT, saved_pt_byte = PT_BYTE;
8203           coding->src_object = del_range_1 (from, to, 1, 1);
8204           coding->src_pos = 0;
8205           coding->src_pos_byte = 0;
8206         }
8207       else
8208         {
8209           if (from < GPT && to >= GPT)
8210             move_gap_both (from, from_byte);
8211           coding->src_pos = from;
8212           coding->src_pos_byte = from_byte;
8213         }
8214     }
8215   else
8216     code_conversion_save (0, 0);
8217
8218   if (BUFFERP (dst_object))
8219     {
8220       coding->dst_object = dst_object;
8221       if (EQ (src_object, dst_object))
8222         {
8223           coding->dst_pos = from;
8224           coding->dst_pos_byte = from_byte;
8225         }
8226       else
8227         {
8228           struct buffer *current = current_buffer;
8229
8230           set_buffer_temp (XBUFFER (dst_object));
8231           coding->dst_pos = PT;
8232           coding->dst_pos_byte = PT_BYTE;
8233           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8234           set_buffer_temp (current);
8235         }
8236       coding->dst_multibyte
8237         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8238     }
8239   else if (EQ (dst_object, Qt))
8240     {
8241       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8242       coding->dst_object = Qnil;
8243       coding->destination = xmalloc (dst_bytes);
8244       coding->dst_bytes = dst_bytes;
8245       coding->dst_multibyte = 0;
8246     }
8247   else
8248     {
8249       coding->dst_object = Qnil;
8250       coding->dst_multibyte = 0;
8251     }
8252
8253   encode_coding (coding);
8254
8255   if (EQ (dst_object, Qt))
8256     {
8257       if (BUFFERP (coding->dst_object))
8258         coding->dst_object = Fbuffer_string ();
8259       else
8260         {
8261           coding->dst_object
8262             = make_unibyte_string ((char *) coding->destination,
8263                                    coding->produced);
8264           xfree (coding->destination);
8265         }
8266     }
8267
8268   if (saved_pt >= 0)
8269     {
8270       /* This is the case of:
8271          (BUFFERP (src_object) && EQ (src_object, dst_object))
8272          As we have moved PT while replacing the original buffer
8273          contents, we must recover it now.  */
8274       set_buffer_internal (XBUFFER (src_object));
8275       if (saved_pt < from)
8276         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8277       else if (saved_pt < from + chars)
8278         TEMP_SET_PT_BOTH (from, from_byte);
8279       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8280         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8281                           saved_pt_byte + (coding->produced - bytes));
8282       else
8283         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8284                           saved_pt_byte + (coding->produced - bytes));
8285
8286       if (need_marker_adjustment)
8287         {
8288           struct Lisp_Marker *tail;
8289
8290           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8291             if (tail->need_adjustment)
8292               {
8293                 tail->need_adjustment = 0;
8294                 if (tail->insertion_type)
8295                   {
8296                     tail->bytepos = from_byte;
8297                     tail->charpos = from;
8298                   }
8299                 else
8300                   {
8301                     tail->bytepos = from_byte + coding->produced;
8302                     tail->charpos
8303                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8304                          ? tail->bytepos : from + coding->produced_char);
8305                   }
8306               }
8307         }
8308     }
8309
8310   if (kill_src_buffer)
8311     Fkill_buffer (coding->src_object);
8312
8313   Vdeactivate_mark = old_deactivate_mark;
8314   unbind_to (count, Qnil);
8315 }
8316
8317
8318 Lisp_Object
8319 preferred_coding_system (void)
8320 {
8321   int id = coding_categories[coding_priorities[0]].id;
8322
8323   return CODING_ID_NAME (id);
8324 }
8325
8326 #if defined (WINDOWSNT) || defined (CYGWIN)
8327
8328 Lisp_Object
8329 from_unicode (Lisp_Object str)
8330 {
8331   CHECK_STRING (str);
8332   if (!STRING_MULTIBYTE (str) &&
8333       SBYTES (str) & 1)
8334     {
8335       str = Fsubstring (str, make_number (0), make_number (-1));
8336     }
8337
8338   return code_convert_string_norecord (str, Qutf_16le, 0);
8339 }
8340
8341 Lisp_Object
8342 from_unicode_buffer (const wchar_t* wstr)
8343 {
8344     return from_unicode (
8345         make_unibyte_string (
8346             (char*) wstr,
8347             /* we get one of the two final 0 bytes for free. */
8348             1 + sizeof (wchar_t) * wcslen (wstr)));
8349 }
8350
8351 wchar_t *
8352 to_unicode (Lisp_Object str, Lisp_Object *buf)
8353 {
8354   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8355   /* We need to make another copy (in addition to the one made by
8356      code_convert_string_norecord) to ensure that the final string is
8357      _doubly_ zero terminated --- that is, that the string is
8358      terminated by two zero bytes and one utf-16le null character.
8359      Because strings are already terminated with a single zero byte,
8360      we just add one additional zero. */
8361   str = make_uninit_string (SBYTES (*buf) + 1);
8362   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8363   SDATA (str) [SBYTES (*buf)] = '\0';
8364   *buf = str;
8365   return WCSDATA (*buf);
8366 }
8367
8368 #endif /* WINDOWSNT || CYGWIN */
8369
8370 \f
8371 #ifdef emacs
8372 /*** 8. Emacs Lisp library functions ***/
8373
8374 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8375        doc: /* Return t if OBJECT is nil or a coding-system.
8376 See the documentation of `define-coding-system' for information
8377 about coding-system objects.  */)
8378   (Lisp_Object object)
8379 {
8380   if (NILP (object)
8381       || CODING_SYSTEM_ID (object) >= 0)
8382     return Qt;
8383   if (! SYMBOLP (object)
8384       || NILP (Fget (object, Qcoding_system_define_form)))
8385     return Qnil;
8386   return Qt;
8387 }
8388
8389 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8390        Sread_non_nil_coding_system, 1, 1, 0,
8391        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8392   (Lisp_Object prompt)
8393 {
8394   Lisp_Object val;
8395   do
8396     {
8397       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8398                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8399     }
8400   while (SCHARS (val) == 0);
8401   return (Fintern (val, Qnil));
8402 }
8403
8404 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8405        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8406 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8407 Ignores case when completing coding systems (all Emacs coding systems
8408 are lower-case).  */)
8409   (Lisp_Object prompt, Lisp_Object default_coding_system)
8410 {
8411   Lisp_Object val;
8412   ptrdiff_t count = SPECPDL_INDEX ();
8413
8414   if (SYMBOLP (default_coding_system))
8415     default_coding_system = SYMBOL_NAME (default_coding_system);
8416   specbind (Qcompletion_ignore_case, Qt);
8417   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8418                           Qt, Qnil, Qcoding_system_history,
8419                           default_coding_system, Qnil);
8420   unbind_to (count, Qnil);
8421   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8422 }
8423
8424 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8425        1, 1, 0,
8426        doc: /* Check validity of CODING-SYSTEM.
8427 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8428 It is valid if it is nil or a symbol defined as a coding system by the
8429 function `define-coding-system'.  */)
8430   (Lisp_Object coding_system)
8431 {
8432   Lisp_Object define_form;
8433
8434   define_form = Fget (coding_system, Qcoding_system_define_form);
8435   if (! NILP (define_form))
8436     {
8437       Fput (coding_system, Qcoding_system_define_form, Qnil);
8438       safe_eval (define_form);
8439     }
8440   if (!NILP (Fcoding_system_p (coding_system)))
8441     return coding_system;
8442   xsignal1 (Qcoding_system_error, coding_system);
8443 }
8444
8445 \f
8446 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8447    HIGHEST, return the coding system of the highest
8448    priority among the detected coding systems.  Otherwise return a
8449    list of detected coding systems sorted by their priorities.  If
8450    MULTIBYTEP, it is assumed that the bytes are in correct
8451    multibyte form but contains only ASCII and eight-bit chars.
8452    Otherwise, the bytes are raw bytes.
8453
8454    CODING-SYSTEM controls the detection as below:
8455
8456    If it is nil, detect both text-format and eol-format.  If the
8457    text-format part of CODING-SYSTEM is already specified
8458    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8459    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8460    detect only text-format.  */
8461
8462 Lisp_Object
8463 detect_coding_system (const unsigned char *src,
8464                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8465                       bool highest, bool multibytep,
8466                       Lisp_Object coding_system)
8467 {
8468   const unsigned char *src_end = src + src_bytes;
8469   Lisp_Object attrs, eol_type;
8470   Lisp_Object val = Qnil;
8471   struct coding_system coding;
8472   ptrdiff_t id;
8473   struct coding_detection_info detect_info;
8474   enum coding_category base_category;
8475   bool null_byte_found = 0, eight_bit_found = 0;
8476
8477   if (NILP (coding_system))
8478     coding_system = Qundecided;
8479   setup_coding_system (coding_system, &coding);
8480   attrs = CODING_ID_ATTRS (coding.id);
8481   eol_type = CODING_ID_EOL_TYPE (coding.id);
8482   coding_system = CODING_ATTR_BASE_NAME (attrs);
8483
8484   coding.source = src;
8485   coding.src_chars = src_chars;
8486   coding.src_bytes = src_bytes;
8487   coding.src_multibyte = multibytep;
8488   coding.consumed = 0;
8489   coding.mode |= CODING_MODE_LAST_BLOCK;
8490   coding.head_ascii = 0;
8491
8492   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8493
8494   /* At first, detect text-format if necessary.  */
8495   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8496   if (base_category == coding_category_undecided)
8497     {
8498       enum coding_category category IF_LINT (= 0);
8499       struct coding_system *this IF_LINT (= NULL);
8500       int c, i;
8501
8502       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8503       for (; src < src_end; src++)
8504         {
8505           c = *src;
8506           if (c & 0x80)
8507             {
8508               eight_bit_found = 1;
8509               if (null_byte_found)
8510                 break;
8511             }
8512           else if (c < 0x20)
8513             {
8514               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8515                   && ! inhibit_iso_escape_detection
8516                   && ! detect_info.checked)
8517                 {
8518                   if (detect_coding_iso_2022 (&coding, &detect_info))
8519                     {
8520                       /* We have scanned the whole data.  */
8521                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8522                         {
8523                           /* We didn't find an 8-bit code.  We may
8524                              have found a null-byte, but it's very
8525                              rare that a binary file confirm to
8526                              ISO-2022.  */
8527                           src = src_end;
8528                           coding.head_ascii = src - coding.source;
8529                         }
8530                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8531                       break;
8532                     }
8533                 }
8534               else if (! c && !inhibit_null_byte_detection)
8535                 {
8536                   null_byte_found = 1;
8537                   if (eight_bit_found)
8538                     break;
8539                 }
8540               if (! eight_bit_found)
8541                 coding.head_ascii++;
8542             }
8543           else if (! eight_bit_found)
8544             coding.head_ascii++;
8545         }
8546
8547       if (null_byte_found || eight_bit_found
8548           || coding.head_ascii < coding.src_bytes
8549           || detect_info.found)
8550         {
8551           if (coding.head_ascii == coding.src_bytes)
8552             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8553             for (i = 0; i < coding_category_raw_text; i++)
8554               {
8555                 category = coding_priorities[i];
8556                 this = coding_categories + category;
8557                 if (detect_info.found & (1 << category))
8558                   break;
8559               }
8560           else
8561             {
8562               if (null_byte_found)
8563                 {
8564                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8565                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8566                 }
8567               for (i = 0; i < coding_category_raw_text; i++)
8568                 {
8569                   category = coding_priorities[i];
8570                   this = coding_categories + category;
8571
8572                   if (this->id < 0)
8573                     {
8574                       /* No coding system of this category is defined.  */
8575                       detect_info.rejected |= (1 << category);
8576                     }
8577                   else if (category >= coding_category_raw_text)
8578                     continue;
8579                   else if (detect_info.checked & (1 << category))
8580                     {
8581                       if (highest
8582                           && (detect_info.found & (1 << category)))
8583                         break;
8584                     }
8585                   else if ((*(this->detector)) (&coding, &detect_info)
8586                            && highest
8587                            && (detect_info.found & (1 << category)))
8588                     {
8589                       if (category == coding_category_utf_16_auto)
8590                         {
8591                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8592                             category = coding_category_utf_16_le;
8593                           else
8594                             category = coding_category_utf_16_be;
8595                         }
8596                       break;
8597                     }
8598                 }
8599             }
8600         }
8601
8602       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8603           || null_byte_found)
8604         {
8605           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8606           id = CODING_SYSTEM_ID (Qno_conversion);
8607           val = Fcons (make_number (id), Qnil);
8608         }
8609       else if (! detect_info.rejected && ! detect_info.found)
8610         {
8611           detect_info.found = CATEGORY_MASK_ANY;
8612           id = coding_categories[coding_category_undecided].id;
8613           val = Fcons (make_number (id), Qnil);
8614         }
8615       else if (highest)
8616         {
8617           if (detect_info.found)
8618             {
8619               detect_info.found = 1 << category;
8620               val = Fcons (make_number (this->id), Qnil);
8621             }
8622           else
8623             for (i = 0; i < coding_category_raw_text; i++)
8624               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8625                 {
8626                   detect_info.found = 1 << coding_priorities[i];
8627                   id = coding_categories[coding_priorities[i]].id;
8628                   val = Fcons (make_number (id), Qnil);
8629                   break;
8630                 }
8631         }
8632       else
8633         {
8634           int mask = detect_info.rejected | detect_info.found;
8635           int found = 0;
8636
8637           for (i = coding_category_raw_text - 1; i >= 0; i--)
8638             {
8639               category = coding_priorities[i];
8640               if (! (mask & (1 << category)))
8641                 {
8642                   found |= 1 << category;
8643                   id = coding_categories[category].id;
8644                   if (id >= 0)
8645                     val = Fcons (make_number (id), val);
8646                 }
8647             }
8648           for (i = coding_category_raw_text - 1; i >= 0; i--)
8649             {
8650               category = coding_priorities[i];
8651               if (detect_info.found & (1 << category))
8652                 {
8653                   id = coding_categories[category].id;
8654                   val = Fcons (make_number (id), val);
8655                 }
8656             }
8657           detect_info.found |= found;
8658         }
8659     }
8660   else if (base_category == coding_category_utf_8_auto)
8661     {
8662       if (detect_coding_utf_8 (&coding, &detect_info))
8663         {
8664           struct coding_system *this;
8665
8666           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8667             this = coding_categories + coding_category_utf_8_sig;
8668           else
8669             this = coding_categories + coding_category_utf_8_nosig;
8670           val = Fcons (make_number (this->id), Qnil);
8671         }
8672     }
8673   else if (base_category == coding_category_utf_16_auto)
8674     {
8675       if (detect_coding_utf_16 (&coding, &detect_info))
8676         {
8677           struct coding_system *this;
8678
8679           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8680             this = coding_categories + coding_category_utf_16_le;
8681           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8682             this = coding_categories + coding_category_utf_16_be;
8683           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8684             this = coding_categories + coding_category_utf_16_be_nosig;
8685           else
8686             this = coding_categories + coding_category_utf_16_le_nosig;
8687           val = Fcons (make_number (this->id), Qnil);
8688         }
8689     }
8690   else
8691     {
8692       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8693       val = Fcons (make_number (coding.id), Qnil);
8694     }
8695
8696   /* Then, detect eol-format if necessary.  */
8697   {
8698     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8699     Lisp_Object tail;
8700
8701     if (VECTORP (eol_type))
8702       {
8703         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8704           {
8705             if (null_byte_found)
8706               normal_eol = EOL_SEEN_LF;
8707             else
8708               normal_eol = detect_eol (coding.source, src_bytes,
8709                                        coding_category_raw_text);
8710           }
8711         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8712                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8713           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8714                                       coding_category_utf_16_be);
8715         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8716                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8717           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8718                                       coding_category_utf_16_le);
8719       }
8720     else
8721       {
8722         if (EQ (eol_type, Qunix))
8723           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8724         else if (EQ (eol_type, Qdos))
8725           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8726         else
8727           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8728       }
8729
8730     for (tail = val; CONSP (tail); tail = XCDR (tail))
8731       {
8732         enum coding_category category;
8733         int this_eol;
8734
8735         id = XINT (XCAR (tail));
8736         attrs = CODING_ID_ATTRS (id);
8737         category = XINT (CODING_ATTR_CATEGORY (attrs));
8738         eol_type = CODING_ID_EOL_TYPE (id);
8739         if (VECTORP (eol_type))
8740           {
8741             if (category == coding_category_utf_16_be
8742                 || category == coding_category_utf_16_be_nosig)
8743               this_eol = utf_16_be_eol;
8744             else if (category == coding_category_utf_16_le
8745                      || category == coding_category_utf_16_le_nosig)
8746               this_eol = utf_16_le_eol;
8747             else
8748               this_eol = normal_eol;
8749
8750             if (this_eol == EOL_SEEN_LF)
8751               XSETCAR (tail, AREF (eol_type, 0));
8752             else if (this_eol == EOL_SEEN_CRLF)
8753               XSETCAR (tail, AREF (eol_type, 1));
8754             else if (this_eol == EOL_SEEN_CR)
8755               XSETCAR (tail, AREF (eol_type, 2));
8756             else
8757               XSETCAR (tail, CODING_ID_NAME (id));
8758           }
8759         else
8760           XSETCAR (tail, CODING_ID_NAME (id));
8761       }
8762   }
8763
8764   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8765 }
8766
8767
8768 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8769        2, 3, 0,
8770        doc: /* Detect coding system of the text in the region between START and END.
8771 Return a list of possible coding systems ordered by priority.
8772 The coding systems to try and their priorities follows what
8773 the function `coding-system-priority-list' (which see) returns.
8774
8775 If only ASCII characters are found (except for such ISO-2022 control
8776 characters as ESC), it returns a list of single element `undecided'
8777 or its subsidiary coding system according to a detected end-of-line
8778 format.
8779
8780 If optional argument HIGHEST is non-nil, return the coding system of
8781 highest priority.  */)
8782   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8783 {
8784   ptrdiff_t from, to;
8785   ptrdiff_t from_byte, to_byte;
8786
8787   validate_region (&start, &end);
8788   from = XINT (start), to = XINT (end);
8789   from_byte = CHAR_TO_BYTE (from);
8790   to_byte = CHAR_TO_BYTE (to);
8791
8792   if (from < GPT && to >= GPT)
8793     move_gap_both (to, to_byte);
8794
8795   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8796                                to - from, to_byte - from_byte,
8797                                !NILP (highest),
8798                                !NILP (BVAR (current_buffer
8799                                       , enable_multibyte_characters)),
8800                                Qnil);
8801 }
8802
8803 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8804        1, 2, 0,
8805        doc: /* Detect coding system of the text in STRING.
8806 Return a list of possible coding systems ordered by priority.
8807 The coding systems to try and their priorities follows what
8808 the function `coding-system-priority-list' (which see) returns.
8809
8810 If only ASCII characters are found (except for such ISO-2022 control
8811 characters as ESC), it returns a list of single element `undecided'
8812 or its subsidiary coding system according to a detected end-of-line
8813 format.
8814
8815 If optional argument HIGHEST is non-nil, return the coding system of
8816 highest priority.  */)
8817   (Lisp_Object string, Lisp_Object highest)
8818 {
8819   CHECK_STRING (string);
8820
8821   return detect_coding_system (SDATA (string),
8822                                SCHARS (string), SBYTES (string),
8823                                !NILP (highest), STRING_MULTIBYTE (string),
8824                                Qnil);
8825 }
8826
8827
8828 static bool
8829 char_encodable_p (int c, Lisp_Object attrs)
8830 {
8831   Lisp_Object tail;
8832   struct charset *charset;
8833   Lisp_Object translation_table;
8834
8835   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8836   if (! NILP (translation_table))
8837     c = translate_char (translation_table, c);
8838   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8839        CONSP (tail); tail = XCDR (tail))
8840     {
8841       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8842       if (CHAR_CHARSET_P (c, charset))
8843         break;
8844     }
8845   return (! NILP (tail));
8846 }
8847
8848
8849 /* Return a list of coding systems that safely encode the text between
8850    START and END.  If EXCLUDE is non-nil, it is a list of coding
8851    systems not to check.  The returned list doesn't contain any such
8852    coding systems.  In any case, if the text contains only ASCII or is
8853    unibyte, return t.  */
8854
8855 DEFUN ("find-coding-systems-region-internal",
8856        Ffind_coding_systems_region_internal,
8857        Sfind_coding_systems_region_internal, 2, 3, 0,
8858        doc: /* Internal use only.  */)
8859   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8860 {
8861   Lisp_Object coding_attrs_list, safe_codings;
8862   ptrdiff_t start_byte, end_byte;
8863   const unsigned char *p, *pbeg, *pend;
8864   int c;
8865   Lisp_Object tail, elt, work_table;
8866
8867   if (STRINGP (start))
8868     {
8869       if (!STRING_MULTIBYTE (start)
8870           || SCHARS (start) == SBYTES (start))
8871         return Qt;
8872       start_byte = 0;
8873       end_byte = SBYTES (start);
8874     }
8875   else
8876     {
8877       CHECK_NUMBER_COERCE_MARKER (start);
8878       CHECK_NUMBER_COERCE_MARKER (end);
8879       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8880         args_out_of_range (start, end);
8881       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8882         return Qt;
8883       start_byte = CHAR_TO_BYTE (XINT (start));
8884       end_byte = CHAR_TO_BYTE (XINT (end));
8885       if (XINT (end) - XINT (start) == end_byte - start_byte)
8886         return Qt;
8887
8888       if (XINT (start) < GPT && XINT (end) > GPT)
8889         {
8890           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8891             move_gap_both (XINT (start), start_byte);
8892           else
8893             move_gap_both (XINT (end), end_byte);
8894         }
8895     }
8896
8897   coding_attrs_list = Qnil;
8898   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8899     if (NILP (exclude)
8900         || NILP (Fmemq (XCAR (tail), exclude)))
8901       {
8902         Lisp_Object attrs;
8903
8904         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8905         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8906             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8907           {
8908             ASET (attrs, coding_attr_trans_tbl,
8909                   get_translation_table (attrs, 1, NULL));
8910             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8911           }
8912       }
8913
8914   if (STRINGP (start))
8915     p = pbeg = SDATA (start);
8916   else
8917     p = pbeg = BYTE_POS_ADDR (start_byte);
8918   pend = p + (end_byte - start_byte);
8919
8920   while (p < pend && ASCII_BYTE_P (*p)) p++;
8921   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8922
8923   work_table = Fmake_char_table (Qnil, Qnil);
8924   while (p < pend)
8925     {
8926       if (ASCII_BYTE_P (*p))
8927         p++;
8928       else
8929         {
8930           c = STRING_CHAR_ADVANCE (p);
8931           if (!NILP (char_table_ref (work_table, c)))
8932             /* This character was already checked.  Ignore it.  */
8933             continue;
8934
8935           charset_map_loaded = 0;
8936           for (tail = coding_attrs_list; CONSP (tail);)
8937             {
8938               elt = XCAR (tail);
8939               if (NILP (elt))
8940                 tail = XCDR (tail);
8941               else if (char_encodable_p (c, elt))
8942                 tail = XCDR (tail);
8943               else if (CONSP (XCDR (tail)))
8944                 {
8945                   XSETCAR (tail, XCAR (XCDR (tail)));
8946                   XSETCDR (tail, XCDR (XCDR (tail)));
8947                 }
8948               else
8949                 {
8950                   XSETCAR (tail, Qnil);
8951                   tail = XCDR (tail);
8952                 }
8953             }
8954           if (charset_map_loaded)
8955             {
8956               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8957
8958               if (STRINGP (start))
8959                 pbeg = SDATA (start);
8960               else
8961                 pbeg = BYTE_POS_ADDR (start_byte);
8962               p = pbeg + p_offset;
8963               pend = pbeg + pend_offset;
8964             }
8965           char_table_set (work_table, c, Qt);
8966         }
8967     }
8968
8969   safe_codings = list2 (Qraw_text, Qno_conversion);
8970   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8971     if (! NILP (XCAR (tail)))
8972       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8973
8974   return safe_codings;
8975 }
8976
8977
8978 DEFUN ("unencodable-char-position", Funencodable_char_position,
8979        Sunencodable_char_position, 3, 5, 0,
8980        doc: /*
8981 Return position of first un-encodable character in a region.
8982 START and END specify the region and CODING-SYSTEM specifies the
8983 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8984
8985 If optional 4th argument COUNT is non-nil, it specifies at most how
8986 many un-encodable characters to search.  In this case, the value is a
8987 list of positions.
8988
8989 If optional 5th argument STRING is non-nil, it is a string to search
8990 for un-encodable characters.  In that case, START and END are indexes
8991 to the string.  */)
8992   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8993 {
8994   EMACS_INT n;
8995   struct coding_system coding;
8996   Lisp_Object attrs, charset_list, translation_table;
8997   Lisp_Object positions;
8998   ptrdiff_t from, to;
8999   const unsigned char *p, *stop, *pend;
9000   bool ascii_compatible;
9001
9002   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9003   attrs = CODING_ID_ATTRS (coding.id);
9004   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9005     return Qnil;
9006   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9007   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9008   translation_table = get_translation_table (attrs, 1, NULL);
9009
9010   if (NILP (string))
9011     {
9012       validate_region (&start, &end);
9013       from = XINT (start);
9014       to = XINT (end);
9015       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9016           || (ascii_compatible
9017               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9018         return Qnil;
9019       p = CHAR_POS_ADDR (from);
9020       pend = CHAR_POS_ADDR (to);
9021       if (from < GPT && to >= GPT)
9022         stop = GPT_ADDR;
9023       else
9024         stop = pend;
9025     }
9026   else
9027     {
9028       CHECK_STRING (string);
9029       CHECK_NATNUM (start);
9030       CHECK_NATNUM (end);
9031       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9032         args_out_of_range_3 (string, start, end);
9033       from = XINT (start);
9034       to = XINT (end);
9035       if (! STRING_MULTIBYTE (string))
9036         return Qnil;
9037       p = SDATA (string) + string_char_to_byte (string, from);
9038       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9039       if (ascii_compatible && (to - from) == (pend - p))
9040         return Qnil;
9041     }
9042
9043   if (NILP (count))
9044     n = 1;
9045   else
9046     {
9047       CHECK_NATNUM (count);
9048       n = XINT (count);
9049     }
9050
9051   positions = Qnil;
9052   charset_map_loaded = 0;
9053   while (1)
9054     {
9055       int c;
9056
9057       if (ascii_compatible)
9058         while (p < stop && ASCII_BYTE_P (*p))
9059           p++, from++;
9060       if (p >= stop)
9061         {
9062           if (p >= pend)
9063             break;
9064           stop = pend;
9065           p = GAP_END_ADDR;
9066         }
9067
9068       c = STRING_CHAR_ADVANCE (p);
9069       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9070           && ! char_charset (translate_char (translation_table, c),
9071                              charset_list, NULL))
9072         {
9073           positions = Fcons (make_number (from), positions);
9074           n--;
9075           if (n == 0)
9076             break;
9077         }
9078
9079       from++;
9080       if (charset_map_loaded && NILP (string))
9081         {
9082           p = CHAR_POS_ADDR (from);
9083           pend = CHAR_POS_ADDR (to);
9084           if (from < GPT && to >= GPT)
9085             stop = GPT_ADDR;
9086           else
9087             stop = pend;
9088           charset_map_loaded = 0;
9089         }
9090     }
9091
9092   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9093 }
9094
9095
9096 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9097        Scheck_coding_systems_region, 3, 3, 0,
9098        doc: /* Check if the region is encodable by coding systems.
9099
9100 START and END are buffer positions specifying the region.
9101 CODING-SYSTEM-LIST is a list of coding systems to check.
9102
9103 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9104 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9105 whole region, POS0, POS1, ... are buffer positions where non-encodable
9106 characters are found.
9107
9108 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9109 value is nil.
9110
9111 START may be a string.  In that case, check if the string is
9112 encodable, and the value contains indices to the string instead of
9113 buffer positions.  END is ignored.
9114
9115 If the current buffer (or START if it is a string) is unibyte, the value
9116 is nil.  */)
9117   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9118 {
9119   Lisp_Object list;
9120   ptrdiff_t start_byte, end_byte;
9121   ptrdiff_t pos;
9122   const unsigned char *p, *pbeg, *pend;
9123   int c;
9124   Lisp_Object tail, elt, attrs;
9125
9126   if (STRINGP (start))
9127     {
9128       if (!STRING_MULTIBYTE (start)
9129           || SCHARS (start) == SBYTES (start))
9130         return Qnil;
9131       start_byte = 0;
9132       end_byte = SBYTES (start);
9133       pos = 0;
9134     }
9135   else
9136     {
9137       CHECK_NUMBER_COERCE_MARKER (start);
9138       CHECK_NUMBER_COERCE_MARKER (end);
9139       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9140         args_out_of_range (start, end);
9141       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9142         return Qnil;
9143       start_byte = CHAR_TO_BYTE (XINT (start));
9144       end_byte = CHAR_TO_BYTE (XINT (end));
9145       if (XINT (end) - XINT (start) == end_byte - start_byte)
9146         return Qnil;
9147
9148       if (XINT (start) < GPT && XINT (end) > GPT)
9149         {
9150           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9151             move_gap_both (XINT (start), start_byte);
9152           else
9153             move_gap_both (XINT (end), end_byte);
9154         }
9155       pos = XINT (start);
9156     }
9157
9158   list = Qnil;
9159   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9160     {
9161       elt = XCAR (tail);
9162       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9163       ASET (attrs, coding_attr_trans_tbl,
9164             get_translation_table (attrs, 1, NULL));
9165       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9166     }
9167
9168   if (STRINGP (start))
9169     p = pbeg = SDATA (start);
9170   else
9171     p = pbeg = BYTE_POS_ADDR (start_byte);
9172   pend = p + (end_byte - start_byte);
9173
9174   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9175   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9176
9177   while (p < pend)
9178     {
9179       if (ASCII_BYTE_P (*p))
9180         p++;
9181       else
9182         {
9183           c = STRING_CHAR_ADVANCE (p);
9184
9185           charset_map_loaded = 0;
9186           for (tail = list; CONSP (tail); tail = XCDR (tail))
9187             {
9188               elt = XCDR (XCAR (tail));
9189               if (! char_encodable_p (c, XCAR (elt)))
9190                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9191             }
9192           if (charset_map_loaded)
9193             {
9194               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9195
9196               if (STRINGP (start))
9197                 pbeg = SDATA (start);
9198               else
9199                 pbeg = BYTE_POS_ADDR (start_byte);
9200               p = pbeg + p_offset;
9201               pend = pbeg + pend_offset;
9202             }
9203         }
9204       pos++;
9205     }
9206
9207   tail = list;
9208   list = Qnil;
9209   for (; CONSP (tail); tail = XCDR (tail))
9210     {
9211       elt = XCAR (tail);
9212       if (CONSP (XCDR (XCDR (elt))))
9213         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9214                       list);
9215     }
9216
9217   return list;
9218 }
9219
9220
9221 static Lisp_Object
9222 code_convert_region (Lisp_Object start, Lisp_Object end,
9223                      Lisp_Object coding_system, Lisp_Object dst_object,
9224                      bool encodep, bool norecord)
9225 {
9226   struct coding_system coding;
9227   ptrdiff_t from, from_byte, to, to_byte;
9228   Lisp_Object src_object;
9229
9230   if (NILP (coding_system))
9231     coding_system = Qno_conversion;
9232   else
9233     CHECK_CODING_SYSTEM (coding_system);
9234   src_object = Fcurrent_buffer ();
9235   if (NILP (dst_object))
9236     dst_object = src_object;
9237   else if (! EQ (dst_object, Qt))
9238     CHECK_BUFFER (dst_object);
9239
9240   validate_region (&start, &end);
9241   from = XFASTINT (start);
9242   from_byte = CHAR_TO_BYTE (from);
9243   to = XFASTINT (end);
9244   to_byte = CHAR_TO_BYTE (to);
9245
9246   setup_coding_system (coding_system, &coding);
9247   coding.mode |= CODING_MODE_LAST_BLOCK;
9248
9249   if (encodep)
9250     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9251                           dst_object);
9252   else
9253     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9254                           dst_object);
9255   if (! norecord)
9256     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9257
9258   return (BUFFERP (dst_object)
9259           ? make_number (coding.produced_char)
9260           : coding.dst_object);
9261 }
9262
9263
9264 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9265        3, 4, "r\nzCoding system: ",
9266        doc: /* Decode the current region from the specified coding system.
9267 When called from a program, takes four arguments:
9268         START, END, CODING-SYSTEM, and DESTINATION.
9269 START and END are buffer positions.
9270
9271 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9272 If nil, the region between START and END is replaced by the decoded text.
9273 If buffer, the decoded text is inserted in that buffer after point (point
9274 does not move).
9275 In those cases, the length of the decoded text is returned.
9276 If DESTINATION is t, the decoded text is returned.
9277
9278 This function sets `last-coding-system-used' to the precise coding system
9279 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9280 not fully specified.)  */)
9281   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9282 {
9283   return code_convert_region (start, end, coding_system, destination, 0, 0);
9284 }
9285
9286 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9287        3, 4, "r\nzCoding system: ",
9288        doc: /* Encode the current region by specified coding system.
9289 When called from a program, takes four arguments:
9290         START, END, CODING-SYSTEM and DESTINATION.
9291 START and END are buffer positions.
9292
9293 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9294 If nil, the region between START and END is replace by the encoded text.
9295 If buffer, the encoded text is inserted in that buffer after point (point
9296 does not move).
9297 In those cases, the length of the encoded text is returned.
9298 If DESTINATION is t, the encoded text is returned.
9299
9300 This function sets `last-coding-system-used' to the precise coding system
9301 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9302 not fully specified.)  */)
9303   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9304 {
9305   return code_convert_region (start, end, coding_system, destination, 1, 0);
9306 }
9307
9308 Lisp_Object
9309 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9310                      Lisp_Object dst_object, bool encodep, bool nocopy,
9311                      bool norecord)
9312 {
9313   struct coding_system coding;
9314   ptrdiff_t chars, bytes;
9315
9316   CHECK_STRING (string);
9317   if (NILP (coding_system))
9318     {
9319       if (! norecord)
9320         Vlast_coding_system_used = Qno_conversion;
9321       if (NILP (dst_object))
9322         return (nocopy ? Fcopy_sequence (string) : string);
9323     }
9324
9325   if (NILP (coding_system))
9326     coding_system = Qno_conversion;
9327   else
9328     CHECK_CODING_SYSTEM (coding_system);
9329   if (NILP (dst_object))
9330     dst_object = Qt;
9331   else if (! EQ (dst_object, Qt))
9332     CHECK_BUFFER (dst_object);
9333
9334   setup_coding_system (coding_system, &coding);
9335   coding.mode |= CODING_MODE_LAST_BLOCK;
9336   chars = SCHARS (string);
9337   bytes = SBYTES (string);
9338   if (encodep)
9339     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9340   else
9341     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9342   if (! norecord)
9343     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9344
9345   return (BUFFERP (dst_object)
9346           ? make_number (coding.produced_char)
9347           : coding.dst_object);
9348 }
9349
9350
9351 /* Encode or decode STRING according to CODING_SYSTEM.
9352    Do not set Vlast_coding_system_used.
9353
9354    This function is called only from macros DECODE_FILE and
9355    ENCODE_FILE, thus we ignore character composition.  */
9356
9357 Lisp_Object
9358 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9359                               bool encodep)
9360 {
9361   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9362 }
9363
9364
9365 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9366        2, 4, 0,
9367        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9368
9369 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9370 if the decoding operation is trivial.
9371
9372 Optional fourth arg BUFFER non-nil means that the decoded text is
9373 inserted in that buffer after point (point does not move).  In this
9374 case, the return value is the length of the decoded text.
9375
9376 This function sets `last-coding-system-used' to the precise coding system
9377 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9378 not fully specified.)  */)
9379   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9380 {
9381   return code_convert_string (string, coding_system, buffer,
9382                               0, ! NILP (nocopy), 0);
9383 }
9384
9385 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9386        2, 4, 0,
9387        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9388
9389 Optional third arg NOCOPY non-nil means it is OK to return STRING
9390 itself if the encoding operation is trivial.
9391
9392 Optional fourth arg BUFFER non-nil means that the encoded text is
9393 inserted in that buffer after point (point does not move).  In this
9394 case, the return value is the length of the encoded text.
9395
9396 This function sets `last-coding-system-used' to the precise coding system
9397 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9398 not fully specified.)  */)
9399   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9400 {
9401   return code_convert_string (string, coding_system, buffer,
9402                               1, ! NILP (nocopy), 0);
9403 }
9404
9405 \f
9406 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9407        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9408 Return the corresponding character.  */)
9409   (Lisp_Object code)
9410 {
9411   Lisp_Object spec, attrs, val;
9412   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9413   EMACS_INT ch;
9414   int c;
9415
9416   CHECK_NATNUM (code);
9417   ch = XFASTINT (code);
9418   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9419   attrs = AREF (spec, 0);
9420
9421   if (ASCII_BYTE_P (ch)
9422       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9423     return code;
9424
9425   val = CODING_ATTR_CHARSET_LIST (attrs);
9426   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9427   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9428   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9429
9430   if (ch <= 0x7F)
9431     {
9432       c = ch;
9433       charset = charset_roman;
9434     }
9435   else if (ch >= 0xA0 && ch < 0xDF)
9436     {
9437       c = ch - 0x80;
9438       charset = charset_kana;
9439     }
9440   else
9441     {
9442       EMACS_INT c1 = ch >> 8;
9443       int c2 = ch & 0xFF;
9444
9445       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9446           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9447         error ("Invalid code: %"pI"d", ch);
9448       c = ch;
9449       SJIS_TO_JIS (c);
9450       charset = charset_kanji;
9451     }
9452   c = DECODE_CHAR (charset, c);
9453   if (c < 0)
9454     error ("Invalid code: %"pI"d", ch);
9455   return make_number (c);
9456 }
9457
9458
9459 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9460        doc: /* Encode a Japanese character CH to shift_jis encoding.
9461 Return the corresponding code in SJIS.  */)
9462   (Lisp_Object ch)
9463 {
9464   Lisp_Object spec, attrs, charset_list;
9465   int c;
9466   struct charset *charset;
9467   unsigned code;
9468
9469   CHECK_CHARACTER (ch);
9470   c = XFASTINT (ch);
9471   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9472   attrs = AREF (spec, 0);
9473
9474   if (ASCII_CHAR_P (c)
9475       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9476     return ch;
9477
9478   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9479   charset = char_charset (c, charset_list, &code);
9480   if (code == CHARSET_INVALID_CODE (charset))
9481     error ("Can't encode by shift_jis encoding: %c", c);
9482   JIS_TO_SJIS (code);
9483
9484   return make_number (code);
9485 }
9486
9487 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9488        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9489 Return the corresponding character.  */)
9490   (Lisp_Object code)
9491 {
9492   Lisp_Object spec, attrs, val;
9493   struct charset *charset_roman, *charset_big5, *charset;
9494   EMACS_INT ch;
9495   int c;
9496
9497   CHECK_NATNUM (code);
9498   ch = XFASTINT (code);
9499   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9500   attrs = AREF (spec, 0);
9501
9502   if (ASCII_BYTE_P (ch)
9503       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9504     return code;
9505
9506   val = CODING_ATTR_CHARSET_LIST (attrs);
9507   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9508   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9509
9510   if (ch <= 0x7F)
9511     {
9512       c = ch;
9513       charset = charset_roman;
9514     }
9515   else
9516     {
9517       EMACS_INT b1 = ch >> 8;
9518       int b2 = ch & 0x7F;
9519       if (b1 < 0xA1 || b1 > 0xFE
9520           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9521         error ("Invalid code: %"pI"d", ch);
9522       c = ch;
9523       charset = charset_big5;
9524     }
9525   c = DECODE_CHAR (charset, c);
9526   if (c < 0)
9527     error ("Invalid code: %"pI"d", ch);
9528   return make_number (c);
9529 }
9530
9531 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9532        doc: /* Encode the Big5 character CH to BIG5 coding system.
9533 Return the corresponding character code in Big5.  */)
9534   (Lisp_Object ch)
9535 {
9536   Lisp_Object spec, attrs, charset_list;
9537   struct charset *charset;
9538   int c;
9539   unsigned code;
9540
9541   CHECK_CHARACTER (ch);
9542   c = XFASTINT (ch);
9543   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9544   attrs = AREF (spec, 0);
9545   if (ASCII_CHAR_P (c)
9546       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9547     return ch;
9548
9549   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9550   charset = char_charset (c, charset_list, &code);
9551   if (code == CHARSET_INVALID_CODE (charset))
9552     error ("Can't encode by Big5 encoding: %c", c);
9553
9554   return make_number (code);
9555 }
9556
9557 \f
9558 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9559        Sset_terminal_coding_system_internal, 1, 2, 0,
9560        doc: /* Internal use only.  */)
9561   (Lisp_Object coding_system, Lisp_Object terminal)
9562 {
9563   struct terminal *term = get_terminal (terminal, 1);
9564   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9565   CHECK_SYMBOL (coding_system);
9566   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9567   /* We had better not send unsafe characters to terminal.  */
9568   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9569   /* Character composition should be disabled.  */
9570   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9571   terminal_coding->src_multibyte = 1;
9572   terminal_coding->dst_multibyte = 0;
9573   tset_charset_list
9574     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9575             ? coding_charset_list (terminal_coding)
9576             : Fcons (make_number (charset_ascii), Qnil)));
9577   return Qnil;
9578 }
9579
9580 DEFUN ("set-safe-terminal-coding-system-internal",
9581        Fset_safe_terminal_coding_system_internal,
9582        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9583        doc: /* Internal use only.  */)
9584   (Lisp_Object coding_system)
9585 {
9586   CHECK_SYMBOL (coding_system);
9587   setup_coding_system (Fcheck_coding_system (coding_system),
9588                        &safe_terminal_coding);
9589   /* Character composition should be disabled.  */
9590   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9591   safe_terminal_coding.src_multibyte = 1;
9592   safe_terminal_coding.dst_multibyte = 0;
9593   return Qnil;
9594 }
9595
9596 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9597        Sterminal_coding_system, 0, 1, 0,
9598        doc: /* Return coding system specified for terminal output on the given terminal.
9599 TERMINAL may be a terminal object, a frame, or nil for the selected
9600 frame's terminal device.  */)
9601   (Lisp_Object terminal)
9602 {
9603   struct coding_system *terminal_coding
9604     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9605   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9606
9607   /* For backward compatibility, return nil if it is `undecided'.  */
9608   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9609 }
9610
9611 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9612        Sset_keyboard_coding_system_internal, 1, 2, 0,
9613        doc: /* Internal use only.  */)
9614   (Lisp_Object coding_system, Lisp_Object terminal)
9615 {
9616   struct terminal *t = get_terminal (terminal, 1);
9617   CHECK_SYMBOL (coding_system);
9618   if (NILP (coding_system))
9619     coding_system = Qno_conversion;
9620   else
9621     Fcheck_coding_system (coding_system);
9622   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9623   /* Character composition should be disabled.  */
9624   TERMINAL_KEYBOARD_CODING (t)->common_flags
9625     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9626   return Qnil;
9627 }
9628
9629 DEFUN ("keyboard-coding-system",
9630        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9631        doc: /* Return coding system specified for decoding keyboard input.  */)
9632   (Lisp_Object terminal)
9633 {
9634   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9635                          (get_terminal (terminal, 1))->id);
9636 }
9637
9638 \f
9639 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9640        Sfind_operation_coding_system,  1, MANY, 0,
9641        doc: /* Choose a coding system for an operation based on the target name.
9642 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9643 DECODING-SYSTEM is the coding system to use for decoding
9644 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9645 for encoding (in case OPERATION does encoding).
9646
9647 The first argument OPERATION specifies an I/O primitive:
9648   For file I/O, `insert-file-contents' or `write-region'.
9649   For process I/O, `call-process', `call-process-region', or `start-process'.
9650   For network I/O, `open-network-stream'.
9651
9652 The remaining arguments should be the same arguments that were passed
9653 to the primitive.  Depending on which primitive, one of those arguments
9654 is selected as the TARGET.  For example, if OPERATION does file I/O,
9655 whichever argument specifies the file name is TARGET.
9656
9657 TARGET has a meaning which depends on OPERATION:
9658   For file I/O, TARGET is a file name (except for the special case below).
9659   For process I/O, TARGET is a process name.
9660   For network I/O, TARGET is a service name or a port number.
9661
9662 This function looks up what is specified for TARGET in
9663 `file-coding-system-alist', `process-coding-system-alist',
9664 or `network-coding-system-alist' depending on OPERATION.
9665 They may specify a coding system, a cons of coding systems,
9666 or a function symbol to call.
9667 In the last case, we call the function with one argument,
9668 which is a list of all the arguments given to this function.
9669 If the function can't decide a coding system, it can return
9670 `undecided' so that the normal code-detection is performed.
9671
9672 If OPERATION is `insert-file-contents', the argument corresponding to
9673 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9674 file name to look up, and BUFFER is a buffer that contains the file's
9675 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9676 function to call for FILENAME, that function should examine the
9677 contents of BUFFER instead of reading the file.
9678
9679 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9680   (ptrdiff_t nargs, Lisp_Object *args)
9681 {
9682   Lisp_Object operation, target_idx, target, val;
9683   register Lisp_Object chain;
9684
9685   if (nargs < 2)
9686     error ("Too few arguments");
9687   operation = args[0];
9688   if (!SYMBOLP (operation)
9689       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9690     error ("Invalid first argument");
9691   if (nargs <= 1 + XFASTINT (target_idx))
9692     error ("Too few arguments for operation `%s'",
9693            SDATA (SYMBOL_NAME (operation)));
9694   target = args[XFASTINT (target_idx) + 1];
9695   if (!(STRINGP (target)
9696         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9697             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9698         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9699     error ("Invalid argument %"pI"d of operation `%s'",
9700            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9701   if (CONSP (target))
9702     target = XCAR (target);
9703
9704   chain = ((EQ (operation, Qinsert_file_contents)
9705             || EQ (operation, Qwrite_region))
9706            ? Vfile_coding_system_alist
9707            : (EQ (operation, Qopen_network_stream)
9708               ? Vnetwork_coding_system_alist
9709               : Vprocess_coding_system_alist));
9710   if (NILP (chain))
9711     return Qnil;
9712
9713   for (; CONSP (chain); chain = XCDR (chain))
9714     {
9715       Lisp_Object elt;
9716
9717       elt = XCAR (chain);
9718       if (CONSP (elt)
9719           && ((STRINGP (target)
9720                && STRINGP (XCAR (elt))
9721                && fast_string_match (XCAR (elt), target) >= 0)
9722               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9723         {
9724           val = XCDR (elt);
9725           /* Here, if VAL is both a valid coding system and a valid
9726              function symbol, we return VAL as a coding system.  */
9727           if (CONSP (val))
9728             return val;
9729           if (! SYMBOLP (val))
9730             return Qnil;
9731           if (! NILP (Fcoding_system_p (val)))
9732             return Fcons (val, val);
9733           if (! NILP (Ffboundp (val)))
9734             {
9735               /* We use call1 rather than safe_call1
9736                  so as to get bug reports about functions called here
9737                  which don't handle the current interface.  */
9738               val = call1 (val, Flist (nargs, args));
9739               if (CONSP (val))
9740                 return val;
9741               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9742                 return Fcons (val, val);
9743             }
9744           return Qnil;
9745         }
9746     }
9747   return Qnil;
9748 }
9749
9750 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9751        Sset_coding_system_priority, 0, MANY, 0,
9752        doc: /* Assign higher priority to the coding systems given as arguments.
9753 If multiple coding systems belong to the same category,
9754 all but the first one are ignored.
9755
9756 usage: (set-coding-system-priority &rest coding-systems)  */)
9757   (ptrdiff_t nargs, Lisp_Object *args)
9758 {
9759   ptrdiff_t i, j;
9760   bool changed[coding_category_max];
9761   enum coding_category priorities[coding_category_max];
9762
9763   memset (changed, 0, sizeof changed);
9764
9765   for (i = j = 0; i < nargs; i++)
9766     {
9767       enum coding_category category;
9768       Lisp_Object spec, attrs;
9769
9770       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9771       attrs = AREF (spec, 0);
9772       category = XINT (CODING_ATTR_CATEGORY (attrs));
9773       if (changed[category])
9774         /* Ignore this coding system because a coding system of the
9775            same category already had a higher priority.  */
9776         continue;
9777       changed[category] = 1;
9778       priorities[j++] = category;
9779       if (coding_categories[category].id >= 0
9780           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9781         setup_coding_system (args[i], &coding_categories[category]);
9782       Fset (AREF (Vcoding_category_table, category), args[i]);
9783     }
9784
9785   /* Now we have decided top J priorities.  Reflect the order of the
9786      original priorities to the remaining priorities.  */
9787
9788   for (i = j, j = 0; i < coding_category_max; i++, j++)
9789     {
9790       while (j < coding_category_max
9791              && changed[coding_priorities[j]])
9792         j++;
9793       if (j == coding_category_max)
9794         emacs_abort ();
9795       priorities[i] = coding_priorities[j];
9796     }
9797
9798   memcpy (coding_priorities, priorities, sizeof priorities);
9799
9800   /* Update `coding-category-list'.  */
9801   Vcoding_category_list = Qnil;
9802   for (i = coding_category_max; i-- > 0; )
9803     Vcoding_category_list
9804       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9805                Vcoding_category_list);
9806
9807   return Qnil;
9808 }
9809
9810 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9811        Scoding_system_priority_list, 0, 1, 0,
9812        doc: /* Return a list of coding systems ordered by their priorities.
9813 The list contains a subset of coding systems; i.e. coding systems
9814 assigned to each coding category (see `coding-category-list').
9815
9816 HIGHESTP non-nil means just return the highest priority one.  */)
9817   (Lisp_Object highestp)
9818 {
9819   int i;
9820   Lisp_Object val;
9821
9822   for (i = 0, val = Qnil; i < coding_category_max; i++)
9823     {
9824       enum coding_category category = coding_priorities[i];
9825       int id = coding_categories[category].id;
9826       Lisp_Object attrs;
9827
9828       if (id < 0)
9829         continue;
9830       attrs = CODING_ID_ATTRS (id);
9831       if (! NILP (highestp))
9832         return CODING_ATTR_BASE_NAME (attrs);
9833       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9834     }
9835   return Fnreverse (val);
9836 }
9837
9838 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9839
9840 static Lisp_Object
9841 make_subsidiaries (Lisp_Object base)
9842 {
9843   Lisp_Object subsidiaries;
9844   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9845   char *buf = alloca (base_name_len + 6);
9846   int i;
9847
9848   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9849   subsidiaries = make_uninit_vector (3);
9850   for (i = 0; i < 3; i++)
9851     {
9852       strcpy (buf + base_name_len, suffixes[i]);
9853       ASET (subsidiaries, i, intern (buf));
9854     }
9855   return subsidiaries;
9856 }
9857
9858
9859 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9860        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9861        doc: /* For internal use only.
9862 usage: (define-coding-system-internal ...)  */)
9863   (ptrdiff_t nargs, Lisp_Object *args)
9864 {
9865   Lisp_Object name;
9866   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9867   Lisp_Object attrs;            /* Vector of attributes.  */
9868   Lisp_Object eol_type;
9869   Lisp_Object aliases;
9870   Lisp_Object coding_type, charset_list, safe_charsets;
9871   enum coding_category category;
9872   Lisp_Object tail, val;
9873   int max_charset_id = 0;
9874   int i;
9875
9876   if (nargs < coding_arg_max)
9877     goto short_args;
9878
9879   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9880
9881   name = args[coding_arg_name];
9882   CHECK_SYMBOL (name);
9883   ASET (attrs, coding_attr_base_name, name);
9884
9885   val = args[coding_arg_mnemonic];
9886   if (! STRINGP (val))
9887     CHECK_CHARACTER (val);
9888   ASET (attrs, coding_attr_mnemonic, val);
9889
9890   coding_type = args[coding_arg_coding_type];
9891   CHECK_SYMBOL (coding_type);
9892   ASET (attrs, coding_attr_type, coding_type);
9893
9894   charset_list = args[coding_arg_charset_list];
9895   if (SYMBOLP (charset_list))
9896     {
9897       if (EQ (charset_list, Qiso_2022))
9898         {
9899           if (! EQ (coding_type, Qiso_2022))
9900             error ("Invalid charset-list");
9901           charset_list = Viso_2022_charset_list;
9902         }
9903       else if (EQ (charset_list, Qemacs_mule))
9904         {
9905           if (! EQ (coding_type, Qemacs_mule))
9906             error ("Invalid charset-list");
9907           charset_list = Vemacs_mule_charset_list;
9908         }
9909       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9910         {
9911           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9912             error ("Invalid charset-list");
9913           if (max_charset_id < XFASTINT (XCAR (tail)))
9914             max_charset_id = XFASTINT (XCAR (tail));
9915         }
9916     }
9917   else
9918     {
9919       charset_list = Fcopy_sequence (charset_list);
9920       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9921         {
9922           struct charset *charset;
9923
9924           val = XCAR (tail);
9925           CHECK_CHARSET_GET_CHARSET (val, charset);
9926           if (EQ (coding_type, Qiso_2022)
9927               ? CHARSET_ISO_FINAL (charset) < 0
9928               : EQ (coding_type, Qemacs_mule)
9929               ? CHARSET_EMACS_MULE_ID (charset) < 0
9930               : 0)
9931             error ("Can't handle charset `%s'",
9932                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9933
9934           XSETCAR (tail, make_number (charset->id));
9935           if (max_charset_id < charset->id)
9936             max_charset_id = charset->id;
9937         }
9938     }
9939   ASET (attrs, coding_attr_charset_list, charset_list);
9940
9941   safe_charsets = make_uninit_string (max_charset_id + 1);
9942   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9943   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9944     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9945   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9946
9947   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9948
9949   val = args[coding_arg_decode_translation_table];
9950   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9951     CHECK_SYMBOL (val);
9952   ASET (attrs, coding_attr_decode_tbl, val);
9953
9954   val = args[coding_arg_encode_translation_table];
9955   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9956     CHECK_SYMBOL (val);
9957   ASET (attrs, coding_attr_encode_tbl, val);
9958
9959   val = args[coding_arg_post_read_conversion];
9960   CHECK_SYMBOL (val);
9961   ASET (attrs, coding_attr_post_read, val);
9962
9963   val = args[coding_arg_pre_write_conversion];
9964   CHECK_SYMBOL (val);
9965   ASET (attrs, coding_attr_pre_write, val);
9966
9967   val = args[coding_arg_default_char];
9968   if (NILP (val))
9969     ASET (attrs, coding_attr_default_char, make_number (' '));
9970   else
9971     {
9972       CHECK_CHARACTER (val);
9973       ASET (attrs, coding_attr_default_char, val);
9974     }
9975
9976   val = args[coding_arg_for_unibyte];
9977   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9978
9979   val = args[coding_arg_plist];
9980   CHECK_LIST (val);
9981   ASET (attrs, coding_attr_plist, val);
9982
9983   if (EQ (coding_type, Qcharset))
9984     {
9985       /* Generate a lisp vector of 256 elements.  Each element is nil,
9986          integer, or a list of charset IDs.
9987
9988          If Nth element is nil, the byte code N is invalid in this
9989          coding system.
9990
9991          If Nth element is a number NUM, N is the first byte of a
9992          charset whose ID is NUM.
9993
9994          If Nth element is a list of charset IDs, N is the first byte
9995          of one of them.  The list is sorted by dimensions of the
9996          charsets.  A charset of smaller dimension comes first. */
9997       val = Fmake_vector (make_number (256), Qnil);
9998
9999       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10000         {
10001           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10002           int dim = CHARSET_DIMENSION (charset);
10003           int idx = (dim - 1) * 4;
10004
10005           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10006             ASET (attrs, coding_attr_ascii_compat, Qt);
10007
10008           for (i = charset->code_space[idx];
10009                i <= charset->code_space[idx + 1]; i++)
10010             {
10011               Lisp_Object tmp, tmp2;
10012               int dim2;
10013
10014               tmp = AREF (val, i);
10015               if (NILP (tmp))
10016                 tmp = XCAR (tail);
10017               else if (NUMBERP (tmp))
10018                 {
10019                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10020                   if (dim < dim2)
10021                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
10022                   else
10023                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
10024                 }
10025               else
10026                 {
10027                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10028                     {
10029                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10030                       if (dim < dim2)
10031                         break;
10032                     }
10033                   if (NILP (tmp2))
10034                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
10035                   else
10036                     {
10037                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10038                       XSETCAR (tmp2, XCAR (tail));
10039                     }
10040                 }
10041               ASET (val, i, tmp);
10042             }
10043         }
10044       ASET (attrs, coding_attr_charset_valids, val);
10045       category = coding_category_charset;
10046     }
10047   else if (EQ (coding_type, Qccl))
10048     {
10049       Lisp_Object valids;
10050
10051       if (nargs < coding_arg_ccl_max)
10052         goto short_args;
10053
10054       val = args[coding_arg_ccl_decoder];
10055       CHECK_CCL_PROGRAM (val);
10056       if (VECTORP (val))
10057         val = Fcopy_sequence (val);
10058       ASET (attrs, coding_attr_ccl_decoder, val);
10059
10060       val = args[coding_arg_ccl_encoder];
10061       CHECK_CCL_PROGRAM (val);
10062       if (VECTORP (val))
10063         val = Fcopy_sequence (val);
10064       ASET (attrs, coding_attr_ccl_encoder, val);
10065
10066       val = args[coding_arg_ccl_valids];
10067       valids = Fmake_string (make_number (256), make_number (0));
10068       for (tail = val; CONSP (tail); tail = XCDR (tail))
10069         {
10070           int from, to;
10071
10072           val = XCAR (tail);
10073           if (INTEGERP (val))
10074             {
10075               if (! (0 <= XINT (val) && XINT (val) <= 255))
10076                 args_out_of_range_3 (val, make_number (0), make_number (255));
10077               from = to = XINT (val);
10078             }
10079           else
10080             {
10081               CHECK_CONS (val);
10082               CHECK_NATNUM_CAR (val);
10083               CHECK_NUMBER_CDR (val);
10084               if (XINT (XCAR (val)) > 255)
10085                 args_out_of_range_3 (XCAR (val),
10086                                      make_number (0), make_number (255));
10087               from = XINT (XCAR (val));
10088               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10089                 args_out_of_range_3 (XCDR (val),
10090                                      XCAR (val), make_number (255));
10091               to = XINT (XCDR (val));
10092             }
10093           for (i = from; i <= to; i++)
10094             SSET (valids, i, 1);
10095         }
10096       ASET (attrs, coding_attr_ccl_valids, valids);
10097
10098       category = coding_category_ccl;
10099     }
10100   else if (EQ (coding_type, Qutf_16))
10101     {
10102       Lisp_Object bom, endian;
10103
10104       ASET (attrs, coding_attr_ascii_compat, Qnil);
10105
10106       if (nargs < coding_arg_utf16_max)
10107         goto short_args;
10108
10109       bom = args[coding_arg_utf16_bom];
10110       if (! NILP (bom) && ! EQ (bom, Qt))
10111         {
10112           CHECK_CONS (bom);
10113           val = XCAR (bom);
10114           CHECK_CODING_SYSTEM (val);
10115           val = XCDR (bom);
10116           CHECK_CODING_SYSTEM (val);
10117         }
10118       ASET (attrs, coding_attr_utf_bom, bom);
10119
10120       endian = args[coding_arg_utf16_endian];
10121       CHECK_SYMBOL (endian);
10122       if (NILP (endian))
10123         endian = Qbig;
10124       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10125         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10126       ASET (attrs, coding_attr_utf_16_endian, endian);
10127
10128       category = (CONSP (bom)
10129                   ? coding_category_utf_16_auto
10130                   : NILP (bom)
10131                   ? (EQ (endian, Qbig)
10132                      ? coding_category_utf_16_be_nosig
10133                      : coding_category_utf_16_le_nosig)
10134                   : (EQ (endian, Qbig)
10135                      ? coding_category_utf_16_be
10136                      : coding_category_utf_16_le));
10137     }
10138   else if (EQ (coding_type, Qiso_2022))
10139     {
10140       Lisp_Object initial, reg_usage, request, flags;
10141
10142       if (nargs < coding_arg_iso2022_max)
10143         goto short_args;
10144
10145       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10146       CHECK_VECTOR (initial);
10147       for (i = 0; i < 4; i++)
10148         {
10149           val = AREF (initial, i);
10150           if (! NILP (val))
10151             {
10152               struct charset *charset;
10153
10154               CHECK_CHARSET_GET_CHARSET (val, charset);
10155               ASET (initial, i, make_number (CHARSET_ID (charset)));
10156               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10157                 ASET (attrs, coding_attr_ascii_compat, Qt);
10158             }
10159           else
10160             ASET (initial, i, make_number (-1));
10161         }
10162
10163       reg_usage = args[coding_arg_iso2022_reg_usage];
10164       CHECK_CONS (reg_usage);
10165       CHECK_NUMBER_CAR (reg_usage);
10166       CHECK_NUMBER_CDR (reg_usage);
10167
10168       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10169       for (tail = request; CONSP (tail); tail = XCDR (tail))
10170         {
10171           int id;
10172           Lisp_Object tmp1;
10173
10174           val = XCAR (tail);
10175           CHECK_CONS (val);
10176           tmp1 = XCAR (val);
10177           CHECK_CHARSET_GET_ID (tmp1, id);
10178           CHECK_NATNUM_CDR (val);
10179           if (XINT (XCDR (val)) >= 4)
10180             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10181           XSETCAR (val, make_number (id));
10182         }
10183
10184       flags = args[coding_arg_iso2022_flags];
10185       CHECK_NATNUM (flags);
10186       i = XINT (flags) & INT_MAX;
10187       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10188         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10189       flags = make_number (i);
10190
10191       ASET (attrs, coding_attr_iso_initial, initial);
10192       ASET (attrs, coding_attr_iso_usage, reg_usage);
10193       ASET (attrs, coding_attr_iso_request, request);
10194       ASET (attrs, coding_attr_iso_flags, flags);
10195       setup_iso_safe_charsets (attrs);
10196
10197       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10198         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10199                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10200                     ? coding_category_iso_7_else
10201                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10202                     ? coding_category_iso_7
10203                     : coding_category_iso_7_tight);
10204       else
10205         {
10206           int id = XINT (AREF (initial, 1));
10207
10208           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10209                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10210                        || id < 0)
10211                       ? coding_category_iso_8_else
10212                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10213                       ? coding_category_iso_8_1
10214                       : coding_category_iso_8_2);
10215         }
10216       if (category != coding_category_iso_8_1
10217           && category != coding_category_iso_8_2)
10218         ASET (attrs, coding_attr_ascii_compat, Qnil);
10219     }
10220   else if (EQ (coding_type, Qemacs_mule))
10221     {
10222       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10223         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10224       ASET (attrs, coding_attr_ascii_compat, Qt);
10225       category = coding_category_emacs_mule;
10226     }
10227   else if (EQ (coding_type, Qshift_jis))
10228     {
10229
10230       struct charset *charset;
10231
10232       if (XINT (Flength (charset_list)) != 3
10233           && XINT (Flength (charset_list)) != 4)
10234         error ("There should be three or four charsets");
10235
10236       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10237       if (CHARSET_DIMENSION (charset) != 1)
10238         error ("Dimension of charset %s is not one",
10239                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10240       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10241         ASET (attrs, coding_attr_ascii_compat, Qt);
10242
10243       charset_list = XCDR (charset_list);
10244       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10245       if (CHARSET_DIMENSION (charset) != 1)
10246         error ("Dimension of charset %s is not one",
10247                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10248
10249       charset_list = XCDR (charset_list);
10250       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10251       if (CHARSET_DIMENSION (charset) != 2)
10252         error ("Dimension of charset %s is not two",
10253                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10254
10255       charset_list = XCDR (charset_list);
10256       if (! NILP (charset_list))
10257         {
10258           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10259           if (CHARSET_DIMENSION (charset) != 2)
10260             error ("Dimension of charset %s is not two",
10261                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10262         }
10263
10264       category = coding_category_sjis;
10265       Vsjis_coding_system = name;
10266     }
10267   else if (EQ (coding_type, Qbig5))
10268     {
10269       struct charset *charset;
10270
10271       if (XINT (Flength (charset_list)) != 2)
10272         error ("There should be just two charsets");
10273
10274       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10275       if (CHARSET_DIMENSION (charset) != 1)
10276         error ("Dimension of charset %s is not one",
10277                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10278       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10279         ASET (attrs, coding_attr_ascii_compat, Qt);
10280
10281       charset_list = XCDR (charset_list);
10282       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10283       if (CHARSET_DIMENSION (charset) != 2)
10284         error ("Dimension of charset %s is not two",
10285                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10286
10287       category = coding_category_big5;
10288       Vbig5_coding_system = name;
10289     }
10290   else if (EQ (coding_type, Qraw_text))
10291     {
10292       category = coding_category_raw_text;
10293       ASET (attrs, coding_attr_ascii_compat, Qt);
10294     }
10295   else if (EQ (coding_type, Qutf_8))
10296     {
10297       Lisp_Object bom;
10298
10299       if (nargs < coding_arg_utf8_max)
10300         goto short_args;
10301
10302       bom = args[coding_arg_utf8_bom];
10303       if (! NILP (bom) && ! EQ (bom, Qt))
10304         {
10305           CHECK_CONS (bom);
10306           val = XCAR (bom);
10307           CHECK_CODING_SYSTEM (val);
10308           val = XCDR (bom);
10309           CHECK_CODING_SYSTEM (val);
10310         }
10311       ASET (attrs, coding_attr_utf_bom, bom);
10312       if (NILP (bom))
10313         ASET (attrs, coding_attr_ascii_compat, Qt);
10314
10315       category = (CONSP (bom) ? coding_category_utf_8_auto
10316                   : NILP (bom) ? coding_category_utf_8_nosig
10317                   : coding_category_utf_8_sig);
10318     }
10319   else if (EQ (coding_type, Qundecided))
10320     category = coding_category_undecided;
10321   else
10322     error ("Invalid coding system type: %s",
10323            SDATA (SYMBOL_NAME (coding_type)));
10324
10325   ASET (attrs, coding_attr_category, make_number (category));
10326   ASET (attrs, coding_attr_plist,
10327         Fcons (QCcategory,
10328                Fcons (AREF (Vcoding_category_table, category),
10329                       CODING_ATTR_PLIST (attrs))));
10330   ASET (attrs, coding_attr_plist,
10331         Fcons (QCascii_compatible_p,
10332                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10333                       CODING_ATTR_PLIST (attrs))));
10334
10335   eol_type = args[coding_arg_eol_type];
10336   if (! NILP (eol_type)
10337       && ! EQ (eol_type, Qunix)
10338       && ! EQ (eol_type, Qdos)
10339       && ! EQ (eol_type, Qmac))
10340     error ("Invalid eol-type");
10341
10342   aliases = Fcons (name, Qnil);
10343
10344   if (NILP (eol_type))
10345     {
10346       eol_type = make_subsidiaries (name);
10347       for (i = 0; i < 3; i++)
10348         {
10349           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10350
10351           this_name = AREF (eol_type, i);
10352           this_aliases = Fcons (this_name, Qnil);
10353           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10354           this_spec = make_uninit_vector (3);
10355           ASET (this_spec, 0, attrs);
10356           ASET (this_spec, 1, this_aliases);
10357           ASET (this_spec, 2, this_eol_type);
10358           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10359           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10360           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10361           if (NILP (val))
10362             Vcoding_system_alist
10363               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10364                        Vcoding_system_alist);
10365         }
10366     }
10367
10368   spec_vec = make_uninit_vector (3);
10369   ASET (spec_vec, 0, attrs);
10370   ASET (spec_vec, 1, aliases);
10371   ASET (spec_vec, 2, eol_type);
10372
10373   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10374   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10375   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10376   if (NILP (val))
10377     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10378                                   Vcoding_system_alist);
10379
10380   {
10381     int id = coding_categories[category].id;
10382
10383     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10384       setup_coding_system (name, &coding_categories[category]);
10385   }
10386
10387   return Qnil;
10388
10389  short_args:
10390   return Fsignal (Qwrong_number_of_arguments,
10391                   Fcons (intern ("define-coding-system-internal"),
10392                          make_number (nargs)));
10393 }
10394
10395
10396 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10397        3, 3, 0,
10398        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10399   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10400 {
10401   Lisp_Object spec, attrs;
10402
10403   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10404   attrs = AREF (spec, 0);
10405   if (EQ (prop, QCmnemonic))
10406     {
10407       if (! STRINGP (val))
10408         CHECK_CHARACTER (val);
10409       ASET (attrs, coding_attr_mnemonic, val);
10410     }
10411   else if (EQ (prop, QCdefault_char))
10412     {
10413       if (NILP (val))
10414         val = make_number (' ');
10415       else
10416         CHECK_CHARACTER (val);
10417       ASET (attrs, coding_attr_default_char, val);
10418     }
10419   else if (EQ (prop, QCdecode_translation_table))
10420     {
10421       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10422         CHECK_SYMBOL (val);
10423       ASET (attrs, coding_attr_decode_tbl, val);
10424     }
10425   else if (EQ (prop, QCencode_translation_table))
10426     {
10427       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10428         CHECK_SYMBOL (val);
10429       ASET (attrs, coding_attr_encode_tbl, val);
10430     }
10431   else if (EQ (prop, QCpost_read_conversion))
10432     {
10433       CHECK_SYMBOL (val);
10434       ASET (attrs, coding_attr_post_read, val);
10435     }
10436   else if (EQ (prop, QCpre_write_conversion))
10437     {
10438       CHECK_SYMBOL (val);
10439       ASET (attrs, coding_attr_pre_write, val);
10440     }
10441   else if (EQ (prop, QCascii_compatible_p))
10442     {
10443       ASET (attrs, coding_attr_ascii_compat, val);
10444     }
10445
10446   ASET (attrs, coding_attr_plist,
10447         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10448   return val;
10449 }
10450
10451
10452 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10453        Sdefine_coding_system_alias, 2, 2, 0,
10454        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10455   (Lisp_Object alias, Lisp_Object coding_system)
10456 {
10457   Lisp_Object spec, aliases, eol_type, val;
10458
10459   CHECK_SYMBOL (alias);
10460   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10461   aliases = AREF (spec, 1);
10462   /* ALIASES should be a list of length more than zero, and the first
10463      element is a base coding system.  Append ALIAS at the tail of the
10464      list.  */
10465   while (!NILP (XCDR (aliases)))
10466     aliases = XCDR (aliases);
10467   XSETCDR (aliases, Fcons (alias, Qnil));
10468
10469   eol_type = AREF (spec, 2);
10470   if (VECTORP (eol_type))
10471     {
10472       Lisp_Object subsidiaries;
10473       int i;
10474
10475       subsidiaries = make_subsidiaries (alias);
10476       for (i = 0; i < 3; i++)
10477         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10478                                      AREF (eol_type, i));
10479     }
10480
10481   Fputhash (alias, spec, Vcoding_system_hash_table);
10482   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10483   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10484   if (NILP (val))
10485     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10486                                   Vcoding_system_alist);
10487
10488   return Qnil;
10489 }
10490
10491 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10492        1, 1, 0,
10493        doc: /* Return the base of CODING-SYSTEM.
10494 Any alias or subsidiary coding system is not a base coding system.  */)
10495   (Lisp_Object coding_system)
10496 {
10497   Lisp_Object spec, attrs;
10498
10499   if (NILP (coding_system))
10500     return (Qno_conversion);
10501   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10502   attrs = AREF (spec, 0);
10503   return CODING_ATTR_BASE_NAME (attrs);
10504 }
10505
10506 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10507        1, 1, 0,
10508        doc: "Return the property list of CODING-SYSTEM.")
10509   (Lisp_Object coding_system)
10510 {
10511   Lisp_Object spec, attrs;
10512
10513   if (NILP (coding_system))
10514     coding_system = Qno_conversion;
10515   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10516   attrs = AREF (spec, 0);
10517   return CODING_ATTR_PLIST (attrs);
10518 }
10519
10520
10521 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10522        1, 1, 0,
10523        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10524   (Lisp_Object coding_system)
10525 {
10526   Lisp_Object spec;
10527
10528   if (NILP (coding_system))
10529     coding_system = Qno_conversion;
10530   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10531   return AREF (spec, 1);
10532 }
10533
10534 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10535        Scoding_system_eol_type, 1, 1, 0,
10536        doc: /* Return eol-type of CODING-SYSTEM.
10537 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10538
10539 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10540 and CR respectively.
10541
10542 A vector value indicates that a format of end-of-line should be
10543 detected automatically.  Nth element of the vector is the subsidiary
10544 coding system whose eol-type is N.  */)
10545   (Lisp_Object coding_system)
10546 {
10547   Lisp_Object spec, eol_type;
10548   int n;
10549
10550   if (NILP (coding_system))
10551     coding_system = Qno_conversion;
10552   if (! CODING_SYSTEM_P (coding_system))
10553     return Qnil;
10554   spec = CODING_SYSTEM_SPEC (coding_system);
10555   eol_type = AREF (spec, 2);
10556   if (VECTORP (eol_type))
10557     return Fcopy_sequence (eol_type);
10558   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10559   return make_number (n);
10560 }
10561
10562 #endif /* emacs */
10563
10564 \f
10565 /*** 9. Post-amble ***/
10566
10567 void
10568 init_coding_once (void)
10569 {
10570   int i;
10571
10572   for (i = 0; i < coding_category_max; i++)
10573     {
10574       coding_categories[i].id = -1;
10575       coding_priorities[i] = i;
10576     }
10577
10578   /* ISO2022 specific initialize routine.  */
10579   for (i = 0; i < 0x20; i++)
10580     iso_code_class[i] = ISO_control_0;
10581   for (i = 0x21; i < 0x7F; i++)
10582     iso_code_class[i] = ISO_graphic_plane_0;
10583   for (i = 0x80; i < 0xA0; i++)
10584     iso_code_class[i] = ISO_control_1;
10585   for (i = 0xA1; i < 0xFF; i++)
10586     iso_code_class[i] = ISO_graphic_plane_1;
10587   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10588   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10589   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10590   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10591   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10592   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10593   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10594   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10595   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10596
10597   for (i = 0; i < 256; i++)
10598     {
10599       emacs_mule_bytes[i] = 1;
10600     }
10601   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10602   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10603   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10604   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10605 }
10606
10607 #ifdef emacs
10608
10609 void
10610 syms_of_coding (void)
10611 {
10612   staticpro (&Vcoding_system_hash_table);
10613   {
10614     Lisp_Object args[2];
10615     args[0] = QCtest;
10616     args[1] = Qeq;
10617     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10618   }
10619
10620   staticpro (&Vsjis_coding_system);
10621   Vsjis_coding_system = Qnil;
10622
10623   staticpro (&Vbig5_coding_system);
10624   Vbig5_coding_system = Qnil;
10625
10626   staticpro (&Vcode_conversion_reused_workbuf);
10627   Vcode_conversion_reused_workbuf = Qnil;
10628
10629   staticpro (&Vcode_conversion_workbuf_name);
10630   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10631
10632   reused_workbuf_in_use = 0;
10633
10634   DEFSYM (Qcharset, "charset");
10635   DEFSYM (Qtarget_idx, "target-idx");
10636   DEFSYM (Qcoding_system_history, "coding-system-history");
10637   Fset (Qcoding_system_history, Qnil);
10638
10639   /* Target FILENAME is the first argument.  */
10640   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10641   /* Target FILENAME is the third argument.  */
10642   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10643
10644   DEFSYM (Qcall_process, "call-process");
10645   /* Target PROGRAM is the first argument.  */
10646   Fput (Qcall_process, Qtarget_idx, make_number (0));
10647
10648   DEFSYM (Qcall_process_region, "call-process-region");
10649   /* Target PROGRAM is the third argument.  */
10650   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10651
10652   DEFSYM (Qstart_process, "start-process");
10653   /* Target PROGRAM is the third argument.  */
10654   Fput (Qstart_process, Qtarget_idx, make_number (2));
10655
10656   DEFSYM (Qopen_network_stream, "open-network-stream");
10657   /* Target SERVICE is the fourth argument.  */
10658   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10659
10660   DEFSYM (Qcoding_system, "coding-system");
10661   DEFSYM (Qcoding_aliases, "coding-aliases");
10662
10663   DEFSYM (Qeol_type, "eol-type");
10664   DEFSYM (Qunix, "unix");
10665   DEFSYM (Qdos, "dos");
10666   DEFSYM (Qmac, "mac");
10667
10668   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10669   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10670   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10671   DEFSYM (Qdefault_char, "default-char");
10672   DEFSYM (Qundecided, "undecided");
10673   DEFSYM (Qno_conversion, "no-conversion");
10674   DEFSYM (Qraw_text, "raw-text");
10675
10676   DEFSYM (Qiso_2022, "iso-2022");
10677
10678   DEFSYM (Qutf_8, "utf-8");
10679   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10680
10681 #if defined (WINDOWSNT) || defined (CYGWIN)
10682   /* No, not utf-16-le: that one has a BOM.  */
10683   DEFSYM (Qutf_16le, "utf-16le");
10684 #endif
10685
10686   DEFSYM (Qutf_16, "utf-16");
10687   DEFSYM (Qbig, "big");
10688   DEFSYM (Qlittle, "little");
10689
10690   DEFSYM (Qshift_jis, "shift-jis");
10691   DEFSYM (Qbig5, "big5");
10692
10693   DEFSYM (Qcoding_system_p, "coding-system-p");
10694
10695   DEFSYM (Qcoding_system_error, "coding-system-error");
10696   Fput (Qcoding_system_error, Qerror_conditions,
10697         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10698   Fput (Qcoding_system_error, Qerror_message,
10699         build_pure_c_string ("Invalid coding system"));
10700
10701   /* Intern this now in case it isn't already done.
10702      Setting this variable twice is harmless.
10703      But don't staticpro it here--that is done in alloc.c.  */
10704   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10705
10706   DEFSYM (Qtranslation_table, "translation-table");
10707   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10708   DEFSYM (Qtranslation_table_id, "translation-table-id");
10709   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10710   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10711
10712   DEFSYM (Qvalid_codes, "valid-codes");
10713
10714   DEFSYM (Qemacs_mule, "emacs-mule");
10715
10716   DEFSYM (QCcategory, ":category");
10717   DEFSYM (QCmnemonic, ":mnemonic");
10718   DEFSYM (QCdefault_char, ":default-char");
10719   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10720   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10721   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10722   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10723   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10724
10725   Vcoding_category_table
10726     = Fmake_vector (make_number (coding_category_max), Qnil);
10727   staticpro (&Vcoding_category_table);
10728   /* Followings are target of code detection.  */
10729   ASET (Vcoding_category_table, coding_category_iso_7,
10730         intern_c_string ("coding-category-iso-7"));
10731   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10732         intern_c_string ("coding-category-iso-7-tight"));
10733   ASET (Vcoding_category_table, coding_category_iso_8_1,
10734         intern_c_string ("coding-category-iso-8-1"));
10735   ASET (Vcoding_category_table, coding_category_iso_8_2,
10736         intern_c_string ("coding-category-iso-8-2"));
10737   ASET (Vcoding_category_table, coding_category_iso_7_else,
10738         intern_c_string ("coding-category-iso-7-else"));
10739   ASET (Vcoding_category_table, coding_category_iso_8_else,
10740         intern_c_string ("coding-category-iso-8-else"));
10741   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10742         intern_c_string ("coding-category-utf-8-auto"));
10743   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10744         intern_c_string ("coding-category-utf-8"));
10745   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10746         intern_c_string ("coding-category-utf-8-sig"));
10747   ASET (Vcoding_category_table, coding_category_utf_16_be,
10748         intern_c_string ("coding-category-utf-16-be"));
10749   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10750         intern_c_string ("coding-category-utf-16-auto"));
10751   ASET (Vcoding_category_table, coding_category_utf_16_le,
10752         intern_c_string ("coding-category-utf-16-le"));
10753   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10754         intern_c_string ("coding-category-utf-16-be-nosig"));
10755   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10756         intern_c_string ("coding-category-utf-16-le-nosig"));
10757   ASET (Vcoding_category_table, coding_category_charset,
10758         intern_c_string ("coding-category-charset"));
10759   ASET (Vcoding_category_table, coding_category_sjis,
10760         intern_c_string ("coding-category-sjis"));
10761   ASET (Vcoding_category_table, coding_category_big5,
10762         intern_c_string ("coding-category-big5"));
10763   ASET (Vcoding_category_table, coding_category_ccl,
10764         intern_c_string ("coding-category-ccl"));
10765   ASET (Vcoding_category_table, coding_category_emacs_mule,
10766         intern_c_string ("coding-category-emacs-mule"));
10767   /* Followings are NOT target of code detection.  */
10768   ASET (Vcoding_category_table, coding_category_raw_text,
10769         intern_c_string ("coding-category-raw-text"));
10770   ASET (Vcoding_category_table, coding_category_undecided,
10771         intern_c_string ("coding-category-undecided"));
10772
10773   DEFSYM (Qinsufficient_source, "insufficient-source");
10774   DEFSYM (Qinvalid_source, "invalid-source");
10775   DEFSYM (Qinterrupted, "interrupted");
10776   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10777
10778   defsubr (&Scoding_system_p);
10779   defsubr (&Sread_coding_system);
10780   defsubr (&Sread_non_nil_coding_system);
10781   defsubr (&Scheck_coding_system);
10782   defsubr (&Sdetect_coding_region);
10783   defsubr (&Sdetect_coding_string);
10784   defsubr (&Sfind_coding_systems_region_internal);
10785   defsubr (&Sunencodable_char_position);
10786   defsubr (&Scheck_coding_systems_region);
10787   defsubr (&Sdecode_coding_region);
10788   defsubr (&Sencode_coding_region);
10789   defsubr (&Sdecode_coding_string);
10790   defsubr (&Sencode_coding_string);
10791   defsubr (&Sdecode_sjis_char);
10792   defsubr (&Sencode_sjis_char);
10793   defsubr (&Sdecode_big5_char);
10794   defsubr (&Sencode_big5_char);
10795   defsubr (&Sset_terminal_coding_system_internal);
10796   defsubr (&Sset_safe_terminal_coding_system_internal);
10797   defsubr (&Sterminal_coding_system);
10798   defsubr (&Sset_keyboard_coding_system_internal);
10799   defsubr (&Skeyboard_coding_system);
10800   defsubr (&Sfind_operation_coding_system);
10801   defsubr (&Sset_coding_system_priority);
10802   defsubr (&Sdefine_coding_system_internal);
10803   defsubr (&Sdefine_coding_system_alias);
10804   defsubr (&Scoding_system_put);
10805   defsubr (&Scoding_system_base);
10806   defsubr (&Scoding_system_plist);
10807   defsubr (&Scoding_system_aliases);
10808   defsubr (&Scoding_system_eol_type);
10809   defsubr (&Scoding_system_priority_list);
10810
10811   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10812                doc: /* List of coding systems.
10813
10814 Do not alter the value of this variable manually.  This variable should be
10815 updated by the functions `define-coding-system' and
10816 `define-coding-system-alias'.  */);
10817   Vcoding_system_list = Qnil;
10818
10819   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10820                doc: /* Alist of coding system names.
10821 Each element is one element list of coding system name.
10822 This variable is given to `completing-read' as COLLECTION argument.
10823
10824 Do not alter the value of this variable manually.  This variable should be
10825 updated by the functions `make-coding-system' and
10826 `define-coding-system-alias'.  */);
10827   Vcoding_system_alist = Qnil;
10828
10829   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10830                doc: /* List of coding-categories (symbols) ordered by priority.
10831
10832 On detecting a coding system, Emacs tries code detection algorithms
10833 associated with each coding-category one by one in this order.  When
10834 one algorithm agrees with a byte sequence of source text, the coding
10835 system bound to the corresponding coding-category is selected.
10836
10837 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10838   {
10839     int i;
10840
10841     Vcoding_category_list = Qnil;
10842     for (i = coding_category_max - 1; i >= 0; i--)
10843       Vcoding_category_list
10844         = Fcons (AREF (Vcoding_category_table, i),
10845                  Vcoding_category_list);
10846   }
10847
10848   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10849                doc: /* Specify the coding system for read operations.
10850 It is useful to bind this variable with `let', but do not set it globally.
10851 If the value is a coding system, it is used for decoding on read operation.
10852 If not, an appropriate element is used from one of the coding system alists.
10853 There are three such tables: `file-coding-system-alist',
10854 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10855   Vcoding_system_for_read = Qnil;
10856
10857   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10858                doc: /* Specify the coding system for write operations.
10859 Programs bind this variable with `let', but you should not set it globally.
10860 If the value is a coding system, it is used for encoding of output,
10861 when writing it to a file and when sending it to a file or subprocess.
10862
10863 If this does not specify a coding system, an appropriate element
10864 is used from one of the coding system alists.
10865 There are three such tables: `file-coding-system-alist',
10866 `process-coding-system-alist', and `network-coding-system-alist'.
10867 For output to files, if the above procedure does not specify a coding system,
10868 the value of `buffer-file-coding-system' is used.  */);
10869   Vcoding_system_for_write = Qnil;
10870
10871   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10872                doc: /*
10873 Coding system used in the latest file or process I/O.  */);
10874   Vlast_coding_system_used = Qnil;
10875
10876   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10877                doc: /*
10878 Error status of the last code conversion.
10879
10880 When an error was detected in the last code conversion, this variable
10881 is set to one of the following symbols.
10882   `insufficient-source'
10883   `inconsistent-eol'
10884   `invalid-source'
10885   `interrupted'
10886   `insufficient-memory'
10887 When no error was detected, the value doesn't change.  So, to check
10888 the error status of a code conversion by this variable, you must
10889 explicitly set this variable to nil before performing code
10890 conversion.  */);
10891   Vlast_code_conversion_error = Qnil;
10892
10893   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10894                doc: /*
10895 *Non-nil means always inhibit code conversion of end-of-line format.
10896 See info node `Coding Systems' and info node `Text and Binary' concerning
10897 such conversion.  */);
10898   inhibit_eol_conversion = 0;
10899
10900   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10901                doc: /*
10902 Non-nil means process buffer inherits coding system of process output.
10903 Bind it to t if the process output is to be treated as if it were a file
10904 read from some filesystem.  */);
10905   inherit_process_coding_system = 0;
10906
10907   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10908                doc: /*
10909 Alist to decide a coding system to use for a file I/O operation.
10910 The format is ((PATTERN . VAL) ...),
10911 where PATTERN is a regular expression matching a file name,
10912 VAL is a coding system, a cons of coding systems, or a function symbol.
10913 If VAL is a coding system, it is used for both decoding and encoding
10914 the file contents.
10915 If VAL is a cons of coding systems, the car part is used for decoding,
10916 and the cdr part is used for encoding.
10917 If VAL is a function symbol, the function must return a coding system
10918 or a cons of coding systems which are used as above.  The function is
10919 called with an argument that is a list of the arguments with which
10920 `find-operation-coding-system' was called.  If the function can't decide
10921 a coding system, it can return `undecided' so that the normal
10922 code-detection is performed.
10923
10924 See also the function `find-operation-coding-system'
10925 and the variable `auto-coding-alist'.  */);
10926   Vfile_coding_system_alist = Qnil;
10927
10928   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10929                doc: /*
10930 Alist to decide a coding system to use for a process I/O operation.
10931 The format is ((PATTERN . VAL) ...),
10932 where PATTERN is a regular expression matching a program name,
10933 VAL is a coding system, a cons of coding systems, or a function symbol.
10934 If VAL is a coding system, it is used for both decoding what received
10935 from the program and encoding what sent to the program.
10936 If VAL is a cons of coding systems, the car part is used for decoding,
10937 and the cdr part is used for encoding.
10938 If VAL is a function symbol, the function must return a coding system
10939 or a cons of coding systems which are used as above.
10940
10941 See also the function `find-operation-coding-system'.  */);
10942   Vprocess_coding_system_alist = Qnil;
10943
10944   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10945                doc: /*
10946 Alist to decide a coding system to use for a network I/O operation.
10947 The format is ((PATTERN . VAL) ...),
10948 where PATTERN is a regular expression matching a network service name
10949 or is a port number to connect to,
10950 VAL is a coding system, a cons of coding systems, or a function symbol.
10951 If VAL is a coding system, it is used for both decoding what received
10952 from the network stream and encoding what sent to the network stream.
10953 If VAL is a cons of coding systems, the car part is used for decoding,
10954 and the cdr part is used for encoding.
10955 If VAL is a function symbol, the function must return a coding system
10956 or a cons of coding systems which are used as above.
10957
10958 See also the function `find-operation-coding-system'.  */);
10959   Vnetwork_coding_system_alist = Qnil;
10960
10961   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10962                doc: /* Coding system to use with system messages.
10963 Also used for decoding keyboard input on X Window system.  */);
10964   Vlocale_coding_system = Qnil;
10965
10966   /* The eol mnemonics are reset in startup.el system-dependently.  */
10967   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10968                doc: /*
10969 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10970   eol_mnemonic_unix = build_pure_c_string (":");
10971
10972   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10973                doc: /*
10974 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10975   eol_mnemonic_dos = build_pure_c_string ("\\");
10976
10977   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10978                doc: /*
10979 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10980   eol_mnemonic_mac = build_pure_c_string ("/");
10981
10982   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10983                doc: /*
10984 *String displayed in mode line when end-of-line format is not yet determined.  */);
10985   eol_mnemonic_undecided = build_pure_c_string (":");
10986
10987   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10988                doc: /*
10989 *Non-nil enables character translation while encoding and decoding.  */);
10990   Venable_character_translation = Qt;
10991
10992   DEFVAR_LISP ("standard-translation-table-for-decode",
10993                Vstandard_translation_table_for_decode,
10994                doc: /* Table for translating characters while decoding.  */);
10995   Vstandard_translation_table_for_decode = Qnil;
10996
10997   DEFVAR_LISP ("standard-translation-table-for-encode",
10998                Vstandard_translation_table_for_encode,
10999                doc: /* Table for translating characters while encoding.  */);
11000   Vstandard_translation_table_for_encode = Qnil;
11001
11002   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11003                doc: /* Alist of charsets vs revision numbers.
11004 While encoding, if a charset (car part of an element) is found,
11005 designate it with the escape sequence identifying revision (cdr part
11006 of the element).  */);
11007   Vcharset_revision_table = Qnil;
11008
11009   DEFVAR_LISP ("default-process-coding-system",
11010                Vdefault_process_coding_system,
11011                doc: /* Cons of coding systems used for process I/O by default.
11012 The car part is used for decoding a process output,
11013 the cdr part is used for encoding a text to be sent to a process.  */);
11014   Vdefault_process_coding_system = Qnil;
11015
11016   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11017                doc: /*
11018 Table of extra Latin codes in the range 128..159 (inclusive).
11019 This is a vector of length 256.
11020 If Nth element is non-nil, the existence of code N in a file
11021 \(or output of subprocess) doesn't prevent it to be detected as
11022 a coding system of ISO 2022 variant which has a flag
11023 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11024 or reading output of a subprocess.
11025 Only 128th through 159th elements have a meaning.  */);
11026   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11027
11028   DEFVAR_LISP ("select-safe-coding-system-function",
11029                Vselect_safe_coding_system_function,
11030                doc: /*
11031 Function to call to select safe coding system for encoding a text.
11032
11033 If set, this function is called to force a user to select a proper
11034 coding system which can encode the text in the case that a default
11035 coding system used in each operation can't encode the text.  The
11036 function should take care that the buffer is not modified while
11037 the coding system is being selected.
11038
11039 The default value is `select-safe-coding-system' (which see).  */);
11040   Vselect_safe_coding_system_function = Qnil;
11041
11042   DEFVAR_BOOL ("coding-system-require-warning",
11043                coding_system_require_warning,
11044                doc: /* Internal use only.
11045 If non-nil, on writing a file, `select-safe-coding-system-function' is
11046 called even if `coding-system-for-write' is non-nil.  The command
11047 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11048   coding_system_require_warning = 0;
11049
11050
11051   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11052                inhibit_iso_escape_detection,
11053                doc: /*
11054 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11055
11056 When Emacs reads text, it tries to detect how the text is encoded.
11057 This code detection is sensitive to escape sequences.  If Emacs sees
11058 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11059 of the ISO2022 encodings, and decodes text by the corresponding coding
11060 system (e.g. `iso-2022-7bit').
11061
11062 However, there may be a case that you want to read escape sequences in
11063 a file as is.  In such a case, you can set this variable to non-nil.
11064 Then the code detection will ignore any escape sequences, and no text is
11065 detected as encoded in some ISO-2022 encoding.  The result is that all
11066 escape sequences become visible in a buffer.
11067
11068 The default value is nil, and it is strongly recommended not to change
11069 it.  That is because many Emacs Lisp source files that contain
11070 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11071 in Emacs's distribution, and they won't be decoded correctly on
11072 reading if you suppress escape sequence detection.
11073
11074 The other way to read escape sequences in a file without decoding is
11075 to explicitly specify some coding system that doesn't use ISO-2022
11076 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11077   inhibit_iso_escape_detection = 0;
11078
11079   DEFVAR_BOOL ("inhibit-null-byte-detection",
11080                inhibit_null_byte_detection,
11081                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11082 By default, Emacs treats it as binary data, and does not attempt to
11083 decode it.  The effect is as if you specified `no-conversion' for
11084 reading that text.
11085
11086 Set this to non-nil when a regular text happens to include null bytes.
11087 Examples are Index nodes of Info files and null-byte delimited output
11088 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11089 decode text as usual.  */);
11090   inhibit_null_byte_detection = 0;
11091
11092   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11093                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11094 Internal use only.  Removed after the experimental optimizer gets stable. */);
11095   disable_ascii_optimization = 0;
11096
11097   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11098                doc: /* Char table for translating self-inserting characters.
11099 This is applied to the result of input methods, not their input.
11100 See also `keyboard-translate-table'.
11101
11102 Use of this variable for character code unification was rendered
11103 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11104 internal character representation.  */);
11105     Vtranslation_table_for_input = Qnil;
11106
11107   {
11108     Lisp_Object args[coding_arg_max];
11109     Lisp_Object plist[16];
11110     int i;
11111
11112     for (i = 0; i < coding_arg_max; i++)
11113       args[i] = Qnil;
11114
11115     plist[0] = intern_c_string (":name");
11116     plist[1] = args[coding_arg_name] = Qno_conversion;
11117     plist[2] = intern_c_string (":mnemonic");
11118     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11119     plist[4] = intern_c_string (":coding-type");
11120     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11121     plist[6] = intern_c_string (":ascii-compatible-p");
11122     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11123     plist[8] = intern_c_string (":default-char");
11124     plist[9] = args[coding_arg_default_char] = make_number (0);
11125     plist[10] = intern_c_string (":for-unibyte");
11126     plist[11] = args[coding_arg_for_unibyte] = Qt;
11127     plist[12] = intern_c_string (":docstring");
11128     plist[13] = build_pure_c_string ("Do no conversion.\n\
11129 \n\
11130 When you visit a file with this coding, the file is read into a\n\
11131 unibyte buffer as is, thus each byte of a file is treated as a\n\
11132 character.");
11133     plist[14] = intern_c_string (":eol-type");
11134     plist[15] = args[coding_arg_eol_type] = Qunix;
11135     args[coding_arg_plist] = Flist (16, plist);
11136     Fdefine_coding_system_internal (coding_arg_max, args);
11137
11138     plist[1] = args[coding_arg_name] = Qundecided;
11139     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11140     plist[5] = args[coding_arg_coding_type] = Qundecided;
11141     /* This is already set.
11142        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11143     plist[8] = intern_c_string (":charset-list");
11144     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11145     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11146     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11147     plist[15] = args[coding_arg_eol_type] = Qnil;
11148     args[coding_arg_plist] = Flist (16, plist);
11149     Fdefine_coding_system_internal (coding_arg_max, args);
11150   }
11151
11152   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11153
11154   {
11155     int i;
11156
11157     for (i = 0; i < coding_category_max; i++)
11158       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11159   }
11160 #if defined (DOS_NT)
11161   system_eol_type = Qdos;
11162 #else
11163   system_eol_type = Qunix;
11164 #endif
11165   staticpro (&system_eol_type);
11166 }
11167
11168 char *
11169 emacs_strerror (int error_number)
11170 {
11171   char *str;
11172
11173   synchronize_system_messages_locale ();
11174   str = strerror (error_number);
11175
11176   if (! NILP (Vlocale_coding_system))
11177     {
11178       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11179                                                       Vlocale_coding_system,
11180                                                       0);
11181       str = SSDATA (dec);
11182     }
11183
11184   return str;
11185 }
11186
11187 #endif /* emacs */