src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 497
 498 /* A character to be produced on output if encoding of the original
 499    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 501
 502 /* UTF-8 section */
 503 #define CODING_UTF_8_BOM(coding)        \
 504   ((coding)->spec.utf_8_bom)
 505
 506 /* UTF-16 section */
 507 #define CODING_UTF_16_BOM(coding)       \
 508   ((coding)->spec.utf_16.bom)
 509
 510 #define CODING_UTF_16_ENDIAN(coding)    \
 511   ((coding)->spec.utf_16.endian)
 512
 513 #define CODING_UTF_16_SURROGATE(coding) \
 514   ((coding)->spec.utf_16.surrogate)
 515
 516
 517 /* CCL section */
 518 #define CODING_CCL_DECODER(coding)      \
 519   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 520 #define CODING_CCL_ENCODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 522 #define CODING_CCL_VALIDS(coding)                                          \
 523   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 524
 525 /* Index for each coding category in `coding_categories' */
 526
 527 enum coding_category
 528   {
 529     coding_category_iso_7,
 530     coding_category_iso_7_tight,
 531     coding_category_iso_8_1,
 532     coding_category_iso_8_2,
 533     coding_category_iso_7_else,
 534     coding_category_iso_8_else,
 535     coding_category_utf_8_auto,
 536     coding_category_utf_8_nosig,
 537     coding_category_utf_8_sig,
 538     coding_category_utf_16_auto,
 539     coding_category_utf_16_be,
 540     coding_category_utf_16_le,
 541     coding_category_utf_16_be_nosig,
 542     coding_category_utf_16_le_nosig,
 543     coding_category_charset,
 544     coding_category_sjis,
 545     coding_category_big5,
 546     coding_category_ccl,
 547     coding_category_emacs_mule,
 548     /* All above are targets of code detection.  */
 549     coding_category_raw_text,
 550     coding_category_undecided,
 551     coding_category_max
 552   };
 553
 554 /* Definitions of flag bits used in detect_coding_XXXX.  */
 555 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 556 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 557 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 558 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 559 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 560 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 561 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 562 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 563 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 564 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 565 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 566 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 567 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 568 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 569 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 570 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 571 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 572 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 573 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 574 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 575
 576 /* This value is returned if detect_coding_mask () find nothing other
 577    than ASCII characters.  */
 578 #define CATEGORY_MASK_ANY               \
 579   (CATEGORY_MASK_ISO_7                  \
 580    | CATEGORY_MASK_ISO_7_TIGHT          \
 581    | CATEGORY_MASK_ISO_8_1              \
 582    | CATEGORY_MASK_ISO_8_2              \
 583    | CATEGORY_MASK_ISO_7_ELSE           \
 584    | CATEGORY_MASK_ISO_8_ELSE           \
 585    | CATEGORY_MASK_UTF_8_AUTO           \
 586    | CATEGORY_MASK_UTF_8_NOSIG          \
 587    | CATEGORY_MASK_UTF_8_SIG            \
 588    | CATEGORY_MASK_UTF_16_AUTO          \
 589    | CATEGORY_MASK_UTF_16_BE            \
 590    | CATEGORY_MASK_UTF_16_LE            \
 591    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 592    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 593    | CATEGORY_MASK_CHARSET              \
 594    | CATEGORY_MASK_SJIS                 \
 595    | CATEGORY_MASK_BIG5                 \
 596    | CATEGORY_MASK_CCL                  \
 597    | CATEGORY_MASK_EMACS_MULE)
 598
 599
 600 #define CATEGORY_MASK_ISO_7BIT \
 601   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 602
 603 #define CATEGORY_MASK_ISO_8BIT \
 604   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 605
 606 #define CATEGORY_MASK_ISO_ELSE \
 607   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 608
 609 #define CATEGORY_MASK_ISO_ESCAPE        \
 610   (CATEGORY_MASK_ISO_7                  \
 611    | CATEGORY_MASK_ISO_7_TIGHT          \
 612    | CATEGORY_MASK_ISO_7_ELSE           \
 613    | CATEGORY_MASK_ISO_8_ELSE)
 614
 615 #define CATEGORY_MASK_ISO       \
 616   (  CATEGORY_MASK_ISO_7BIT     \
 617      | CATEGORY_MASK_ISO_8BIT   \
 618      | CATEGORY_MASK_ISO_ELSE)
 619
 620 #define CATEGORY_MASK_UTF_16            \
 621   (CATEGORY_MASK_UTF_16_AUTO            \
 622    | CATEGORY_MASK_UTF_16_BE            \
 623    | CATEGORY_MASK_UTF_16_LE            \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 625    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 626
 627 #define CATEGORY_MASK_UTF_8     \
 628   (CATEGORY_MASK_UTF_8_AUTO     \
 629    | CATEGORY_MASK_UTF_8_NOSIG  \
 630    | CATEGORY_MASK_UTF_8_SIG)
 631
 632 /* Table of coding categories (Lisp symbols).  This variable is for
 633    internal use only.  */
 634 static Lisp_Object Vcoding_category_table;
 635
 636 /* Table of coding-categories ordered by priority.  */
 637 static enum coding_category coding_priorities[coding_category_max];
 638
 639 /* Nth element is a coding context for the coding system bound to the
 640    Nth coding category.  */
 641 static struct coding_system coding_categories[coding_category_max];
 642
 643 /*** Commonly used macros and functions ***/
 644
 645 #ifndef min
 646 #define min(a, b) ((a) < (b) ? (a) : (b))
 647 #endif
 648 #ifndef max
 649 #define max(a, b) ((a) > (b) ? (a) : (b))
 650 #endif
 651
 652 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 653
 654 static int
 655 encode_inhibit_flag (Lisp_Object flag)
 656 {
 657   return NILP (flag) ? -1 : EQ (flag, Qt);
 658 }
 659
 660 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 661    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 662
 663 static bool
 664 inhibit_flag (int encoded_flag, bool var)
 665 {
 666   return 0 < encoded_flag + var;
 667 }
 668
 669 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 670   do {                                                  \
 671     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 672     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 673   } while (0)
 674
 675 static void
 676 CHECK_NATNUM_CAR (Lisp_Object x)
 677 {
 678   Lisp_Object tmp = XCAR (x);
 679   CHECK_NATNUM (tmp);
 680   XSETCAR (x, tmp);
 681 }
 682
 683 static void
 684 CHECK_NATNUM_CDR (Lisp_Object x)
 685 {
 686   Lisp_Object tmp = XCDR (x);
 687   CHECK_NATNUM (tmp);
 688   XSETCDR (x, tmp);
 689 }
 690
 691
 692 /* Safely get one byte from the source text pointed by SRC which ends
 693    at SRC_END, and set C to that byte.  If there are not enough bytes
 694    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 695    and a multibyte character is found at SRC, set C to the
 696    negative value of the character code.  The caller should declare
 697    and set these variables appropriately in advance:
 698         src, src_end, multibytep */
 699
 700 #define ONE_MORE_BYTE(c)                                \
 701   do {                                                  \
 702     if (src == src_end)                                 \
 703       {                                                 \
 704         if (src_base < src)                             \
 705           record_conversion_result                      \
 706             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 707         goto no_more_source;                            \
 708       }                                                 \
 709     c = *src++;                                         \
 710     if (multibytep && (c & 0x80))                       \
 711       {                                                 \
 712         if ((c & 0xFE) == 0xC0)                         \
 713           c = ((c & 1) << 6) | *src++;                  \
 714         else                                            \
 715           {                                             \
 716             src--;                                      \
 717             c = - string_char (src, &src, NULL);        \
 718             record_conversion_result                    \
 719               (coding, CODING_RESULT_INVALID_SRC);      \
 720           }                                             \
 721       }                                                 \
 722     consumed_chars++;                                   \
 723   } while (0)
 724
 725 /* Safely get two bytes from the source text pointed by SRC which ends
 726    at SRC_END, and set C1 and C2 to those bytes while skipping the
 727    heading multibyte characters.  If there are not enough bytes in the
 728    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 729    a multibyte character is found for C2, set C2 to the negative value
 730    of the character code.  The caller should declare and set these
 731    variables appropriately in advance:
 732         src, src_end, multibytep
 733    It is intended that this macro is used in detect_coding_utf_16.  */
 734
 735 #define TWO_MORE_BYTES(c1, c2)                          \
 736   do {                                                  \
 737     do {                                                \
 738       if (src == src_end)                               \
 739         goto no_more_source;                            \
 740       c1 = *src++;                                      \
 741       if (multibytep && (c1 & 0x80))                    \
 742         {                                               \
 743           if ((c1 & 0xFE) == 0xC0)                      \
 744             c1 = ((c1 & 1) << 6) | *src++;              \
 745           else                                          \
 746             {                                           \
 747               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 748               c1 = -1;                                  \
 749             }                                           \
 750         }                                               \
 751     } while (c1 < 0);                                   \
 752     if (src == src_end)                                 \
 753       goto no_more_source;                              \
 754     c2 = *src++;                                        \
 755     if (multibytep && (c2 & 0x80))                      \
 756       {                                                 \
 757         if ((c2 & 0xFE) == 0xC0)                        \
 758           c2 = ((c2 & 1) << 6) | *src++;                \
 759         else                                            \
 760           c2 = -1;                                      \
 761       }                                                 \
 762   } while (0)
 763
 764
 765 /* Store a byte C in the place pointed by DST and increment DST to the
 766    next free point, and increment PRODUCED_CHARS.  The caller should
 767    assure that C is 0..127, and declare and set the variable `dst'
 768    appropriately in advance.
 769 */
 770
 771
 772 #define EMIT_ONE_ASCII_BYTE(c)  \
 773   do {                          \
 774     produced_chars++;           \
 775     *dst++ = (c);               \
 776   } while (0)
 777
 778
 779 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 780
 781 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 782   do {                                  \
 783     produced_chars += 2;                \
 784     *dst++ = (c1), *dst++ = (c2);       \
 785   } while (0)
 786
 787
 788 /* Store a byte C in the place pointed by DST and increment DST to the
 789    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 790    store in an appropriate multibyte form.  The caller should
 791    declare and set the variables `dst' and `multibytep' appropriately
 792    in advance.  */
 793
 794 #define EMIT_ONE_BYTE(c)                \
 795   do {                                  \
 796     produced_chars++;                   \
 797     if (multibytep)                     \
 798       {                                 \
 799         unsigned ch = (c);              \
 800         if (ch >= 0x80)                 \
 801           ch = BYTE8_TO_CHAR (ch);      \
 802         CHAR_STRING_ADVANCE (ch, dst);  \
 803       }                                 \
 804     else                                \
 805       *dst++ = (c);                     \
 806   } while (0)
 807
 808
 809 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 810
 811 #define EMIT_TWO_BYTES(c1, c2)          \
 812   do {                                  \
 813     produced_chars += 2;                \
 814     if (multibytep)                     \
 815       {                                 \
 816         unsigned ch;                    \
 817                                         \
 818         ch = (c1);                      \
 819         if (ch >= 0x80)                 \
 820           ch = BYTE8_TO_CHAR (ch);      \
 821         CHAR_STRING_ADVANCE (ch, dst);  \
 822         ch = (c2);                      \
 823         if (ch >= 0x80)                 \
 824           ch = BYTE8_TO_CHAR (ch);      \
 825         CHAR_STRING_ADVANCE (ch, dst);  \
 826       }                                 \
 827     else                                \
 828       {                                 \
 829         *dst++ = (c1);                  \
 830         *dst++ = (c2);                  \
 831       }                                 \
 832   } while (0)
 833
 834
 835 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 836   do {                                  \
 837     EMIT_ONE_BYTE (c1);                 \
 838     EMIT_TWO_BYTES (c2, c3);            \
 839   } while (0)
 840
 841
 842 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 843   do {                                          \
 844     EMIT_TWO_BYTES (c1, c2);                    \
 845     EMIT_TWO_BYTES (c3, c4);                    \
 846   } while (0)
 847
 848
 849 static void
 850 record_conversion_result (struct coding_system *coding,
 851                           enum coding_result_code result)
 852 {
 853   coding->result = result;
 854   switch (result)
 855     {
 856     case CODING_RESULT_INSUFFICIENT_SRC:
 857       Vlast_code_conversion_error = Qinsufficient_source;
 858       break;
 859     case CODING_RESULT_INVALID_SRC:
 860       Vlast_code_conversion_error = Qinvalid_source;
 861       break;
 862     case CODING_RESULT_INTERRUPT:
 863       Vlast_code_conversion_error = Qinterrupted;
 864       break;
 865     case CODING_RESULT_INSUFFICIENT_DST:
 866       /* Don't record this error in Vlast_code_conversion_error
 867          because it happens just temporarily and is resolved when the
 868          whole conversion is finished.  */
 869       break;
 870     case CODING_RESULT_SUCCESS:
 871       break;
 872     default:
 873       Vlast_code_conversion_error = intern ("Unknown error");
 874     }
 875 }
 876
 877 /* These wrapper macros are used to preserve validity of pointers into
 878    buffer text across calls to decode_char, encode_char, etc, which
 879    could cause relocation of buffers if it loads a charset map,
 880    because loading a charset map allocates large structures.  */
 881
 882 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 883   do {                                                                       \
 884     ptrdiff_t offset;                                                        \
 885                                                                              \
 886     charset_map_loaded = 0;                                                  \
 887     c = DECODE_CHAR (charset, code);                                         \
 888     if (charset_map_loaded                                                   \
 889         && (offset = coding_change_source (coding)))                         \
 890       {                                                                      \
 891         src += offset;                                                       \
 892         src_base += offset;                                                  \
 893         src_end += offset;                                                   \
 894       }                                                                      \
 895   } while (0)
 896
 897 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 898   do {                                                                  \
 899     ptrdiff_t offset;                                                   \
 900                                                                         \
 901     charset_map_loaded = 0;                                             \
 902     code = ENCODE_CHAR (charset, c);                                    \
 903     if (charset_map_loaded                                              \
 904         && (offset = coding_change_destination (coding)))               \
 905       {                                                                 \
 906         dst += offset;                                                  \
 907         dst_end += offset;                                              \
 908       }                                                                 \
 909   } while (0)
 910
 911 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 912   do {                                                                  \
 913     ptrdiff_t offset;                                                   \
 914                                                                         \
 915     charset_map_loaded = 0;                                             \
 916     charset = char_charset (c, charset_list, code_return);              \
 917     if (charset_map_loaded                                              \
 918         && (offset = coding_change_destination (coding)))               \
 919       {                                                                 \
 920         dst += offset;                                                  \
 921         dst_end += offset;                                              \
 922       }                                                                 \
 923   } while (0)
 924
 925 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 926   do {                                                                  \
 927     ptrdiff_t offset;                                                   \
 928                                                                         \
 929     charset_map_loaded = 0;                                             \
 930     result = CHAR_CHARSET_P (c, charset);                               \
 931     if (charset_map_loaded                                              \
 932         && (offset = coding_change_destination (coding)))               \
 933       {                                                                 \
 934         dst += offset;                                                  \
 935         dst_end += offset;                                              \
 936       }                                                                 \
 937   } while (0)
 938
 939
 940 /* If there are at least BYTES length of room at dst, allocate memory
 941    for coding->destination and update dst and dst_end.  We don't have
 942    to take care of coding->source which will be relocated.  It is
 943    handled by calling coding_set_source in encode_coding.  */
 944
 945 #define ASSURE_DESTINATION(bytes)                               \
 946   do {                                                          \
 947     if (dst + (bytes) >= dst_end)                               \
 948       {                                                         \
 949         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 950                                                                 \
 951         dst = alloc_destination (coding, more_bytes, dst);      \
 952         dst_end = coding->destination + coding->dst_bytes;      \
 953       }                                                         \
 954   } while (0)
 955
 956
 957 /* Store multibyte form of the character C in P, and advance P to the
 958    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 959    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 960    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 961
 962 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 963
 964 /* Return the character code of character whose multibyte form is at
 965    P, and advance P to the end of the multibyte form.  This used to be
 966    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 967    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 968
 969 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 970
 971 /* Set coding->source from coding->src_object.  */
 972
 973 static void
 974 coding_set_source (struct coding_system *coding)
 975 {
 976   if (BUFFERP (coding->src_object))
 977     {
 978       struct buffer *buf = XBUFFER (coding->src_object);
 979
 980       if (coding->src_pos < 0)
 981         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 982       else
 983         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 984     }
 985   else if (STRINGP (coding->src_object))
 986     {
 987       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 988     }
 989   else
 990     {
 991       /* Otherwise, the source is C string and is never relocated
 992          automatically.  Thus we don't have to update anything.  */
 993     }
 994 }
 995
 996
 997 /* Set coding->source from coding->src_object, and return how many
 998    bytes coding->source was changed.  */
 999
1000 static ptrdiff_t
1001 coding_change_source (struct coding_system *coding)
1002 {
1003   const unsigned char *orig = coding->source;
1004   coding_set_source (coding);
1005   return coding->source - orig;
1006 }
1007
1008
1009 /* Set coding->destination from coding->dst_object.  */
1010
1011 static void
1012 coding_set_destination (struct coding_system *coding)
1013 {
1014   if (BUFFERP (coding->dst_object))
1015     {
1016       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1017         {
1018           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1019           coding->dst_bytes = (GAP_END_ADDR
1020                                - (coding->src_bytes - coding->consumed)
1021                                - coding->destination);
1022         }
1023       else
1024         {
1025           /* We are sure that coding->dst_pos_byte is before the gap
1026              of the buffer. */
1027           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1028                                  + coding->dst_pos_byte - BEG_BYTE);
1029           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1030                                - coding->destination);
1031         }
1032     }
1033   else
1034     {
1035       /* Otherwise, the destination is C string and is never relocated
1036          automatically.  Thus we don't have to update anything.  */
1037     }
1038 }
1039
1040
1041 /* Set coding->destination from coding->dst_object, and return how
1042    many bytes coding->destination was changed.  */
1043
1044 static ptrdiff_t
1045 coding_change_destination (struct coding_system *coding)
1046 {
1047   const unsigned char *orig = coding->destination;
1048   coding_set_destination (coding);
1049   return coding->destination - orig;
1050 }
1051
1052
1053 static void
1054 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1055 {
1056   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1057     string_overflow ();
1058   coding->destination = xrealloc (coding->destination,
1059                                   coding->dst_bytes + bytes);
1060   coding->dst_bytes += bytes;
1061 }
1062
1063 static void
1064 coding_alloc_by_making_gap (struct coding_system *coding,
1065                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1066 {
1067   if (EQ (coding->src_object, coding->dst_object))
1068     {
1069       /* The gap may contain the produced data at the head and not-yet
1070          consumed data at the tail.  To preserve those data, we at
1071          first make the gap size to zero, then increase the gap
1072          size.  */
1073       ptrdiff_t add = GAP_SIZE;
1074
1075       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1076       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1077       make_gap (bytes);
1078       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1079       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1080     }
1081   else
1082     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1083 }
1084
1085
1086 static unsigned char *
1087 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1088                    unsigned char *dst)
1089 {
1090   ptrdiff_t offset = dst - coding->destination;
1091
1092   if (BUFFERP (coding->dst_object))
1093     {
1094       struct buffer *buf = XBUFFER (coding->dst_object);
1095
1096       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1097     }
1098   else
1099     coding_alloc_by_realloc (coding, nbytes);
1100   coding_set_destination (coding);
1101   dst = coding->destination + offset;
1102   return dst;
1103 }
1104
1105 /** Macros for annotations.  */
1106
1107 /* An annotation data is stored in the array coding->charbuf in this
1108    format:
1109      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1110    LENGTH is the number of elements in the annotation.
1111    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1112    NCHARS is the number of characters in the text annotated.
1113
1114    The format of the following elements depend on ANNOTATION_MASK.
1115
1116    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1117    follows:
1118      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1119
1120    NBYTES is the number of bytes specified in the header part of
1121    old-style emacs-mule encoding, or 0 for the other kind of
1122    composition.
1123
1124    METHOD is one of enum composition_method.
1125
1126    Optional COMPOSITION-COMPONENTS are characters and composition
1127    rules.
1128
1129    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1130    follows.
1131
1132    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1133    recover from an invalid annotation, and should be skipped by
1134    produce_annotation.  */
1135
1136 /* Maximum length of the header of annotation data.  */
1137 #define MAX_ANNOTATION_LENGTH 5
1138
1139 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1140   do {                                                  \
1141     *(buf)++ = -(len);                                  \
1142     *(buf)++ = (mask);                                  \
1143     *(buf)++ = (nchars);                                \
1144     coding->annotated = 1;                              \
1145   } while (0);
1146
1147 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1148   do {                                                                      \
1149     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1150     *buf++ = nbytes;                                                        \
1151     *buf++ = method;                                                        \
1152   } while (0)
1153
1154
1155 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1156   do {                                                                  \
1157     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1158     *buf++ = id;                                                        \
1159   } while (0)
1160
1161
1162 /* Bitmasks for coding->eol_seen.  */
1163
1164 #define EOL_SEEN_NONE   0
1165 #define EOL_SEEN_LF     1
1166 #define EOL_SEEN_CR     2
1167 #define EOL_SEEN_CRLF   4
1168
1169 \f
1170 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1171
1172
1173
1174 \f
1175 /*** 3. UTF-8 ***/
1176
1177 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1178    Return true if a text is encoded in UTF-8.  */
1179
1180 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1181 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1182 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1183 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1184 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1185 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1186
1187 #define UTF_8_BOM_1 0xEF
1188 #define UTF_8_BOM_2 0xBB
1189 #define UTF_8_BOM_3 0xBF
1190
1191 /* Unlike the other detect_coding_XXX, this function counts number of
1192    characters and check EOL format.  */
1193
1194 static bool
1195 detect_coding_utf_8 (struct coding_system *coding,
1196                      struct coding_detection_info *detect_info)
1197 {
1198   const unsigned char *src = coding->source, *src_base;
1199   const unsigned char *src_end = coding->source + coding->src_bytes;
1200   bool multibytep = coding->src_multibyte;
1201   ptrdiff_t consumed_chars = 0;
1202   bool bom_found = 0;
1203   int nchars = coding->head_ascii;
1204   int eol_seen = coding->eol_seen;
1205
1206   detect_info->checked |= CATEGORY_MASK_UTF_8;
1207   /* A coding system of this category is always ASCII compatible.  */
1208   src += nchars;
1209
1210   if (src == coding->source     /* BOM should be at the head.  */
1211       && src + 3 < src_end      /* BOM is 3-byte long.  */
1212       && src[0] == UTF_8_BOM_1
1213       && src[1] == UTF_8_BOM_2
1214       && src[2] == UTF_8_BOM_3)
1215     {
1216       bom_found = 1;
1217       src += 3;
1218       nchars++;
1219     }
1220
1221   while (1)
1222     {
1223       int c, c1, c2, c3, c4;
1224
1225       src_base = src;
1226       ONE_MORE_BYTE (c);
1227       if (c < 0 || UTF_8_1_OCTET_P (c))
1228         {
1229           nchars++;
1230           if (c == '\r')
1231             {
1232               if (src < src_end && *src == '\n')
1233                 {
1234                   eol_seen |= EOL_SEEN_CRLF;
1235                   src++;
1236                   nchars++;
1237                 }
1238               else
1239                 eol_seen |= EOL_SEEN_CR;
1240             }
1241           else if (c == '\n')
1242             eol_seen |= EOL_SEEN_LF;
1243           continue;
1244         }
1245       ONE_MORE_BYTE (c1);
1246       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1247         break;
1248       if (UTF_8_2_OCTET_LEADING_P (c))
1249         {
1250           nchars++;
1251           continue;
1252         }
1253       ONE_MORE_BYTE (c2);
1254       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1255         break;
1256       if (UTF_8_3_OCTET_LEADING_P (c))
1257         {
1258           nchars++;
1259           continue;
1260         }
1261       ONE_MORE_BYTE (c3);
1262       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1263         break;
1264       if (UTF_8_4_OCTET_LEADING_P (c))
1265         {
1266           nchars++;
1267           continue;
1268         }
1269       ONE_MORE_BYTE (c4);
1270       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1271         break;
1272       if (UTF_8_5_OCTET_LEADING_P (c))
1273         {
1274           nchars++;
1275           continue;
1276         }
1277       break;
1278     }
1279   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1280   return 0;
1281
1282  no_more_source:
1283   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1284     {
1285       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1286       return 0;
1287     }
1288   if (bom_found)
1289     {
1290       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1291       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1292     }
1293   else
1294     {
1295       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1296       if (nchars < src_end - coding->source)
1297         /* The found characters are less than source bytes, which
1298            means that we found a valid non-ASCII characters.  */
1299         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1300     }
1301   coding->detected_utf8_chars = nchars;
1302   return 1;
1303 }
1304
1305
1306 static void
1307 decode_coding_utf_8 (struct coding_system *coding)
1308 {
1309   const unsigned char *src = coding->source + coding->consumed;
1310   const unsigned char *src_end = coding->source + coding->src_bytes;
1311   const unsigned char *src_base;
1312   int *charbuf = coding->charbuf + coding->charbuf_used;
1313   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1314   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1315   bool multibytep = coding->src_multibyte;
1316   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1317   bool eol_dos
1318     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1319   int byte_after_cr = -1;
1320
1321   if (bom != utf_without_bom)
1322     {
1323       int c1, c2, c3;
1324
1325       src_base = src;
1326       ONE_MORE_BYTE (c1);
1327       if (! UTF_8_3_OCTET_LEADING_P (c1))
1328         src = src_base;
1329       else
1330         {
1331           ONE_MORE_BYTE (c2);
1332           if (! UTF_8_EXTRA_OCTET_P (c2))
1333             src = src_base;
1334           else
1335             {
1336               ONE_MORE_BYTE (c3);
1337               if (! UTF_8_EXTRA_OCTET_P (c3))
1338                 src = src_base;
1339               else
1340                 {
1341                   if ((c1 != UTF_8_BOM_1)
1342                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1343                     src = src_base;
1344                   else
1345                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1346                 }
1347             }
1348         }
1349     }
1350   CODING_UTF_8_BOM (coding) = utf_without_bom;
1351
1352   while (1)
1353     {
1354       int c, c1, c2, c3, c4, c5;
1355
1356       src_base = src;
1357       consumed_chars_base = consumed_chars;
1358
1359       if (charbuf >= charbuf_end)
1360         {
1361           if (byte_after_cr >= 0)
1362             src_base--;
1363           break;
1364         }
1365
1366       if (byte_after_cr >= 0)
1367         c1 = byte_after_cr, byte_after_cr = -1;
1368       else
1369         ONE_MORE_BYTE (c1);
1370       if (c1 < 0)
1371         {
1372           c = - c1;
1373         }
1374       else if (UTF_8_1_OCTET_P (c1))
1375         {
1376           if (eol_dos && c1 == '\r')
1377             ONE_MORE_BYTE (byte_after_cr);
1378           c = c1;
1379         }
1380       else
1381         {
1382           ONE_MORE_BYTE (c2);
1383           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1384             goto invalid_code;
1385           if (UTF_8_2_OCTET_LEADING_P (c1))
1386             {
1387               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1388               /* Reject overlong sequences here and below.  Encoders
1389                  producing them are incorrect, they can be misleading,
1390                  and they mess up read/write invariance.  */
1391               if (c < 128)
1392                 goto invalid_code;
1393             }
1394           else
1395             {
1396               ONE_MORE_BYTE (c3);
1397               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1398                 goto invalid_code;
1399               if (UTF_8_3_OCTET_LEADING_P (c1))
1400                 {
1401                   c = (((c1 & 0xF) << 12)
1402                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1403                   if (c < 0x800
1404                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1405                     goto invalid_code;
1406                 }
1407               else
1408                 {
1409                   ONE_MORE_BYTE (c4);
1410                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1411                     goto invalid_code;
1412                   if (UTF_8_4_OCTET_LEADING_P (c1))
1413                     {
1414                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1415                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1416                     if (c < 0x10000)
1417                       goto invalid_code;
1418                     }
1419                   else
1420                     {
1421                       ONE_MORE_BYTE (c5);
1422                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1423                         goto invalid_code;
1424                       if (UTF_8_5_OCTET_LEADING_P (c1))
1425                         {
1426                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1427                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1428                                | (c5 & 0x3F));
1429                           if ((c > MAX_CHAR) || (c < 0x200000))
1430                             goto invalid_code;
1431                         }
1432                       else
1433                         goto invalid_code;
1434                     }
1435                 }
1436             }
1437         }
1438
1439       *charbuf++ = c;
1440       continue;
1441
1442     invalid_code:
1443       src = src_base;
1444       consumed_chars = consumed_chars_base;
1445       ONE_MORE_BYTE (c);
1446       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1447       coding->errors++;
1448     }
1449
1450  no_more_source:
1451   coding->consumed_char += consumed_chars_base;
1452   coding->consumed = src_base - coding->source;
1453   coding->charbuf_used = charbuf - coding->charbuf;
1454 }
1455
1456
1457 static bool
1458 encode_coding_utf_8 (struct coding_system *coding)
1459 {
1460   bool multibytep = coding->dst_multibyte;
1461   int *charbuf = coding->charbuf;
1462   int *charbuf_end = charbuf + coding->charbuf_used;
1463   unsigned char *dst = coding->destination + coding->produced;
1464   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465   ptrdiff_t produced_chars = 0;
1466   int c;
1467
1468   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1469     {
1470       ASSURE_DESTINATION (3);
1471       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1472       CODING_UTF_8_BOM (coding) = utf_without_bom;
1473     }
1474
1475   if (multibytep)
1476     {
1477       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1478
1479       while (charbuf < charbuf_end)
1480         {
1481           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1482
1483           ASSURE_DESTINATION (safe_room);
1484           c = *charbuf++;
1485           if (CHAR_BYTE8_P (c))
1486             {
1487               c = CHAR_TO_BYTE8 (c);
1488               EMIT_ONE_BYTE (c);
1489             }
1490           else
1491             {
1492               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1493               for (p = str; p < pend; p++)
1494                 EMIT_ONE_BYTE (*p);
1495             }
1496         }
1497     }
1498   else
1499     {
1500       int safe_room = MAX_MULTIBYTE_LENGTH;
1501
1502       while (charbuf < charbuf_end)
1503         {
1504           ASSURE_DESTINATION (safe_room);
1505           c = *charbuf++;
1506           if (CHAR_BYTE8_P (c))
1507             *dst++ = CHAR_TO_BYTE8 (c);
1508           else
1509             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1510           produced_chars++;
1511         }
1512     }
1513   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1514   coding->produced_char += produced_chars;
1515   coding->produced = dst - coding->destination;
1516   return 0;
1517 }
1518
1519
1520 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1521    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1522
1523 #define UTF_16_HIGH_SURROGATE_P(val) \
1524   (((val) & 0xFC00) == 0xD800)
1525
1526 #define UTF_16_LOW_SURROGATE_P(val) \
1527   (((val) & 0xFC00) == 0xDC00)
1528
1529
1530 static bool
1531 detect_coding_utf_16 (struct coding_system *coding,
1532                       struct coding_detection_info *detect_info)
1533 {
1534   const unsigned char *src = coding->source;
1535   const unsigned char *src_end = coding->source + coding->src_bytes;
1536   bool multibytep = coding->src_multibyte;
1537   int c1, c2;
1538
1539   detect_info->checked |= CATEGORY_MASK_UTF_16;
1540   if (coding->mode & CODING_MODE_LAST_BLOCK
1541       && (coding->src_chars & 1))
1542     {
1543       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1544       return 0;
1545     }
1546
1547   TWO_MORE_BYTES (c1, c2);
1548   if ((c1 == 0xFF) && (c2 == 0xFE))
1549     {
1550       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1551                              | CATEGORY_MASK_UTF_16_AUTO);
1552       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1553                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1554                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1555     }
1556   else if ((c1 == 0xFE) && (c2 == 0xFF))
1557     {
1558       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1559                              | CATEGORY_MASK_UTF_16_AUTO);
1560       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1561                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1562                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1563     }
1564   else if (c2 < 0)
1565     {
1566       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1567       return 0;
1568     }
1569   else
1570     {
1571       /* We check the dispersion of Eth and Oth bytes where E is even and
1572          O is odd.  If both are high, we assume binary data.*/
1573       unsigned char e[256], o[256];
1574       unsigned e_num = 1, o_num = 1;
1575
1576       memset (e, 0, 256);
1577       memset (o, 0, 256);
1578       e[c1] = 1;
1579       o[c2] = 1;
1580
1581       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1582                                 |CATEGORY_MASK_UTF_16_BE
1583                                 | CATEGORY_MASK_UTF_16_LE);
1584
1585       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1586              != CATEGORY_MASK_UTF_16)
1587         {
1588           TWO_MORE_BYTES (c1, c2);
1589           if (c2 < 0)
1590             break;
1591           if (! e[c1])
1592             {
1593               e[c1] = 1;
1594               e_num++;
1595               if (e_num >= 128)
1596                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1597             }
1598           if (! o[c2])
1599             {
1600               o[c2] = 1;
1601               o_num++;
1602               if (o_num >= 128)
1603                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1604             }
1605         }
1606       return 0;
1607     }
1608
1609  no_more_source:
1610   return 1;
1611 }
1612
1613 static void
1614 decode_coding_utf_16 (struct coding_system *coding)
1615 {
1616   const unsigned char *src = coding->source + coding->consumed;
1617   const unsigned char *src_end = coding->source + coding->src_bytes;
1618   const unsigned char *src_base;
1619   int *charbuf = coding->charbuf + coding->charbuf_used;
1620   /* We may produces at most 3 chars in one loop.  */
1621   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1622   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1623   bool multibytep = coding->src_multibyte;
1624   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1625   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1626   int surrogate = CODING_UTF_16_SURROGATE (coding);
1627   bool eol_dos
1628     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1629   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1630
1631   if (bom == utf_with_bom)
1632     {
1633       int c, c1, c2;
1634
1635       src_base = src;
1636       ONE_MORE_BYTE (c1);
1637       ONE_MORE_BYTE (c2);
1638       c = (c1 << 8) | c2;
1639
1640       if (endian == utf_16_big_endian
1641           ? c != 0xFEFF : c != 0xFFFE)
1642         {
1643           /* The first two bytes are not BOM.  Treat them as bytes
1644              for a normal character.  */
1645           src = src_base;
1646           coding->errors++;
1647         }
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650   else if (bom == utf_detect_bom)
1651     {
1652       /* We have already tried to detect BOM and failed in
1653          detect_coding.  */
1654       CODING_UTF_16_BOM (coding) = utf_without_bom;
1655     }
1656
1657   while (1)
1658     {
1659       int c, c1, c2;
1660
1661       src_base = src;
1662       consumed_chars_base = consumed_chars;
1663
1664       if (charbuf >= charbuf_end)
1665         {
1666           if (byte_after_cr1 >= 0)
1667             src_base -= 2;
1668           break;
1669         }
1670
1671       if (byte_after_cr1 >= 0)
1672         c1 = byte_after_cr1, byte_after_cr1 = -1;
1673       else
1674         ONE_MORE_BYTE (c1);
1675       if (c1 < 0)
1676         {
1677           *charbuf++ = -c1;
1678           continue;
1679         }
1680       if (byte_after_cr2 >= 0)
1681         c2 = byte_after_cr2, byte_after_cr2 = -1;
1682       else
1683         ONE_MORE_BYTE (c2);
1684       if (c2 < 0)
1685         {
1686           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1687           *charbuf++ = -c2;
1688           continue;
1689         }
1690       c = (endian == utf_16_big_endian
1691            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1692
1693       if (surrogate)
1694         {
1695           if (! UTF_16_LOW_SURROGATE_P (c))
1696             {
1697               if (endian == utf_16_big_endian)
1698                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1699               else
1700                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1701               *charbuf++ = c1;
1702               *charbuf++ = c2;
1703               coding->errors++;
1704               if (UTF_16_HIGH_SURROGATE_P (c))
1705                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1706               else
1707                 *charbuf++ = c;
1708             }
1709           else
1710             {
1711               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1712               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1713               *charbuf++ = 0x10000 + c;
1714             }
1715         }
1716       else
1717         {
1718           if (UTF_16_HIGH_SURROGATE_P (c))
1719             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1720           else
1721             {
1722               if (eol_dos && c == '\r')
1723                 {
1724                   ONE_MORE_BYTE (byte_after_cr1);
1725                   ONE_MORE_BYTE (byte_after_cr2);
1726                 }
1727               *charbuf++ = c;
1728             }
1729         }
1730     }
1731
1732  no_more_source:
1733   coding->consumed_char += consumed_chars_base;
1734   coding->consumed = src_base - coding->source;
1735   coding->charbuf_used = charbuf - coding->charbuf;
1736 }
1737
1738 static bool
1739 encode_coding_utf_16 (struct coding_system *coding)
1740 {
1741   bool multibytep = coding->dst_multibyte;
1742   int *charbuf = coding->charbuf;
1743   int *charbuf_end = charbuf + coding->charbuf_used;
1744   unsigned char *dst = coding->destination + coding->produced;
1745   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1746   int safe_room = 8;
1747   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1748   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1749   ptrdiff_t produced_chars = 0;
1750   int c;
1751
1752   if (bom != utf_without_bom)
1753     {
1754       ASSURE_DESTINATION (safe_room);
1755       if (big_endian)
1756         EMIT_TWO_BYTES (0xFE, 0xFF);
1757       else
1758         EMIT_TWO_BYTES (0xFF, 0xFE);
1759       CODING_UTF_16_BOM (coding) = utf_without_bom;
1760     }
1761
1762   while (charbuf < charbuf_end)
1763     {
1764       ASSURE_DESTINATION (safe_room);
1765       c = *charbuf++;
1766       if (c > MAX_UNICODE_CHAR)
1767         c = coding->default_char;
1768
1769       if (c < 0x10000)
1770         {
1771           if (big_endian)
1772             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1773           else
1774             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1775         }
1776       else
1777         {
1778           int c1, c2;
1779
1780           c -= 0x10000;
1781           c1 = (c >> 10) + 0xD800;
1782           c2 = (c & 0x3FF) + 0xDC00;
1783           if (big_endian)
1784             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1785           else
1786             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1787         }
1788     }
1789   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1790   coding->produced = dst - coding->destination;
1791   coding->produced_char += produced_chars;
1792   return 0;
1793 }
1794
1795 \f
1796 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1797
1798 /* Emacs' internal format for representation of multiple character
1799    sets is a kind of multi-byte encoding, i.e. characters are
1800    represented by variable-length sequences of one-byte codes.
1801
1802    ASCII characters and control characters (e.g. `tab', `newline') are
1803    represented by one-byte sequences which are their ASCII codes, in
1804    the range 0x00 through 0x7F.
1805
1806    8-bit characters of the range 0x80..0x9F are represented by
1807    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1808    code + 0x20).
1809
1810    8-bit characters of the range 0xA0..0xFF are represented by
1811    one-byte sequences which are their 8-bit code.
1812
1813    The other characters are represented by a sequence of `base
1814    leading-code', optional `extended leading-code', and one or two
1815    `position-code's.  The length of the sequence is determined by the
1816    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1817    whereas extended leading-code and position-code take the range 0xA0
1818    through 0xFF.  See `charset.h' for more details about leading-code
1819    and position-code.
1820
1821    --- CODE RANGE of Emacs' internal format ---
1822    character set        range
1823    -------------        -----
1824    ascii                0x00..0x7F
1825    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1826    eight-bit-graphic    0xA0..0xBF
1827    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1828    ---------------------------------------------
1829
1830    As this is the internal character representation, the format is
1831    usually not used externally (i.e. in a file or in a data sent to a
1832    process).  But, it is possible to have a text externally in this
1833    format (i.e. by encoding by the coding system `emacs-mule').
1834
1835    In that case, a sequence of one-byte codes has a slightly different
1836    form.
1837
1838    At first, all characters in eight-bit-control are represented by
1839    one-byte sequences which are their 8-bit code.
1840
1841    Next, character composition data are represented by the byte
1842    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1843    where,
1844         METHOD is 0xF2 plus one of composition method (enum
1845         composition_method),
1846
1847         BYTES is 0xA0 plus a byte length of this composition data,
1848
1849         CHARS is 0xA0 plus a number of characters composed by this
1850         data,
1851
1852         COMPONENTs are characters of multibyte form or composition
1853         rules encoded by two-byte of ASCII codes.
1854
1855    In addition, for backward compatibility, the following formats are
1856    also recognized as composition data on decoding.
1857
1858    0x80 MSEQ ...
1859    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1860
1861    Here,
1862         MSEQ is a multibyte form but in these special format:
1863           ASCII: 0xA0 ASCII_CODE+0x80,
1864           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1865         RULE is a one byte code of the range 0xA0..0xF0 that
1866         represents a composition rule.
1867   */
1868
1869 char emacs_mule_bytes[256];
1870
1871
1872 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1873    Return true if a text is encoded in 'emacs-mule'.  */
1874
1875 static bool
1876 detect_coding_emacs_mule (struct coding_system *coding,
1877                           struct coding_detection_info *detect_info)
1878 {
1879   const unsigned char *src = coding->source, *src_base;
1880   const unsigned char *src_end = coding->source + coding->src_bytes;
1881   bool multibytep = coding->src_multibyte;
1882   ptrdiff_t consumed_chars = 0;
1883   int c;
1884   int found = 0;
1885
1886   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1887   /* A coding system of this category is always ASCII compatible.  */
1888   src += coding->head_ascii;
1889
1890   while (1)
1891     {
1892       src_base = src;
1893       ONE_MORE_BYTE (c);
1894       if (c < 0)
1895         continue;
1896       if (c == 0x80)
1897         {
1898           /* Perhaps the start of composite character.  We simply skip
1899              it because analyzing it is too heavy for detecting.  But,
1900              at least, we check that the composite character
1901              constitutes of more than 4 bytes.  */
1902           const unsigned char *src_start;
1903
1904         repeat:
1905           src_start = src;
1906           do
1907             {
1908               ONE_MORE_BYTE (c);
1909             }
1910           while (c >= 0xA0);
1911
1912           if (src - src_start <= 4)
1913             break;
1914           found = CATEGORY_MASK_EMACS_MULE;
1915           if (c == 0x80)
1916             goto repeat;
1917         }
1918
1919       if (c < 0x80)
1920         {
1921           if (c < 0x20
1922               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1923             break;
1924         }
1925       else
1926         {
1927           int more_bytes = emacs_mule_bytes[c] - 1;
1928
1929           while (more_bytes > 0)
1930             {
1931               ONE_MORE_BYTE (c);
1932               if (c < 0xA0)
1933                 {
1934                   src--;        /* Unread the last byte.  */
1935                   break;
1936                 }
1937               more_bytes--;
1938             }
1939           if (more_bytes != 0)
1940             break;
1941           found = CATEGORY_MASK_EMACS_MULE;
1942         }
1943     }
1944   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1945   return 0;
1946
1947  no_more_source:
1948   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1949     {
1950       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1951       return 0;
1952     }
1953   detect_info->found |= found;
1954   return 1;
1955 }
1956
1957
1958 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1959    character.  If CMP_STATUS indicates that we must expect MSEQ or
1960    RULE described above, decode it and return the negative value of
1961    the decoded character or rule.  If an invalid byte is found, return
1962    -1.  If SRC is too short, return -2.  */
1963
1964 static int
1965 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1966                  int *nbytes, int *nchars, int *id,
1967                  struct composition_status *cmp_status)
1968 {
1969   const unsigned char *src_end = coding->source + coding->src_bytes;
1970   const unsigned char *src_base = src;
1971   bool multibytep = coding->src_multibyte;
1972   int charset_ID;
1973   unsigned code;
1974   int c;
1975   int consumed_chars = 0;
1976   bool mseq_found = 0;
1977
1978   ONE_MORE_BYTE (c);
1979   if (c < 0)
1980     {
1981       c = -c;
1982       charset_ID = emacs_mule_charset[0];
1983     }
1984   else
1985     {
1986       if (c >= 0xA0)
1987         {
1988           if (cmp_status->state != COMPOSING_NO
1989               && cmp_status->old_form)
1990             {
1991               if (cmp_status->state == COMPOSING_CHAR)
1992                 {
1993                   if (c == 0xA0)
1994                     {
1995                       ONE_MORE_BYTE (c);
1996                       c -= 0x80;
1997                       if (c < 0)
1998                         goto invalid_code;
1999                     }
2000                   else
2001                     c -= 0x20;
2002                   mseq_found = 1;
2003                 }
2004               else
2005                 {
2006                   *nbytes = src - src_base;
2007                   *nchars = consumed_chars;
2008                   return -c;
2009                 }
2010             }
2011           else
2012             goto invalid_code;
2013         }
2014
2015       switch (emacs_mule_bytes[c])
2016         {
2017         case 2:
2018           if ((charset_ID = emacs_mule_charset[c]) < 0)
2019             goto invalid_code;
2020           ONE_MORE_BYTE (c);
2021           if (c < 0xA0)
2022             goto invalid_code;
2023           code = c & 0x7F;
2024           break;
2025
2026         case 3:
2027           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2028               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2029             {
2030               ONE_MORE_BYTE (c);
2031               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2032                 goto invalid_code;
2033               ONE_MORE_BYTE (c);
2034               if (c < 0xA0)
2035                 goto invalid_code;
2036               code = c & 0x7F;
2037             }
2038           else
2039             {
2040               if ((charset_ID = emacs_mule_charset[c]) < 0)
2041                 goto invalid_code;
2042               ONE_MORE_BYTE (c);
2043               if (c < 0xA0)
2044                 goto invalid_code;
2045               code = (c & 0x7F) << 8;
2046               ONE_MORE_BYTE (c);
2047               if (c < 0xA0)
2048                 goto invalid_code;
2049               code |= c & 0x7F;
2050             }
2051           break;
2052
2053         case 4:
2054           ONE_MORE_BYTE (c);
2055           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2056             goto invalid_code;
2057           ONE_MORE_BYTE (c);
2058           if (c < 0xA0)
2059             goto invalid_code;
2060           code = (c & 0x7F) << 8;
2061           ONE_MORE_BYTE (c);
2062           if (c < 0xA0)
2063             goto invalid_code;
2064           code |= c & 0x7F;
2065           break;
2066
2067         case 1:
2068           code = c;
2069           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2070           break;
2071
2072         default:
2073           emacs_abort ();
2074         }
2075       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2076                           CHARSET_FROM_ID (charset_ID), code, c);
2077       if (c < 0)
2078         goto invalid_code;
2079     }
2080   *nbytes = src - src_base;
2081   *nchars = consumed_chars;
2082   if (id)
2083     *id = charset_ID;
2084   return (mseq_found ? -c : c);
2085
2086  no_more_source:
2087   return -2;
2088
2089  invalid_code:
2090   return -1;
2091 }
2092
2093
2094 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2095
2096 /* Handle these composition sequence ('|': the end of header elements,
2097    BYTES and CHARS >= 0xA0):
2098
2099    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2100    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2101    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2102
2103    and these old form:
2104
2105    (4) relative composition: 0x80 | MSEQ ... MSEQ
2106    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2107
2108    When the starter 0x80 and the following header elements are found,
2109    this annotation header is produced.
2110
2111         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2112
2113    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2114    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2115
2116    Then, upon reading the following elements, these codes are produced
2117    until the composition end is found:
2118
2119    (1) CHAR ... CHAR
2120    (2) ALT ... ALT CHAR ... CHAR
2121    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2122    (4) CHAR ... CHAR
2123    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2124
2125    When the composition end is found, LENGTH and NCHARS in the
2126    annotation header is updated as below:
2127
2128    (1) LENGTH: unchanged, NCHARS: unchanged
2129    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2130    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2131    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2132    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2133
2134    If an error is found while composing, the annotation header is
2135    changed to the original composition header (plus filler -1s) as
2136    below:
2137
2138    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2139    (5)          [ 0x80 0xFF -1 -1- -1 ]
2140
2141    and the sequence [ -2 DECODED-RULE ] is changed to the original
2142    byte sequence as below:
2143         o the original byte sequence is B: [ B -1 ]
2144         o the original byte sequence is B1 B2: [ B1 B2 ]
2145
2146    Most of the routines are implemented by macros because many
2147    variables and labels in the caller decode_coding_emacs_mule must be
2148    accessible, and they are usually called just once (thus doesn't
2149    increase the size of compiled object).  */
2150
2151 /* Decode a composition rule represented by C as a component of
2152    composition sequence of Emacs 20 style.  Set RULE to the decoded
2153    rule. */
2154
2155 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2156   do {                                                  \
2157     int gref, nref;                                     \
2158                                                         \
2159     c -= 0xA0;                                          \
2160     if (c < 0 || c >= 81)                               \
2161       goto invalid_code;                                \
2162     gref = c / 9, nref = c % 9;                         \
2163     if (gref == 4) gref = 10;                           \
2164     if (nref == 4) nref = 10;                           \
2165     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2166   } while (0)
2167
2168
2169 /* Decode a composition rule represented by C and the following byte
2170    at SRC as a component of composition sequence of Emacs 21 style.
2171    Set RULE to the decoded rule.  */
2172
2173 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2174   do {                                                  \
2175     int gref, nref;                                     \
2176                                                         \
2177     gref = c - 0x20;                                    \
2178     if (gref < 0 || gref >= 81)                         \
2179       goto invalid_code;                                \
2180     ONE_MORE_BYTE (c);                                  \
2181     nref = c - 0x20;                                    \
2182     if (nref < 0 || nref >= 81)                         \
2183       goto invalid_code;                                \
2184     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2185   } while (0)
2186
2187
2188 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2189    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2190    byte length of this composition information, CHARS is the number of
2191    characters composed by this composition.  */
2192
2193 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2194   do {                                                                  \
2195     enum composition_method method = c - 0xF2;                          \
2196     int nbytes, nchars;                                                 \
2197                                                                         \
2198     ONE_MORE_BYTE (c);                                                  \
2199     if (c < 0)                                                          \
2200       goto invalid_code;                                                \
2201     nbytes = c - 0xA0;                                                  \
2202     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2203       goto invalid_code;                                                \
2204     ONE_MORE_BYTE (c);                                                  \
2205     nchars = c - 0xA0;                                                  \
2206     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2207       goto invalid_code;                                                \
2208     cmp_status->old_form = 0;                                           \
2209     cmp_status->method = method;                                        \
2210     if (method == COMPOSITION_RELATIVE)                                 \
2211       cmp_status->state = COMPOSING_CHAR;                               \
2212     else                                                                \
2213       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2214     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2215     cmp_status->nchars = nchars;                                        \
2216     cmp_status->ncomps = nbytes - 4;                                    \
2217     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2218   } while (0)
2219
2220
2221 /* Start of Emacs 20 style format for relative composition.  */
2222
2223 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2224   do {                                                          \
2225     cmp_status->old_form = 1;                                   \
2226     cmp_status->method = COMPOSITION_RELATIVE;                  \
2227     cmp_status->state = COMPOSING_CHAR;                         \
2228     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2229     cmp_status->nchars = cmp_status->ncomps = 0;                \
2230     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2231   } while (0)
2232
2233
2234 /* Start of Emacs 20 style format for rule-base composition.  */
2235
2236 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2237   do {                                                          \
2238     cmp_status->old_form = 1;                                   \
2239     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2240     cmp_status->state = COMPOSING_CHAR;                         \
2241     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2242     cmp_status->nchars = cmp_status->ncomps = 0;                \
2243     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2244   } while (0)
2245
2246
2247 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2248   do {                                                  \
2249     const unsigned char *current_src = src;             \
2250                                                         \
2251     ONE_MORE_BYTE (c);                                  \
2252     if (c < 0)                                          \
2253       goto invalid_code;                                \
2254     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2255         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2256       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2257     else if (c < 0xA0)                                  \
2258       goto invalid_code;                                \
2259     else if (c < 0xC0)                                  \
2260       {                                                 \
2261         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2262         /* Re-read C as a composition component.  */    \
2263         src = current_src;                              \
2264       }                                                 \
2265     else if (c == 0xFF)                                 \
2266       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2267     else                                                \
2268       goto invalid_code;                                \
2269   } while (0)
2270
2271 #define EMACS_MULE_COMPOSITION_END()                            \
2272   do {                                                          \
2273     int idx = - cmp_status->length;                             \
2274                                                                 \
2275     if (cmp_status->old_form)                                   \
2276       charbuf[idx + 2] = cmp_status->nchars;                    \
2277     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2278       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2279     cmp_status->state = COMPOSING_NO;                           \
2280   } while (0)
2281
2282
2283 static int
2284 emacs_mule_finish_composition (int *charbuf,
2285                                struct composition_status *cmp_status)
2286 {
2287   int idx = - cmp_status->length;
2288   int new_chars;
2289
2290   if (cmp_status->old_form && cmp_status->nchars > 0)
2291     {
2292       charbuf[idx + 2] = cmp_status->nchars;
2293       new_chars = 0;
2294       if (cmp_status->method == COMPOSITION_WITH_RULE
2295           && cmp_status->state == COMPOSING_CHAR)
2296         {
2297           /* The last rule was invalid.  */
2298           int rule = charbuf[-1] + 0xA0;
2299
2300           charbuf[-2] = BYTE8_TO_CHAR (rule);
2301           charbuf[-1] = -1;
2302           new_chars = 1;
2303         }
2304     }
2305   else
2306     {
2307       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2308
2309       if (cmp_status->method == COMPOSITION_WITH_RULE)
2310         {
2311           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2312           charbuf[idx++] = -3;
2313           charbuf[idx++] = 0;
2314           new_chars = 1;
2315         }
2316       else
2317         {
2318           int nchars = charbuf[idx + 1] + 0xA0;
2319           int nbytes = charbuf[idx + 2] + 0xA0;
2320
2321           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2322           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2323           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2324           charbuf[idx++] = -1;
2325           new_chars = 4;
2326         }
2327     }
2328   cmp_status->state = COMPOSING_NO;
2329   return new_chars;
2330 }
2331
2332 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2333   do {                                                                    \
2334     if (cmp_status->state != COMPOSING_NO)                                \
2335       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2336   } while (0)
2337
2338
2339 static void
2340 decode_coding_emacs_mule (struct coding_system *coding)
2341 {
2342   const unsigned char *src = coding->source + coding->consumed;
2343   const unsigned char *src_end = coding->source + coding->src_bytes;
2344   const unsigned char *src_base;
2345   int *charbuf = coding->charbuf + coding->charbuf_used;
2346   /* We may produce two annotations (charset and composition) in one
2347      loop and one more charset annotation at the end.  */
2348   int *charbuf_end
2349     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2350       /* We can produce up to 2 characters in a loop.  */
2351       - 1;
2352   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2353   bool multibytep = coding->src_multibyte;
2354   ptrdiff_t char_offset = coding->produced_char;
2355   ptrdiff_t last_offset = char_offset;
2356   int last_id = charset_ascii;
2357   bool eol_dos
2358     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2359   int byte_after_cr = -1;
2360   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2361
2362   if (cmp_status->state != COMPOSING_NO)
2363     {
2364       int i;
2365
2366       if (charbuf_end - charbuf < cmp_status->length)
2367         emacs_abort ();
2368       for (i = 0; i < cmp_status->length; i++)
2369         *charbuf++ = cmp_status->carryover[i];
2370       coding->annotated = 1;
2371     }
2372
2373   while (1)
2374     {
2375       int c, id IF_LINT (= 0);
2376
2377       src_base = src;
2378       consumed_chars_base = consumed_chars;
2379
2380       if (charbuf >= charbuf_end)
2381         {
2382           if (byte_after_cr >= 0)
2383             src_base--;
2384           break;
2385         }
2386
2387       if (byte_after_cr >= 0)
2388         c = byte_after_cr, byte_after_cr = -1;
2389       else
2390         ONE_MORE_BYTE (c);
2391
2392       if (c < 0 || c == 0x80)
2393         {
2394           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2395           if (c < 0)
2396             {
2397               *charbuf++ = -c;
2398               char_offset++;
2399             }
2400           else
2401             DECODE_EMACS_MULE_COMPOSITION_START ();
2402           continue;
2403         }
2404
2405       if (c < 0x80)
2406         {
2407           if (eol_dos && c == '\r')
2408             ONE_MORE_BYTE (byte_after_cr);
2409           id = charset_ascii;
2410           if (cmp_status->state != COMPOSING_NO)
2411             {
2412               if (cmp_status->old_form)
2413                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2414               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2415                 cmp_status->ncomps--;
2416             }
2417         }
2418       else
2419         {
2420           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2421           /* emacs_mule_char can load a charset map from a file, which
2422              allocates a large structure and might cause buffer text
2423              to be relocated as result.  Thus, we need to remember the
2424              original pointer to buffer text, and fix up all related
2425              pointers after the call.  */
2426           const unsigned char *orig = coding->source;
2427           ptrdiff_t offset;
2428
2429           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2430                                cmp_status);
2431           offset = coding->source - orig;
2432           if (offset)
2433             {
2434               src += offset;
2435               src_base += offset;
2436               src_end += offset;
2437             }
2438           if (c < 0)
2439             {
2440               if (c == -1)
2441                 goto invalid_code;
2442               if (c == -2)
2443                 break;
2444             }
2445           src = src_base + nbytes;
2446           consumed_chars = consumed_chars_base + nchars;
2447           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2448             cmp_status->ncomps -= nchars;
2449         }
2450
2451       /* Now if C >= 0, we found a normally encoded character, if C <
2452          0, we found an old-style composition component character or
2453          rule.  */
2454
2455       if (cmp_status->state == COMPOSING_NO)
2456         {
2457           if (last_id != id)
2458             {
2459               if (last_id != charset_ascii)
2460                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2461                                   last_id);
2462               last_id = id;
2463               last_offset = char_offset;
2464             }
2465           *charbuf++ = c;
2466           char_offset++;
2467         }
2468       else if (cmp_status->state == COMPOSING_CHAR)
2469         {
2470           if (cmp_status->old_form)
2471             {
2472               if (c >= 0)
2473                 {
2474                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2475                   *charbuf++ = c;
2476                   char_offset++;
2477                 }
2478               else
2479                 {
2480                   *charbuf++ = -c;
2481                   cmp_status->nchars++;
2482                   cmp_status->length++;
2483                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2484                     EMACS_MULE_COMPOSITION_END ();
2485                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2486                     cmp_status->state = COMPOSING_RULE;
2487                 }
2488             }
2489           else
2490             {
2491               *charbuf++ = c;
2492               cmp_status->length++;
2493               cmp_status->nchars--;
2494               if (cmp_status->nchars == 0)
2495                 EMACS_MULE_COMPOSITION_END ();
2496             }
2497         }
2498       else if (cmp_status->state == COMPOSING_RULE)
2499         {
2500           int rule;
2501
2502           if (c >= 0)
2503             {
2504               EMACS_MULE_COMPOSITION_END ();
2505               *charbuf++ = c;
2506               char_offset++;
2507             }
2508           else
2509             {
2510               c = -c;
2511               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2512               if (rule < 0)
2513                 goto invalid_code;
2514               *charbuf++ = -2;
2515               *charbuf++ = rule;
2516               cmp_status->length += 2;
2517               cmp_status->state = COMPOSING_CHAR;
2518             }
2519         }
2520       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2521         {
2522           *charbuf++ = c;
2523           cmp_status->length++;
2524           if (cmp_status->ncomps == 0)
2525             cmp_status->state = COMPOSING_CHAR;
2526           else if (cmp_status->ncomps > 0)
2527             {
2528               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2529                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2530             }
2531           else
2532             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2533         }
2534       else                      /* COMPOSING_COMPONENT_RULE */
2535         {
2536           int rule;
2537
2538           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2539           if (rule < 0)
2540             goto invalid_code;
2541           *charbuf++ = -2;
2542           *charbuf++ = rule;
2543           cmp_status->length += 2;
2544           cmp_status->ncomps--;
2545           if (cmp_status->ncomps > 0)
2546             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2547           else
2548             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2549         }
2550       continue;
2551
2552     invalid_code:
2553       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2554       src = src_base;
2555       consumed_chars = consumed_chars_base;
2556       ONE_MORE_BYTE (c);
2557       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2558       char_offset++;
2559       coding->errors++;
2560     }
2561
2562  no_more_source:
2563   if (cmp_status->state != COMPOSING_NO)
2564     {
2565       if (coding->mode & CODING_MODE_LAST_BLOCK)
2566         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2567       else
2568         {
2569           int i;
2570
2571           charbuf -= cmp_status->length;
2572           for (i = 0; i < cmp_status->length; i++)
2573             cmp_status->carryover[i] = charbuf[i];
2574         }
2575     }
2576   if (last_id != charset_ascii)
2577     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2578   coding->consumed_char += consumed_chars_base;
2579   coding->consumed = src_base - coding->source;
2580   coding->charbuf_used = charbuf - coding->charbuf;
2581 }
2582
2583
2584 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2585   do {                                          \
2586     if (id < 0xA0)                              \
2587       codes[0] = id, codes[1] = 0;              \
2588     else if (id < 0xE0)                         \
2589       codes[0] = 0x9A, codes[1] = id;           \
2590     else if (id < 0xF0)                         \
2591       codes[0] = 0x9B, codes[1] = id;           \
2592     else if (id < 0xF5)                         \
2593       codes[0] = 0x9C, codes[1] = id;           \
2594     else                                        \
2595       codes[0] = 0x9D, codes[1] = id;           \
2596   } while (0);
2597
2598
2599 static bool
2600 encode_coding_emacs_mule (struct coding_system *coding)
2601 {
2602   bool multibytep = coding->dst_multibyte;
2603   int *charbuf = coding->charbuf;
2604   int *charbuf_end = charbuf + coding->charbuf_used;
2605   unsigned char *dst = coding->destination + coding->produced;
2606   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2607   int safe_room = 8;
2608   ptrdiff_t produced_chars = 0;
2609   Lisp_Object attrs, charset_list;
2610   int c;
2611   int preferred_charset_id = -1;
2612
2613   CODING_GET_INFO (coding, attrs, charset_list);
2614   if (! EQ (charset_list, Vemacs_mule_charset_list))
2615     {
2616       charset_list = Vemacs_mule_charset_list;
2617       ASET (attrs, coding_attr_charset_list, charset_list);
2618     }
2619
2620   while (charbuf < charbuf_end)
2621     {
2622       ASSURE_DESTINATION (safe_room);
2623       c = *charbuf++;
2624
2625       if (c < 0)
2626         {
2627           /* Handle an annotation.  */
2628           switch (*charbuf)
2629             {
2630             case CODING_ANNOTATE_COMPOSITION_MASK:
2631               /* Not yet implemented.  */
2632               break;
2633             case CODING_ANNOTATE_CHARSET_MASK:
2634               preferred_charset_id = charbuf[3];
2635               if (preferred_charset_id >= 0
2636                   && NILP (Fmemq (make_number (preferred_charset_id),
2637                                   charset_list)))
2638                 preferred_charset_id = -1;
2639               break;
2640             default:
2641               emacs_abort ();
2642             }
2643           charbuf += -c - 1;
2644           continue;
2645         }
2646
2647       if (ASCII_CHAR_P (c))
2648         EMIT_ONE_ASCII_BYTE (c);
2649       else if (CHAR_BYTE8_P (c))
2650         {
2651           c = CHAR_TO_BYTE8 (c);
2652           EMIT_ONE_BYTE (c);
2653         }
2654       else
2655         {
2656           struct charset *charset;
2657           unsigned code;
2658           int dimension;
2659           int emacs_mule_id;
2660           unsigned char leading_codes[2];
2661
2662           if (preferred_charset_id >= 0)
2663             {
2664               bool result;
2665
2666               charset = CHARSET_FROM_ID (preferred_charset_id);
2667               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2668               if (result)
2669                 code = ENCODE_CHAR (charset, c);
2670               else
2671                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2672                                      &code, charset);
2673             }
2674           else
2675             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2676                                  &code, charset);
2677           if (! charset)
2678             {
2679               c = coding->default_char;
2680               if (ASCII_CHAR_P (c))
2681                 {
2682                   EMIT_ONE_ASCII_BYTE (c);
2683                   continue;
2684                 }
2685               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2686                                    &code, charset);
2687             }
2688           dimension = CHARSET_DIMENSION (charset);
2689           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2690           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2691           EMIT_ONE_BYTE (leading_codes[0]);
2692           if (leading_codes[1])
2693             EMIT_ONE_BYTE (leading_codes[1]);
2694           if (dimension == 1)
2695             EMIT_ONE_BYTE (code | 0x80);
2696           else
2697             {
2698               code |= 0x8080;
2699               EMIT_ONE_BYTE (code >> 8);
2700               EMIT_ONE_BYTE (code & 0xFF);
2701             }
2702         }
2703     }
2704   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2705   coding->produced_char += produced_chars;
2706   coding->produced = dst - coding->destination;
2707   return 0;
2708 }
2709
2710 \f
2711 /*** 7. ISO2022 handlers ***/
2712
2713 /* The following note describes the coding system ISO2022 briefly.
2714    Since the intention of this note is to help understand the
2715    functions in this file, some parts are NOT ACCURATE or are OVERLY
2716    SIMPLIFIED.  For thorough understanding, please refer to the
2717    original document of ISO2022.  This is equivalent to the standard
2718    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2719
2720    ISO2022 provides many mechanisms to encode several character sets
2721    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2722    is encoded using bytes less than 128.  This may make the encoded
2723    text a little bit longer, but the text passes more easily through
2724    several types of gateway, some of which strip off the MSB (Most
2725    Significant Bit).
2726
2727    There are two kinds of character sets: control character sets and
2728    graphic character sets.  The former contain control characters such
2729    as `newline' and `escape' to provide control functions (control
2730    functions are also provided by escape sequences).  The latter
2731    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2732    two control character sets and many graphic character sets.
2733
2734    Graphic character sets are classified into one of the following
2735    four classes, according to the number of bytes (DIMENSION) and
2736    number of characters in one dimension (CHARS) of the set:
2737    - DIMENSION1_CHARS94
2738    - DIMENSION1_CHARS96
2739    - DIMENSION2_CHARS94
2740    - DIMENSION2_CHARS96
2741
2742    In addition, each character set is assigned an identification tag,
2743    unique for each set, called the "final character" (denoted as <F>
2744    hereafter).  The <F> of each character set is decided by ECMA(*)
2745    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2746    (0x30..0x3F are for private use only).
2747
2748    Note (*): ECMA = European Computer Manufacturers Association
2749
2750    Here are examples of graphic character sets [NAME(<F>)]:
2751         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2752         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2753         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2754         o DIMENSION2_CHARS96 -- none for the moment
2755
2756    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2757         C0 [0x00..0x1F] -- control character plane 0
2758         GL [0x20..0x7F] -- graphic character plane 0
2759         C1 [0x80..0x9F] -- control character plane 1
2760         GR [0xA0..0xFF] -- graphic character plane 1
2761
2762    A control character set is directly designated and invoked to C0 or
2763    C1 by an escape sequence.  The most common case is that:
2764    - ISO646's  control character set is designated/invoked to C0, and
2765    - ISO6429's control character set is designated/invoked to C1,
2766    and usually these designations/invocations are omitted in encoded
2767    text.  In a 7-bit environment, only C0 can be used, and a control
2768    character for C1 is encoded by an appropriate escape sequence to
2769    fit into the environment.  All control characters for C1 are
2770    defined to have corresponding escape sequences.
2771
2772    A graphic character set is at first designated to one of four
2773    graphic registers (G0 through G3), then these graphic registers are
2774    invoked to GL or GR.  These designations and invocations can be
2775    done independently.  The most common case is that G0 is invoked to
2776    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2777    these invocations and designations are omitted in encoded text.
2778    In a 7-bit environment, only GL can be used.
2779
2780    When a graphic character set of CHARS94 is invoked to GL, codes
2781    0x20 and 0x7F of the GL area work as control characters SPACE and
2782    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2783    be used.
2784
2785    There are two ways of invocation: locking-shift and single-shift.
2786    With locking-shift, the invocation lasts until the next different
2787    invocation, whereas with single-shift, the invocation affects the
2788    following character only and doesn't affect the locking-shift
2789    state.  Invocations are done by the following control characters or
2790    escape sequences:
2791
2792    ----------------------------------------------------------------------
2793    abbrev  function                  cntrl escape seq   description
2794    ----------------------------------------------------------------------
2795    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2796    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2797    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2798    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2799    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2800    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2801    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2802    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2803    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2804    ----------------------------------------------------------------------
2805    (*) These are not used by any known coding system.
2806
2807    Control characters for these functions are defined by macros
2808    ISO_CODE_XXX in `coding.h'.
2809
2810    Designations are done by the following escape sequences:
2811    ----------------------------------------------------------------------
2812    escape sequence      description
2813    ----------------------------------------------------------------------
2814    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2815    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2816    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2817    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2818    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2819    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2820    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2821    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2822    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2823    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2824    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2825    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2826    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2827    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2828    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2829    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2830    ----------------------------------------------------------------------
2831
2832    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2833    of dimension 1, chars 94, and final character <F>, etc...
2834
2835    Note (*): Although these designations are not allowed in ISO2022,
2836    Emacs accepts them on decoding, and produces them on encoding
2837    CHARS96 character sets in a coding system which is characterized as
2838    7-bit environment, non-locking-shift, and non-single-shift.
2839
2840    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2841    '(' must be omitted.  We refer to this as "short-form" hereafter.
2842
2843    Now you may notice that there are a lot of ways of encoding the
2844    same multilingual text in ISO2022.  Actually, there exist many
2845    coding systems such as Compound Text (used in X11's inter client
2846    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2847    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2848    localized platforms), and all of these are variants of ISO2022.
2849
2850    In addition to the above, Emacs handles two more kinds of escape
2851    sequences: ISO6429's direction specification and Emacs' private
2852    sequence for specifying character composition.
2853
2854    ISO6429's direction specification takes the following form:
2855         o CSI ']'      -- end of the current direction
2856         o CSI '0' ']'  -- end of the current direction
2857         o CSI '1' ']'  -- start of left-to-right text
2858         o CSI '2' ']'  -- start of right-to-left text
2859    The control character CSI (0x9B: control sequence introducer) is
2860    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2861
2862    Character composition specification takes the following form:
2863         o ESC '0' -- start relative composition
2864         o ESC '1' -- end composition
2865         o ESC '2' -- start rule-base composition (*)
2866         o ESC '3' -- start relative composition with alternate chars  (**)
2867         o ESC '4' -- start rule-base composition with alternate chars  (**)
2868   Since these are not standard escape sequences of any ISO standard,
2869   the use of them with these meanings is restricted to Emacs only.
2870
2871   (*) This form is used only in Emacs 20.7 and older versions,
2872   but newer versions can safely decode it.
2873   (**) This form is used only in Emacs 21.1 and newer versions,
2874   and older versions can't decode it.
2875
2876   Here's a list of example usages of these composition escape
2877   sequences (categorized by `enum composition_method').
2878
2879   COMPOSITION_RELATIVE:
2880         ESC 0 CHAR [ CHAR ] ESC 1
2881   COMPOSITION_WITH_RULE:
2882         ESC 2 CHAR [ RULE CHAR ] ESC 1
2883   COMPOSITION_WITH_ALTCHARS:
2884         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2885   COMPOSITION_WITH_RULE_ALTCHARS:
2886         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2887
2888 static enum iso_code_class_type iso_code_class[256];
2889
2890 #define SAFE_CHARSET_P(coding, id)      \
2891   ((id) <= (coding)->max_charset_id     \
2892    && (coding)->safe_charsets[id] != 255)
2893
2894 static void
2895 setup_iso_safe_charsets (Lisp_Object attrs)
2896 {
2897   Lisp_Object charset_list, safe_charsets;
2898   Lisp_Object request;
2899   Lisp_Object reg_usage;
2900   Lisp_Object tail;
2901   EMACS_INT reg94, reg96;
2902   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2903   int max_charset_id;
2904
2905   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2906   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2907       && ! EQ (charset_list, Viso_2022_charset_list))
2908     {
2909       charset_list = Viso_2022_charset_list;
2910       ASET (attrs, coding_attr_charset_list, charset_list);
2911       ASET (attrs, coding_attr_safe_charsets, Qnil);
2912     }
2913
2914   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2915     return;
2916
2917   max_charset_id = 0;
2918   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2919     {
2920       int id = XINT (XCAR (tail));
2921       if (max_charset_id < id)
2922         max_charset_id = id;
2923     }
2924
2925   safe_charsets = make_uninit_string (max_charset_id + 1);
2926   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2927   request = AREF (attrs, coding_attr_iso_request);
2928   reg_usage = AREF (attrs, coding_attr_iso_usage);
2929   reg94 = XINT (XCAR (reg_usage));
2930   reg96 = XINT (XCDR (reg_usage));
2931
2932   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2933     {
2934       Lisp_Object id;
2935       Lisp_Object reg;
2936       struct charset *charset;
2937
2938       id = XCAR (tail);
2939       charset = CHARSET_FROM_ID (XINT (id));
2940       reg = Fcdr (Fassq (id, request));
2941       if (! NILP (reg))
2942         SSET (safe_charsets, XINT (id), XINT (reg));
2943       else if (charset->iso_chars_96)
2944         {
2945           if (reg96 < 4)
2946             SSET (safe_charsets, XINT (id), reg96);
2947         }
2948       else
2949         {
2950           if (reg94 < 4)
2951             SSET (safe_charsets, XINT (id), reg94);
2952         }
2953     }
2954   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2955 }
2956
2957
2958 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2959    Return true if a text is encoded in one of ISO-2022 based coding
2960    systems.  */
2961
2962 static bool
2963 detect_coding_iso_2022 (struct coding_system *coding,
2964                         struct coding_detection_info *detect_info)
2965 {
2966   const unsigned char *src = coding->source, *src_base = src;
2967   const unsigned char *src_end = coding->source + coding->src_bytes;
2968   bool multibytep = coding->src_multibyte;
2969   bool single_shifting = 0;
2970   int id;
2971   int c, c1;
2972   ptrdiff_t consumed_chars = 0;
2973   int i;
2974   int rejected = 0;
2975   int found = 0;
2976   int composition_count = -1;
2977
2978   detect_info->checked |= CATEGORY_MASK_ISO;
2979
2980   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2981     {
2982       struct coding_system *this = &(coding_categories[i]);
2983       Lisp_Object attrs, val;
2984
2985       if (this->id < 0)
2986         continue;
2987       attrs = CODING_ID_ATTRS (this->id);
2988       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2989           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2990         setup_iso_safe_charsets (attrs);
2991       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2992       this->max_charset_id = SCHARS (val) - 1;
2993       this->safe_charsets = SDATA (val);
2994     }
2995
2996   /* A coding system of this category is always ASCII compatible.  */
2997   src += coding->head_ascii;
2998
2999   while (rejected != CATEGORY_MASK_ISO)
3000     {
3001       src_base = src;
3002       ONE_MORE_BYTE (c);
3003       switch (c)
3004         {
3005         case ISO_CODE_ESC:
3006           if (inhibit_iso_escape_detection)
3007             break;
3008           single_shifting = 0;
3009           ONE_MORE_BYTE (c);
3010           if (c == 'N' || c == 'O')
3011             {
3012               /* ESC <Fe> for SS2 or SS3.  */
3013               single_shifting = 1;
3014               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3015             }
3016           else if (c == '1')
3017             {
3018               /* End of composition.  */
3019               if (composition_count < 0
3020                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3021                 /* Invalid */
3022                 break;
3023               composition_count = -1;
3024               found |= CATEGORY_MASK_ISO;
3025             }
3026           else if (c >= '0' && c <= '4')
3027             {
3028               /* ESC <Fp> for start/end composition.  */
3029               composition_count = 0;
3030             }
3031           else
3032             {
3033               if (c >= '(' && c <= '/')
3034                 {
3035                   /* Designation sequence for a charset of dimension 1.  */
3036                   ONE_MORE_BYTE (c1);
3037                   if (c1 < ' ' || c1 >= 0x80
3038                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3039                     /* Invalid designation sequence.  Just ignore.  */
3040                     break;
3041                 }
3042               else if (c == '$')
3043                 {
3044                   /* Designation sequence for a charset of dimension 2.  */
3045                   ONE_MORE_BYTE (c);
3046                   if (c >= '@' && c <= 'B')
3047                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3048                     id = iso_charset_table[1][0][c];
3049                   else if (c >= '(' && c <= '/')
3050                     {
3051                       ONE_MORE_BYTE (c1);
3052                       if (c1 < ' ' || c1 >= 0x80
3053                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3054                         /* Invalid designation sequence.  Just ignore.  */
3055                         break;
3056                     }
3057                   else
3058                     /* Invalid designation sequence.  Just ignore it.  */
3059                     break;
3060                 }
3061               else
3062                 {
3063                   /* Invalid escape sequence.  Just ignore it.  */
3064                   break;
3065                 }
3066
3067               /* We found a valid designation sequence for CHARSET.  */
3068               rejected |= CATEGORY_MASK_ISO_8BIT;
3069               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3070                                   id))
3071                 found |= CATEGORY_MASK_ISO_7;
3072               else
3073                 rejected |= CATEGORY_MASK_ISO_7;
3074               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3075                                   id))
3076                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3077               else
3078                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3079               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3080                                   id))
3081                 found |= CATEGORY_MASK_ISO_7_ELSE;
3082               else
3083                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3084               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3085                                   id))
3086                 found |= CATEGORY_MASK_ISO_8_ELSE;
3087               else
3088                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3089             }
3090           break;
3091
3092         case ISO_CODE_SO:
3093         case ISO_CODE_SI:
3094           /* Locking shift out/in.  */
3095           if (inhibit_iso_escape_detection)
3096             break;
3097           single_shifting = 0;
3098           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3099           break;
3100
3101         case ISO_CODE_CSI:
3102           /* Control sequence introducer.  */
3103           single_shifting = 0;
3104           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3105           found |= CATEGORY_MASK_ISO_8_ELSE;
3106           goto check_extra_latin;
3107
3108         case ISO_CODE_SS2:
3109         case ISO_CODE_SS3:
3110           /* Single shift.   */
3111           if (inhibit_iso_escape_detection)
3112             break;
3113           single_shifting = 0;
3114           rejected |= CATEGORY_MASK_ISO_7BIT;
3115           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116               & CODING_ISO_FLAG_SINGLE_SHIFT)
3117             {
3118               found |= CATEGORY_MASK_ISO_8_1;
3119               single_shifting = 1;
3120             }
3121           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3122               & CODING_ISO_FLAG_SINGLE_SHIFT)
3123             {
3124               found |= CATEGORY_MASK_ISO_8_2;
3125               single_shifting = 1;
3126             }
3127           if (single_shifting)
3128             break;
3129           goto check_extra_latin;
3130
3131         default:
3132           if (c < 0)
3133             continue;
3134           if (c < 0x80)
3135             {
3136               if (composition_count >= 0)
3137                 composition_count++;
3138               single_shifting = 0;
3139               break;
3140             }
3141           if (c >= 0xA0)
3142             {
3143               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3144               found |= CATEGORY_MASK_ISO_8_1;
3145               /* Check the length of succeeding codes of the range
3146                  0xA0..0FF.  If the byte length is even, we include
3147                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3148                  only when we are not single shifting.  */
3149               if (! single_shifting
3150                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3151                 {
3152                   int len = 1;
3153                   while (src < src_end)
3154                     {
3155                       src_base = src;
3156                       ONE_MORE_BYTE (c);
3157                       if (c < 0xA0)
3158                         {
3159                           src = src_base;
3160                           break;
3161                         }
3162                       len++;
3163                     }
3164
3165                   if (len & 1 && src < src_end)
3166                     {
3167                       rejected |= CATEGORY_MASK_ISO_8_2;
3168                       if (composition_count >= 0)
3169                         composition_count += len;
3170                     }
3171                   else
3172                     {
3173                       found |= CATEGORY_MASK_ISO_8_2;
3174                       if (composition_count >= 0)
3175                         composition_count += len / 2;
3176                     }
3177                 }
3178               break;
3179             }
3180         check_extra_latin:
3181           if (! VECTORP (Vlatin_extra_code_table)
3182               || NILP (AREF (Vlatin_extra_code_table, c)))
3183             {
3184               rejected = CATEGORY_MASK_ISO;
3185               break;
3186             }
3187           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3188               & CODING_ISO_FLAG_LATIN_EXTRA)
3189             found |= CATEGORY_MASK_ISO_8_1;
3190           else
3191             rejected |= CATEGORY_MASK_ISO_8_1;
3192           rejected |= CATEGORY_MASK_ISO_8_2;
3193           break;
3194         }
3195     }
3196   detect_info->rejected |= CATEGORY_MASK_ISO;
3197   return 0;
3198
3199  no_more_source:
3200   detect_info->rejected |= rejected;
3201   detect_info->found |= (found & ~rejected);
3202   return 1;
3203 }
3204
3205
3206 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3207    escape sequence should be kept.  */
3208 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3209   do {                                                                  \
3210     int id, prev;                                                       \
3211                                                                         \
3212     if (final < '0' || final >= 128                                     \
3213         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3214         || !SAFE_CHARSET_P (coding, id))                                \
3215       {                                                                 \
3216         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3217         chars_96 = -1;                                                  \
3218         break;                                                          \
3219       }                                                                 \
3220     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3221     if (id == charset_jisx0201_roman)                                   \
3222       {                                                                 \
3223         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3224           id = charset_ascii;                                           \
3225       }                                                                 \
3226     else if (id == charset_jisx0208_1978)                               \
3227       {                                                                 \
3228         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3229           id = charset_jisx0208;                                        \
3230       }                                                                 \
3231     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3232     /* If there was an invalid designation to REG previously, and this  \
3233        designation is ASCII to REG, we should keep this designation     \
3234        sequence.  */                                                    \
3235     if (prev == -2 && id == charset_ascii)                              \
3236       chars_96 = -1;                                                    \
3237   } while (0)
3238
3239
3240 /* Handle these composition sequence (ALT: alternate char):
3241
3242    (1) relative composition: ESC 0 CHAR ... ESC 1
3243    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3244    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3245    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3246
3247    When the start sequence (ESC 0/2/3/4) is found, this annotation
3248    header is produced.
3249
3250         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3251
3252    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3253    produced until the end sequence (ESC 1) is found:
3254
3255    (1) CHAR ... CHAR
3256    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3257    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3258    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3259
3260    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3261    annotation header is updated as below:
3262
3263    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3264    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3265    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3266    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3267
3268    If an error is found while composing, the annotation header is
3269    changed to:
3270
3271         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3272
3273    and the sequence [ -2 DECODED-RULE ] is changed to the original
3274    byte sequence as below:
3275         o the original byte sequence is B: [ B -1 ]
3276         o the original byte sequence is B1 B2: [ B1 B2 ]
3277    and the sequence [ -1 -1 ] is changed to the original byte
3278    sequence:
3279         [ ESC '0' ]
3280 */
3281
3282 /* Decode a composition rule C1 and maybe one more byte from the
3283    source, and set RULE to the encoded composition rule.  If the rule
3284    is invalid, goto invalid_code.  */
3285
3286 #define DECODE_COMPOSITION_RULE(rule)                                   \
3287   do {                                                                  \
3288     rule = c1 - 32;                                                     \
3289     if (rule < 0)                                                       \
3290       goto invalid_code;                                                \
3291     if (rule < 81)              /* old format (before ver.21) */        \
3292       {                                                                 \
3293         int gref = (rule) / 9;                                          \
3294         int nref = (rule) % 9;                                          \
3295         if (gref == 4) gref = 10;                                       \
3296         if (nref == 4) nref = 10;                                       \
3297         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3298       }                                                                 \
3299     else                        /* new format (after ver.21) */         \
3300       {                                                                 \
3301         int b;                                                          \
3302                                                                         \
3303         ONE_MORE_BYTE (b);                                              \
3304         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3305           goto invalid_code;                                            \
3306         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3307         rule += 0x100;   /* Distinguish it from the old format.  */     \
3308       }                                                                 \
3309   } while (0)
3310
3311 #define ENCODE_COMPOSITION_RULE(rule)                           \
3312   do {                                                          \
3313     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3314                                                                 \
3315     if (rule < 0x100)           /* old format */                \
3316       {                                                         \
3317         if (gref == 10) gref = 4;                               \
3318         if (nref == 10) nref = 4;                               \
3319         charbuf[idx] = 32 + gref * 9 + nref;                    \
3320         charbuf[idx + 1] = -1;                                  \
3321         new_chars++;                                            \
3322       }                                                         \
3323     else                                /* new format */        \
3324       {                                                         \
3325         charbuf[idx] = 32 + 81 + gref;                          \
3326         charbuf[idx + 1] = 32 + nref;                           \
3327         new_chars += 2;                                         \
3328       }                                                         \
3329   } while (0)
3330
3331 /* Finish the current composition as invalid.  */
3332
3333 static int
3334 finish_composition (int *charbuf, struct composition_status *cmp_status)
3335 {
3336   int idx = - cmp_status->length;
3337   int new_chars;
3338
3339   /* Recover the original ESC sequence */
3340   charbuf[idx++] = ISO_CODE_ESC;
3341   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3342                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3343                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3344                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3345                     : '4');
3346   charbuf[idx++] = -2;
3347   charbuf[idx++] = 0;
3348   charbuf[idx++] = -1;
3349   new_chars = cmp_status->nchars;
3350   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3351     for (; idx < 0; idx++)
3352       {
3353         int elt = charbuf[idx];
3354
3355         if (elt == -2)
3356           {
3357             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3358             idx++;
3359           }
3360         else if (elt == -1)
3361           {
3362             charbuf[idx++] = ISO_CODE_ESC;
3363             charbuf[idx] = '0';
3364             new_chars += 2;
3365           }
3366       }
3367   cmp_status->state = COMPOSING_NO;
3368   return new_chars;
3369 }
3370
3371 /* If characters are under composition, finish the composition.  */
3372 #define MAYBE_FINISH_COMPOSITION()                              \
3373   do {                                                          \
3374     if (cmp_status->state != COMPOSING_NO)                      \
3375       char_offset += finish_composition (charbuf, cmp_status);  \
3376   } while (0)
3377
3378 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3379
3380    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3381    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3382    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3383    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3384
3385    Produce this annotation sequence now:
3386
3387    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3388 */
3389
3390 #define DECODE_COMPOSITION_START(c1)                                       \
3391   do {                                                                     \
3392     if (c1 == '0'                                                          \
3393         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3394              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3395             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3396                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3397       {                                                                    \
3398         *charbuf++ = -1;                                                   \
3399         *charbuf++= -1;                                                    \
3400         cmp_status->state = COMPOSING_CHAR;                                \
3401         cmp_status->length += 2;                                           \
3402       }                                                                    \
3403     else                                                                   \
3404       {                                                                    \
3405         MAYBE_FINISH_COMPOSITION ();                                       \
3406         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3407                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3408                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3409                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3410         cmp_status->state                                                  \
3411           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3412         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3413         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3414         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3415         coding->annotated = 1;                                             \
3416       }                                                                    \
3417   } while (0)
3418
3419
3420 /* Handle composition end sequence ESC 1.  */
3421
3422 #define DECODE_COMPOSITION_END()                                        \
3423   do {                                                                  \
3424     if (cmp_status->nchars == 0                                         \
3425         || ((cmp_status->state == COMPOSING_CHAR)                       \
3426             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3427       {                                                                 \
3428         MAYBE_FINISH_COMPOSITION ();                                    \
3429         goto invalid_code;                                              \
3430       }                                                                 \
3431     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3432       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3433     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3434       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3435     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3436     char_offset += cmp_status->nchars;                                  \
3437     cmp_status->state = COMPOSING_NO;                                   \
3438   } while (0)
3439
3440 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3441
3442 #define STORE_COMPOSITION_RULE(rule)    \
3443   do {                                  \
3444     *charbuf++ = -2;                    \
3445     *charbuf++ = rule;                  \
3446     cmp_status->length += 2;            \
3447     cmp_status->state--;                \
3448   } while (0)
3449
3450 /* Store a composed char or a component char C in charbuf, and update
3451    cmp_status.  */
3452
3453 #define STORE_COMPOSITION_CHAR(c)                                       \
3454   do {                                                                  \
3455     *charbuf++ = (c);                                                   \
3456     cmp_status->length++;                                               \
3457     if (cmp_status->state == COMPOSING_CHAR)                            \
3458       cmp_status->nchars++;                                             \
3459     else                                                                \
3460       cmp_status->ncomps++;                                             \
3461     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3462         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3463             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3464       cmp_status->state++;                                              \
3465   } while (0)
3466
3467
3468 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3469
3470 static void
3471 decode_coding_iso_2022 (struct coding_system *coding)
3472 {
3473   const unsigned char *src = coding->source + coding->consumed;
3474   const unsigned char *src_end = coding->source + coding->src_bytes;
3475   const unsigned char *src_base;
3476   int *charbuf = coding->charbuf + coding->charbuf_used;
3477   /* We may produce two annotations (charset and composition) in one
3478      loop and one more charset annotation at the end.  */
3479   int *charbuf_end
3480     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3481   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3482   bool multibytep = coding->src_multibyte;
3483   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3484   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3485   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3486   int charset_id_2, charset_id_3;
3487   struct charset *charset;
3488   int c;
3489   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3490   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3491   ptrdiff_t char_offset = coding->produced_char;
3492   ptrdiff_t last_offset = char_offset;
3493   int last_id = charset_ascii;
3494   bool eol_dos
3495     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3496   int byte_after_cr = -1;
3497   int i;
3498
3499   setup_iso_safe_charsets (attrs);
3500   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3501
3502   if (cmp_status->state != COMPOSING_NO)
3503     {
3504       if (charbuf_end - charbuf < cmp_status->length)
3505         emacs_abort ();
3506       for (i = 0; i < cmp_status->length; i++)
3507         *charbuf++ = cmp_status->carryover[i];
3508       coding->annotated = 1;
3509     }
3510
3511   while (1)
3512     {
3513       int c1, c2, c3;
3514
3515       src_base = src;
3516       consumed_chars_base = consumed_chars;
3517
3518       if (charbuf >= charbuf_end)
3519         {
3520           if (byte_after_cr >= 0)
3521             src_base--;
3522           break;
3523         }
3524
3525       if (byte_after_cr >= 0)
3526         c1 = byte_after_cr, byte_after_cr = -1;
3527       else
3528         ONE_MORE_BYTE (c1);
3529       if (c1 < 0)
3530         goto invalid_code;
3531
3532       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3533         {
3534           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3535           char_offset++;
3536           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3537           continue;
3538         }
3539
3540       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3541         {
3542           if (c1 == ISO_CODE_ESC)
3543             {
3544               if (src + 1 >= src_end)
3545                 goto no_more_source;
3546               *charbuf++ = ISO_CODE_ESC;
3547               char_offset++;
3548               if (src[0] == '%' && src[1] == '@')
3549                 {
3550                   src += 2;
3551                   consumed_chars += 2;
3552                   char_offset += 2;
3553                   /* We are sure charbuf can contain two more chars. */
3554                   *charbuf++ = '%';
3555                   *charbuf++ = '@';
3556                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3557                 }
3558             }
3559           else
3560             {
3561               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3562               char_offset++;
3563             }
3564           continue;
3565         }
3566
3567       if ((cmp_status->state == COMPOSING_RULE
3568            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3569           && c1 != ISO_CODE_ESC)
3570         {
3571           int rule;
3572
3573           DECODE_COMPOSITION_RULE (rule);
3574           STORE_COMPOSITION_RULE (rule);
3575           continue;
3576         }
3577
3578       /* We produce at most one character.  */
3579       switch (iso_code_class [c1])
3580         {
3581         case ISO_0x20_or_0x7F:
3582           if (charset_id_0 < 0
3583               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3584             /* This is SPACE or DEL.  */
3585             charset = CHARSET_FROM_ID (charset_ascii);
3586           else
3587             charset = CHARSET_FROM_ID (charset_id_0);
3588           break;
3589
3590         case ISO_graphic_plane_0:
3591           if (charset_id_0 < 0)
3592             charset = CHARSET_FROM_ID (charset_ascii);
3593           else
3594             charset = CHARSET_FROM_ID (charset_id_0);
3595           break;
3596
3597         case ISO_0xA0_or_0xFF:
3598           if (charset_id_1 < 0
3599               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3600               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3601             goto invalid_code;
3602           /* This is a graphic character, we fall down ... */
3603
3604         case ISO_graphic_plane_1:
3605           if (charset_id_1 < 0)
3606             goto invalid_code;
3607           charset = CHARSET_FROM_ID (charset_id_1);
3608           break;
3609
3610         case ISO_control_0:
3611           if (eol_dos && c1 == '\r')
3612             ONE_MORE_BYTE (byte_after_cr);
3613           MAYBE_FINISH_COMPOSITION ();
3614           charset = CHARSET_FROM_ID (charset_ascii);
3615           break;
3616
3617         case ISO_control_1:
3618           goto invalid_code;
3619
3620         case ISO_shift_out:
3621           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3622               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3623             goto invalid_code;
3624           CODING_ISO_INVOCATION (coding, 0) = 1;
3625           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3626           continue;
3627
3628         case ISO_shift_in:
3629           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3630             goto invalid_code;
3631           CODING_ISO_INVOCATION (coding, 0) = 0;
3632           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633           continue;
3634
3635         case ISO_single_shift_2_7:
3636           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3637             goto invalid_code;
3638         case ISO_single_shift_2:
3639           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3640             goto invalid_code;
3641           /* SS2 is handled as an escape sequence of ESC 'N' */
3642           c1 = 'N';
3643           goto label_escape_sequence;
3644
3645         case ISO_single_shift_3:
3646           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3647             goto invalid_code;
3648           /* SS2 is handled as an escape sequence of ESC 'O' */
3649           c1 = 'O';
3650           goto label_escape_sequence;
3651
3652         case ISO_control_sequence_introducer:
3653           /* CSI is handled as an escape sequence of ESC '[' ...  */
3654           c1 = '[';
3655           goto label_escape_sequence;
3656
3657         case ISO_escape:
3658           ONE_MORE_BYTE (c1);
3659         label_escape_sequence:
3660           /* Escape sequences handled here are invocation,
3661              designation, direction specification, and character
3662              composition specification.  */
3663           switch (c1)
3664             {
3665             case '&':           /* revision of following character set */
3666               ONE_MORE_BYTE (c1);
3667               if (!(c1 >= '@' && c1 <= '~'))
3668                 goto invalid_code;
3669               ONE_MORE_BYTE (c1);
3670               if (c1 != ISO_CODE_ESC)
3671                 goto invalid_code;
3672               ONE_MORE_BYTE (c1);
3673               goto label_escape_sequence;
3674
3675             case '$':           /* designation of 2-byte character set */
3676               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3677                 goto invalid_code;
3678               {
3679                 int reg, chars96;
3680
3681                 ONE_MORE_BYTE (c1);
3682                 if (c1 >= '@' && c1 <= 'B')
3683                   {     /* designation of JISX0208.1978, GB2312.1980,
3684                            or JISX0208.1980 */
3685                     reg = 0, chars96 = 0;
3686                   }
3687                 else if (c1 >= 0x28 && c1 <= 0x2B)
3688                   { /* designation of DIMENSION2_CHARS94 character set */
3689                     reg = c1 - 0x28, chars96 = 0;
3690                     ONE_MORE_BYTE (c1);
3691                   }
3692                 else if (c1 >= 0x2C && c1 <= 0x2F)
3693                   { /* designation of DIMENSION2_CHARS96 character set */
3694                     reg = c1 - 0x2C, chars96 = 1;
3695                     ONE_MORE_BYTE (c1);
3696                   }
3697                 else
3698                   goto invalid_code;
3699                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3700                 /* We must update these variables now.  */
3701                 if (reg == 0)
3702                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3703                 else if (reg == 1)
3704                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3705                 if (chars96 < 0)
3706                   goto invalid_code;
3707               }
3708               continue;
3709
3710             case 'n':           /* invocation of locking-shift-2 */
3711               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3712                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3713                 goto invalid_code;
3714               CODING_ISO_INVOCATION (coding, 0) = 2;
3715               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3716               continue;
3717
3718             case 'o':           /* invocation of locking-shift-3 */
3719               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3720                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3721                 goto invalid_code;
3722               CODING_ISO_INVOCATION (coding, 0) = 3;
3723               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3724               continue;
3725
3726             case 'N':           /* invocation of single-shift-2 */
3727               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3728                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3729                 goto invalid_code;
3730               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3731               if (charset_id_2 < 0)
3732                 charset = CHARSET_FROM_ID (charset_ascii);
3733               else
3734                 charset = CHARSET_FROM_ID (charset_id_2);
3735               ONE_MORE_BYTE (c1);
3736               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3737                 goto invalid_code;
3738               break;
3739
3740             case 'O':           /* invocation of single-shift-3 */
3741               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3742                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3743                 goto invalid_code;
3744               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3745               if (charset_id_3 < 0)
3746                 charset = CHARSET_FROM_ID (charset_ascii);
3747               else
3748                 charset = CHARSET_FROM_ID (charset_id_3);
3749               ONE_MORE_BYTE (c1);
3750               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3751                 goto invalid_code;
3752               break;
3753
3754             case '0': case '2': case '3': case '4': /* start composition */
3755               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3756                 goto invalid_code;
3757               if (last_id != charset_ascii)
3758                 {
3759                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3760                   last_id = charset_ascii;
3761                   last_offset = char_offset;
3762                 }
3763               DECODE_COMPOSITION_START (c1);
3764               continue;
3765
3766             case '1':           /* end composition */
3767               if (cmp_status->state == COMPOSING_NO)
3768                 goto invalid_code;
3769               DECODE_COMPOSITION_END ();
3770               continue;
3771
3772             case '[':           /* specification of direction */
3773               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3774                 goto invalid_code;
3775               /* For the moment, nested direction is not supported.
3776                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3777                  left-to-right, and nonzero means right-to-left.  */
3778               ONE_MORE_BYTE (c1);
3779               switch (c1)
3780                 {
3781                 case ']':       /* end of the current direction */
3782                   coding->mode &= ~CODING_MODE_DIRECTION;
3783
3784                 case '0':       /* end of the current direction */
3785                 case '1':       /* start of left-to-right direction */
3786                   ONE_MORE_BYTE (c1);
3787                   if (c1 == ']')
3788                     coding->mode &= ~CODING_MODE_DIRECTION;
3789                   else
3790                     goto invalid_code;
3791                   break;
3792
3793                 case '2':       /* start of right-to-left direction */
3794                   ONE_MORE_BYTE (c1);
3795                   if (c1 == ']')
3796                     coding->mode |= CODING_MODE_DIRECTION;
3797                   else
3798                     goto invalid_code;
3799                   break;
3800
3801                 default:
3802                   goto invalid_code;
3803                 }
3804               continue;
3805
3806             case '%':
3807               ONE_MORE_BYTE (c1);
3808               if (c1 == '/')
3809                 {
3810                   /* CTEXT extended segment:
3811                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3812                      We keep these bytes as is for the moment.
3813                      They may be decoded by post-read-conversion.  */
3814                   int dim, M, L;
3815                   int size;
3816
3817                   ONE_MORE_BYTE (dim);
3818                   if (dim < '0' || dim > '4')
3819                     goto invalid_code;
3820                   ONE_MORE_BYTE (M);
3821                   if (M < 128)
3822                     goto invalid_code;
3823                   ONE_MORE_BYTE (L);
3824                   if (L < 128)
3825                     goto invalid_code;
3826                   size = ((M - 128) * 128) + (L - 128);
3827                   if (charbuf + 6 > charbuf_end)
3828                     goto break_loop;
3829                   *charbuf++ = ISO_CODE_ESC;
3830                   *charbuf++ = '%';
3831                   *charbuf++ = '/';
3832                   *charbuf++ = dim;
3833                   *charbuf++ = BYTE8_TO_CHAR (M);
3834                   *charbuf++ = BYTE8_TO_CHAR (L);
3835                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3836                 }
3837               else if (c1 == 'G')
3838                 {
3839                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3840                      ESC % G --UTF-8-BYTES-- ESC % @
3841                      We keep these bytes as is for the moment.
3842                      They may be decoded by post-read-conversion.  */
3843                   if (charbuf + 3 > charbuf_end)
3844                     goto break_loop;
3845                   *charbuf++ = ISO_CODE_ESC;
3846                   *charbuf++ = '%';
3847                   *charbuf++ = 'G';
3848                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3849                 }
3850               else
3851                 goto invalid_code;
3852               continue;
3853               break;
3854
3855             default:
3856               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3857                 goto invalid_code;
3858               {
3859                 int reg, chars96;
3860
3861                 if (c1 >= 0x28 && c1 <= 0x2B)
3862                   { /* designation of DIMENSION1_CHARS94 character set */
3863                     reg = c1 - 0x28, chars96 = 0;
3864                     ONE_MORE_BYTE (c1);
3865                   }
3866                 else if (c1 >= 0x2C && c1 <= 0x2F)
3867                   { /* designation of DIMENSION1_CHARS96 character set */
3868                     reg = c1 - 0x2C, chars96 = 1;
3869                     ONE_MORE_BYTE (c1);
3870                   }
3871                 else
3872                   goto invalid_code;
3873                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3874                 /* We must update these variables now.  */
3875                 if (reg == 0)
3876                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3877                 else if (reg == 1)
3878                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3879                 if (chars96 < 0)
3880                   goto invalid_code;
3881               }
3882               continue;
3883             }
3884           break;
3885
3886         default:
3887           emacs_abort ();
3888         }
3889
3890       if (cmp_status->state == COMPOSING_NO
3891           && charset->id != charset_ascii
3892           && last_id != charset->id)
3893         {
3894           if (last_id != charset_ascii)
3895             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3896           last_id = charset->id;
3897           last_offset = char_offset;
3898         }
3899
3900       /* Now we know CHARSET and 1st position code C1 of a character.
3901          Produce a decoded character while getting 2nd and 3rd
3902          position codes C2, C3 if necessary.  */
3903       if (CHARSET_DIMENSION (charset) > 1)
3904         {
3905           ONE_MORE_BYTE (c2);
3906           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3907               || ((c1 & 0x80) != (c2 & 0x80)))
3908             /* C2 is not in a valid range.  */
3909             goto invalid_code;
3910           if (CHARSET_DIMENSION (charset) == 2)
3911             c1 = (c1 << 8) | c2;
3912           else
3913             {
3914               ONE_MORE_BYTE (c3);
3915               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3916                   || ((c1 & 0x80) != (c3 & 0x80)))
3917                 /* C3 is not in a valid range.  */
3918                 goto invalid_code;
3919               c1 = (c1 << 16) | (c2 << 8) | c2;
3920             }
3921         }
3922       c1 &= 0x7F7F7F;
3923       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3924       if (c < 0)
3925         {
3926           MAYBE_FINISH_COMPOSITION ();
3927           for (; src_base < src; src_base++, char_offset++)
3928             {
3929               if (ASCII_BYTE_P (*src_base))
3930                 *charbuf++ = *src_base;
3931               else
3932                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3933             }
3934         }
3935       else if (cmp_status->state == COMPOSING_NO)
3936         {
3937           *charbuf++ = c;
3938           char_offset++;
3939         }
3940       else if ((cmp_status->state == COMPOSING_CHAR
3941                 ? cmp_status->nchars
3942                 : cmp_status->ncomps)
3943                >= MAX_COMPOSITION_COMPONENTS)
3944         {
3945           /* Too long composition.  */
3946           MAYBE_FINISH_COMPOSITION ();
3947           *charbuf++ = c;
3948           char_offset++;
3949         }
3950       else
3951         STORE_COMPOSITION_CHAR (c);
3952       continue;
3953
3954     invalid_code:
3955       MAYBE_FINISH_COMPOSITION ();
3956       src = src_base;
3957       consumed_chars = consumed_chars_base;
3958       ONE_MORE_BYTE (c);
3959       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3960       char_offset++;
3961       coding->errors++;
3962       /* Reset the invocation and designation status to the safest
3963          one; i.e. designate ASCII to the graphic register 0, and
3964          invoke that register to the graphic plane 0.  This typically
3965          helps the case that an designation sequence for ASCII "ESC (
3966          B" is somehow broken (e.g. broken by a newline).  */
3967       CODING_ISO_INVOCATION (coding, 0) = 0;
3968       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3969       charset_id_0 = charset_ascii;
3970       continue;
3971
3972     break_loop:
3973       break;
3974     }
3975
3976  no_more_source:
3977   if (cmp_status->state != COMPOSING_NO)
3978     {
3979       if (coding->mode & CODING_MODE_LAST_BLOCK)
3980         MAYBE_FINISH_COMPOSITION ();
3981       else
3982         {
3983           charbuf -= cmp_status->length;
3984           for (i = 0; i < cmp_status->length; i++)
3985             cmp_status->carryover[i] = charbuf[i];
3986         }
3987     }
3988   else if (last_id != charset_ascii)
3989     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3990   coding->consumed_char += consumed_chars_base;
3991   coding->consumed = src_base - coding->source;
3992   coding->charbuf_used = charbuf - coding->charbuf;
3993 }
3994
3995
3996 /* ISO2022 encoding stuff.  */
3997
3998 /*
3999    It is not enough to say just "ISO2022" on encoding, we have to
4000    specify more details.  In Emacs, each coding system of ISO2022
4001    variant has the following specifications:
4002         1. Initial designation to G0 thru G3.
4003         2. Allows short-form designation?
4004         3. ASCII should be designated to G0 before control characters?
4005         4. ASCII should be designated to G0 at end of line?
4006         5. 7-bit environment or 8-bit environment?
4007         6. Use locking-shift?
4008         7. Use Single-shift?
4009    And the following two are only for Japanese:
4010         8. Use ASCII in place of JIS0201-1976-Roman?
4011         9. Use JISX0208-1983 in place of JISX0208-1978?
4012    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4013    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4014    details.
4015 */
4016
4017 /* Produce codes (escape sequence) for designating CHARSET to graphic
4018    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4019    '@', 'A', or 'B' and the coding system CODING allows, produce
4020    designation sequence of short-form.  */
4021
4022 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4023   do {                                                                  \
4024     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4025     const char *intermediate_char_94 = "()*+";                          \
4026     const char *intermediate_char_96 = ",-./";                          \
4027     int revision = -1;                                                  \
4028                                                                         \
4029     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4030       revision = CHARSET_ISO_REVISION (charset);                        \
4031                                                                         \
4032     if (revision >= 0)                                                  \
4033       {                                                                 \
4034         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4035         EMIT_ONE_BYTE ('@' + revision);                                 \
4036       }                                                                 \
4037     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4038     if (CHARSET_DIMENSION (charset) == 1)                               \
4039       {                                                                 \
4040         int b;                                                          \
4041         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4042           b = intermediate_char_94[reg];                                \
4043         else                                                            \
4044           b = intermediate_char_96[reg];                                \
4045         EMIT_ONE_ASCII_BYTE (b);                                        \
4046       }                                                                 \
4047     else                                                                \
4048       {                                                                 \
4049         EMIT_ONE_ASCII_BYTE ('$');                                      \
4050         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4051           {                                                             \
4052             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4053                 || reg != 0                                             \
4054                 || final_char < '@' || final_char > 'B')                \
4055               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4056           }                                                             \
4057         else                                                            \
4058           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4059       }                                                                 \
4060     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4061                                                                         \
4062     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4063   } while (0)
4064
4065
4066 /* The following two macros produce codes (control character or escape
4067    sequence) for ISO2022 single-shift functions (single-shift-2 and
4068    single-shift-3).  */
4069
4070 #define ENCODE_SINGLE_SHIFT_2                                           \
4071   do {                                                                  \
4072     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4073       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4074     else                                                                \
4075       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4076     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4077   } while (0)
4078
4079
4080 #define ENCODE_SINGLE_SHIFT_3                                           \
4081   do {                                                                  \
4082     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4083       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4084     else                                                                \
4085       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4086     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4087   } while (0)
4088
4089
4090 /* The following four macros produce codes (control character or
4091    escape sequence) for ISO2022 locking-shift functions (shift-in,
4092    shift-out, locking-shift-2, and locking-shift-3).  */
4093
4094 #define ENCODE_SHIFT_IN                                 \
4095   do {                                                  \
4096     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4097     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4098   } while (0)
4099
4100
4101 #define ENCODE_SHIFT_OUT                                \
4102   do {                                                  \
4103     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4104     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4105   } while (0)
4106
4107
4108 #define ENCODE_LOCKING_SHIFT_2                          \
4109   do {                                                  \
4110     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4111     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4112   } while (0)
4113
4114
4115 #define ENCODE_LOCKING_SHIFT_3                          \
4116   do {                                                  \
4117     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4118     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4119   } while (0)
4120
4121
4122 /* Produce codes for a DIMENSION1 character whose character set is
4123    CHARSET and whose position-code is C1.  Designation and invocation
4124    sequences are also produced in advance if necessary.  */
4125
4126 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4127   do {                                                                  \
4128     int id = CHARSET_ID (charset);                                      \
4129                                                                         \
4130     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4131         && id == charset_ascii)                                         \
4132       {                                                                 \
4133         id = charset_jisx0201_roman;                                    \
4134         charset = CHARSET_FROM_ID (id);                                 \
4135       }                                                                 \
4136                                                                         \
4137     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4138       {                                                                 \
4139         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4140           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4141         else                                                            \
4142           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4143         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4144         break;                                                          \
4145       }                                                                 \
4146     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4147       {                                                                 \
4148         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4149         break;                                                          \
4150       }                                                                 \
4151     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4152       {                                                                 \
4153         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4154         break;                                                          \
4155       }                                                                 \
4156     else                                                                \
4157       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4158          must invoke it, or, at first, designate it to some graphic     \
4159          register.  Then repeat the loop to actually produce the        \
4160          character.  */                                                 \
4161       dst = encode_invocation_designation (charset, coding, dst,        \
4162                                            &produced_chars);            \
4163   } while (1)
4164
4165
4166 /* Produce codes for a DIMENSION2 character whose character set is
4167    CHARSET and whose position-codes are C1 and C2.  Designation and
4168    invocation codes are also produced in advance if necessary.  */
4169
4170 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4171   do {                                                                  \
4172     int id = CHARSET_ID (charset);                                      \
4173                                                                         \
4174     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4175         && id == charset_jisx0208)                                      \
4176       {                                                                 \
4177         id = charset_jisx0208_1978;                                     \
4178         charset = CHARSET_FROM_ID (id);                                 \
4179       }                                                                 \
4180                                                                         \
4181     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4182       {                                                                 \
4183         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4184           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4185         else                                                            \
4186           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4187         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4188         break;                                                          \
4189       }                                                                 \
4190     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4191       {                                                                 \
4192         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4193         break;                                                          \
4194       }                                                                 \
4195     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4196       {                                                                 \
4197         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4198         break;                                                          \
4199       }                                                                 \
4200     else                                                                \
4201       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4202          must invoke it, or, at first, designate it to some graphic     \
4203          register.  Then repeat the loop to actually produce the        \
4204          character.  */                                                 \
4205       dst = encode_invocation_designation (charset, coding, dst,        \
4206                                            &produced_chars);            \
4207   } while (1)
4208
4209
4210 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4211   do {                                                                     \
4212     unsigned code;                                                         \
4213     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4214                                                                            \
4215     if (CHARSET_DIMENSION (charset) == 1)                                  \
4216       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4217     else                                                                   \
4218       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4219   } while (0)
4220
4221
4222 /* Produce designation and invocation codes at a place pointed by DST
4223    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4224    Return new DST.  */
4225
4226 static unsigned char *
4227 encode_invocation_designation (struct charset *charset,
4228                                struct coding_system *coding,
4229                                unsigned char *dst, ptrdiff_t *p_nchars)
4230 {
4231   bool multibytep = coding->dst_multibyte;
4232   ptrdiff_t produced_chars = *p_nchars;
4233   int reg;                      /* graphic register number */
4234   int id = CHARSET_ID (charset);
4235
4236   /* At first, check designations.  */
4237   for (reg = 0; reg < 4; reg++)
4238     if (id == CODING_ISO_DESIGNATION (coding, reg))
4239       break;
4240
4241   if (reg >= 4)
4242     {
4243       /* CHARSET is not yet designated to any graphic registers.  */
4244       /* At first check the requested designation.  */
4245       reg = CODING_ISO_REQUEST (coding, id);
4246       if (reg < 0)
4247         /* Since CHARSET requests no special designation, designate it
4248            to graphic register 0.  */
4249         reg = 0;
4250
4251       ENCODE_DESIGNATION (charset, reg, coding);
4252     }
4253
4254   if (CODING_ISO_INVOCATION (coding, 0) != reg
4255       && CODING_ISO_INVOCATION (coding, 1) != reg)
4256     {
4257       /* Since the graphic register REG is not invoked to any graphic
4258          planes, invoke it to graphic plane 0.  */
4259       switch (reg)
4260         {
4261         case 0:                 /* graphic register 0 */
4262           ENCODE_SHIFT_IN;
4263           break;
4264
4265         case 1:                 /* graphic register 1 */
4266           ENCODE_SHIFT_OUT;
4267           break;
4268
4269         case 2:                 /* graphic register 2 */
4270           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4271             ENCODE_SINGLE_SHIFT_2;
4272           else
4273             ENCODE_LOCKING_SHIFT_2;
4274           break;
4275
4276         case 3:                 /* graphic register 3 */
4277           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4278             ENCODE_SINGLE_SHIFT_3;
4279           else
4280             ENCODE_LOCKING_SHIFT_3;
4281           break;
4282         }
4283     }
4284
4285   *p_nchars = produced_chars;
4286   return dst;
4287 }
4288
4289
4290 /* Produce codes for designation and invocation to reset the graphic
4291    planes and registers to initial state.  */
4292 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4293   do {                                                                  \
4294     int reg;                                                            \
4295     struct charset *charset;                                            \
4296                                                                         \
4297     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4298       ENCODE_SHIFT_IN;                                                  \
4299     for (reg = 0; reg < 4; reg++)                                       \
4300       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4301           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4302               != CODING_ISO_INITIAL (coding, reg)))                     \
4303         {                                                               \
4304           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4305           ENCODE_DESIGNATION (charset, reg, coding);                    \
4306         }                                                               \
4307   } while (0)
4308
4309
4310 /* Produce designation sequences of charsets in the line started from
4311    CHARBUF to a place pointed by DST, and return the number of
4312    produced bytes.  DST should not directly point a buffer text area
4313    which may be relocated by char_charset call.
4314
4315    If the current block ends before any end-of-line, we may fail to
4316    find all the necessary designations.  */
4317
4318 static ptrdiff_t
4319 encode_designation_at_bol (struct coding_system *coding,
4320                            int *charbuf, int *charbuf_end,
4321                            unsigned char *dst)
4322 {
4323   unsigned char *orig = dst;
4324   struct charset *charset;
4325   /* Table of charsets to be designated to each graphic register.  */
4326   int r[4];
4327   int c, found = 0, reg;
4328   ptrdiff_t produced_chars = 0;
4329   bool multibytep = coding->dst_multibyte;
4330   Lisp_Object attrs;
4331   Lisp_Object charset_list;
4332
4333   attrs = CODING_ID_ATTRS (coding->id);
4334   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4335   if (EQ (charset_list, Qiso_2022))
4336     charset_list = Viso_2022_charset_list;
4337
4338   for (reg = 0; reg < 4; reg++)
4339     r[reg] = -1;
4340
4341   while (charbuf < charbuf_end && found < 4)
4342     {
4343       int id;
4344
4345       c = *charbuf++;
4346       if (c == '\n')
4347         break;
4348       charset = char_charset (c, charset_list, NULL);
4349       id = CHARSET_ID (charset);
4350       reg = CODING_ISO_REQUEST (coding, id);
4351       if (reg >= 0 && r[reg] < 0)
4352         {
4353           found++;
4354           r[reg] = id;
4355         }
4356     }
4357
4358   if (found)
4359     {
4360       for (reg = 0; reg < 4; reg++)
4361         if (r[reg] >= 0
4362             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4363           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4364     }
4365
4366   return dst - orig;
4367 }
4368
4369 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4370
4371 static bool
4372 encode_coding_iso_2022 (struct coding_system *coding)
4373 {
4374   bool multibytep = coding->dst_multibyte;
4375   int *charbuf = coding->charbuf;
4376   int *charbuf_end = charbuf + coding->charbuf_used;
4377   unsigned char *dst = coding->destination + coding->produced;
4378   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4379   int safe_room = 16;
4380   bool bol_designation
4381     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4382        && CODING_ISO_BOL (coding));
4383   ptrdiff_t produced_chars = 0;
4384   Lisp_Object attrs, eol_type, charset_list;
4385   bool ascii_compatible;
4386   int c;
4387   int preferred_charset_id = -1;
4388
4389   CODING_GET_INFO (coding, attrs, charset_list);
4390   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4391   if (VECTORP (eol_type))
4392     eol_type = Qunix;
4393
4394   setup_iso_safe_charsets (attrs);
4395   /* Charset list may have been changed.  */
4396   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4397   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4398
4399   ascii_compatible
4400     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4401        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4402                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4403
4404   while (charbuf < charbuf_end)
4405     {
4406       ASSURE_DESTINATION (safe_room);
4407
4408       if (bol_designation)
4409         {
4410           /* We have to produce designation sequences if any now.  */
4411           unsigned char desig_buf[16];
4412           int nbytes;
4413           ptrdiff_t offset;
4414
4415           charset_map_loaded = 0;
4416           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4417                                               desig_buf);
4418           if (charset_map_loaded
4419               && (offset = coding_change_destination (coding)))
4420             {
4421               dst += offset;
4422               dst_end += offset;
4423             }
4424           memcpy (dst, desig_buf, nbytes);
4425           dst += nbytes;
4426           /* We are sure that designation sequences are all ASCII bytes.  */
4427           produced_chars += nbytes;
4428           bol_designation = 0;
4429           ASSURE_DESTINATION (safe_room);
4430         }
4431
4432       c = *charbuf++;
4433
4434       if (c < 0)
4435         {
4436           /* Handle an annotation.  */
4437           switch (*charbuf)
4438             {
4439             case CODING_ANNOTATE_COMPOSITION_MASK:
4440               /* Not yet implemented.  */
4441               break;
4442             case CODING_ANNOTATE_CHARSET_MASK:
4443               preferred_charset_id = charbuf[2];
4444               if (preferred_charset_id >= 0
4445                   && NILP (Fmemq (make_number (preferred_charset_id),
4446                                   charset_list)))
4447                 preferred_charset_id = -1;
4448               break;
4449             default:
4450               emacs_abort ();
4451             }
4452           charbuf += -c - 1;
4453           continue;
4454         }
4455
4456       /* Now encode the character C.  */
4457       if (c < 0x20 || c == 0x7F)
4458         {
4459           if (c == '\n'
4460               || (c == '\r' && EQ (eol_type, Qmac)))
4461             {
4462               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4463                 ENCODE_RESET_PLANE_AND_REGISTER ();
4464               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4465                 {
4466                   int i;
4467
4468                   for (i = 0; i < 4; i++)
4469                     CODING_ISO_DESIGNATION (coding, i)
4470                       = CODING_ISO_INITIAL (coding, i);
4471                 }
4472               bol_designation = ((CODING_ISO_FLAGS (coding)
4473                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4474                                  != 0);
4475             }
4476           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4477             ENCODE_RESET_PLANE_AND_REGISTER ();
4478           EMIT_ONE_ASCII_BYTE (c);
4479         }
4480       else if (ASCII_CHAR_P (c))
4481         {
4482           if (ascii_compatible)
4483             EMIT_ONE_ASCII_BYTE (c);
4484           else
4485             {
4486               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4487               ENCODE_ISO_CHARACTER (charset, c);
4488             }
4489         }
4490       else if (CHAR_BYTE8_P (c))
4491         {
4492           c = CHAR_TO_BYTE8 (c);
4493           EMIT_ONE_BYTE (c);
4494         }
4495       else
4496         {
4497           struct charset *charset;
4498
4499           if (preferred_charset_id >= 0)
4500             {
4501               bool result;
4502
4503               charset = CHARSET_FROM_ID (preferred_charset_id);
4504               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4505               if (! result)
4506                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4507                                      NULL, charset);
4508             }
4509           else
4510             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4511                                  NULL, charset);
4512           if (!charset)
4513             {
4514               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4515                 {
4516                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4517                   charset = CHARSET_FROM_ID (charset_ascii);
4518                 }
4519               else
4520                 {
4521                   c = coding->default_char;
4522                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4523                                        charset_list, NULL, charset);
4524                 }
4525             }
4526           ENCODE_ISO_CHARACTER (charset, c);
4527         }
4528     }
4529
4530   if (coding->mode & CODING_MODE_LAST_BLOCK
4531       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4532     {
4533       ASSURE_DESTINATION (safe_room);
4534       ENCODE_RESET_PLANE_AND_REGISTER ();
4535     }
4536   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4537   CODING_ISO_BOL (coding) = bol_designation;
4538   coding->produced_char += produced_chars;
4539   coding->produced = dst - coding->destination;
4540   return 0;
4541 }
4542
4543 \f
4544 /*** 8,9. SJIS and BIG5 handlers ***/
4545
4546 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4547    quite widely.  So, for the moment, Emacs supports them in the bare
4548    C code.  But, in the future, they may be supported only by CCL.  */
4549
4550 /* SJIS is a coding system encoding three character sets: ASCII, right
4551    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4552    as is.  A character of charset katakana-jisx0201 is encoded by
4553    "position-code + 0x80".  A character of charset japanese-jisx0208
4554    is encoded in 2-byte but two position-codes are divided and shifted
4555    so that it fit in the range below.
4556
4557    --- CODE RANGE of SJIS ---
4558    (character set)      (range)
4559    ASCII                0x00 .. 0x7F
4560    KATAKANA-JISX0201    0xA0 .. 0xDF
4561    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4562             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4563    -------------------------------
4564
4565 */
4566
4567 /* BIG5 is a coding system encoding two character sets: ASCII and
4568    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4569    character set and is encoded in two-byte.
4570
4571    --- CODE RANGE of BIG5 ---
4572    (character set)      (range)
4573    ASCII                0x00 .. 0x7F
4574    Big5 (1st byte)      0xA1 .. 0xFE
4575         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4576    --------------------------
4577
4578   */
4579
4580 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4581    Return true if a text is encoded in SJIS.  */
4582
4583 static bool
4584 detect_coding_sjis (struct coding_system *coding,
4585                     struct coding_detection_info *detect_info)
4586 {
4587   const unsigned char *src = coding->source, *src_base;
4588   const unsigned char *src_end = coding->source + coding->src_bytes;
4589   bool multibytep = coding->src_multibyte;
4590   ptrdiff_t consumed_chars = 0;
4591   int found = 0;
4592   int c;
4593   Lisp_Object attrs, charset_list;
4594   int max_first_byte_of_2_byte_code;
4595
4596   CODING_GET_INFO (coding, attrs, charset_list);
4597   max_first_byte_of_2_byte_code
4598     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4599
4600   detect_info->checked |= CATEGORY_MASK_SJIS;
4601   /* A coding system of this category is always ASCII compatible.  */
4602   src += coding->head_ascii;
4603
4604   while (1)
4605     {
4606       src_base = src;
4607       ONE_MORE_BYTE (c);
4608       if (c < 0x80)
4609         continue;
4610       if ((c >= 0x81 && c <= 0x9F)
4611           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4612         {
4613           ONE_MORE_BYTE (c);
4614           if (c < 0x40 || c == 0x7F || c > 0xFC)
4615             break;
4616           found = CATEGORY_MASK_SJIS;
4617         }
4618       else if (c >= 0xA0 && c < 0xE0)
4619         found = CATEGORY_MASK_SJIS;
4620       else
4621         break;
4622     }
4623   detect_info->rejected |= CATEGORY_MASK_SJIS;
4624   return 0;
4625
4626  no_more_source:
4627   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4628     {
4629       detect_info->rejected |= CATEGORY_MASK_SJIS;
4630       return 0;
4631     }
4632   detect_info->found |= found;
4633   return 1;
4634 }
4635
4636 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4637    Return true if a text is encoded in BIG5.  */
4638
4639 static bool
4640 detect_coding_big5 (struct coding_system *coding,
4641                     struct coding_detection_info *detect_info)
4642 {
4643   const unsigned char *src = coding->source, *src_base;
4644   const unsigned char *src_end = coding->source + coding->src_bytes;
4645   bool multibytep = coding->src_multibyte;
4646   ptrdiff_t consumed_chars = 0;
4647   int found = 0;
4648   int c;
4649
4650   detect_info->checked |= CATEGORY_MASK_BIG5;
4651   /* A coding system of this category is always ASCII compatible.  */
4652   src += coding->head_ascii;
4653
4654   while (1)
4655     {
4656       src_base = src;
4657       ONE_MORE_BYTE (c);
4658       if (c < 0x80)
4659         continue;
4660       if (c >= 0xA1)
4661         {
4662           ONE_MORE_BYTE (c);
4663           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4664             return 0;
4665           found = CATEGORY_MASK_BIG5;
4666         }
4667       else
4668         break;
4669     }
4670   detect_info->rejected |= CATEGORY_MASK_BIG5;
4671   return 0;
4672
4673  no_more_source:
4674   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4675     {
4676       detect_info->rejected |= CATEGORY_MASK_BIG5;
4677       return 0;
4678     }
4679   detect_info->found |= found;
4680   return 1;
4681 }
4682
4683 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4684
4685 static void
4686 decode_coding_sjis (struct coding_system *coding)
4687 {
4688   const unsigned char *src = coding->source + coding->consumed;
4689   const unsigned char *src_end = coding->source + coding->src_bytes;
4690   const unsigned char *src_base;
4691   int *charbuf = coding->charbuf + coding->charbuf_used;
4692   /* We may produce one charset annotation in one loop and one more at
4693      the end.  */
4694   int *charbuf_end
4695     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4696   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4697   bool multibytep = coding->src_multibyte;
4698   struct charset *charset_roman, *charset_kanji, *charset_kana;
4699   struct charset *charset_kanji2;
4700   Lisp_Object attrs, charset_list, val;
4701   ptrdiff_t char_offset = coding->produced_char;
4702   ptrdiff_t last_offset = char_offset;
4703   int last_id = charset_ascii;
4704   bool eol_dos
4705     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4706   int byte_after_cr = -1;
4707
4708   CODING_GET_INFO (coding, attrs, charset_list);
4709
4710   val = charset_list;
4711   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4712   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4713   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4714   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4715
4716   while (1)
4717     {
4718       int c, c1;
4719       struct charset *charset;
4720
4721       src_base = src;
4722       consumed_chars_base = consumed_chars;
4723
4724       if (charbuf >= charbuf_end)
4725         {
4726           if (byte_after_cr >= 0)
4727             src_base--;
4728           break;
4729         }
4730
4731       if (byte_after_cr >= 0)
4732         c = byte_after_cr, byte_after_cr = -1;
4733       else
4734         ONE_MORE_BYTE (c);
4735       if (c < 0)
4736         goto invalid_code;
4737       if (c < 0x80)
4738         {
4739           if (eol_dos && c == '\r')
4740             ONE_MORE_BYTE (byte_after_cr);
4741           charset = charset_roman;
4742         }
4743       else if (c == 0x80 || c == 0xA0)
4744         goto invalid_code;
4745       else if (c >= 0xA1 && c <= 0xDF)
4746         {
4747           /* SJIS -> JISX0201-Kana */
4748           c &= 0x7F;
4749           charset = charset_kana;
4750         }
4751       else if (c <= 0xEF)
4752         {
4753           /* SJIS -> JISX0208 */
4754           ONE_MORE_BYTE (c1);
4755           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4756             goto invalid_code;
4757           c = (c << 8) | c1;
4758           SJIS_TO_JIS (c);
4759           charset = charset_kanji;
4760         }
4761       else if (c <= 0xFC && charset_kanji2)
4762         {
4763           /* SJIS -> JISX0213-2 */
4764           ONE_MORE_BYTE (c1);
4765           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4766             goto invalid_code;
4767           c = (c << 8) | c1;
4768           SJIS_TO_JIS2 (c);
4769           charset = charset_kanji2;
4770         }
4771       else
4772         goto invalid_code;
4773       if (charset->id != charset_ascii
4774           && last_id != charset->id)
4775         {
4776           if (last_id != charset_ascii)
4777             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4778           last_id = charset->id;
4779           last_offset = char_offset;
4780         }
4781       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4782       *charbuf++ = c;
4783       char_offset++;
4784       continue;
4785
4786     invalid_code:
4787       src = src_base;
4788       consumed_chars = consumed_chars_base;
4789       ONE_MORE_BYTE (c);
4790       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4791       char_offset++;
4792       coding->errors++;
4793     }
4794
4795  no_more_source:
4796   if (last_id != charset_ascii)
4797     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4798   coding->consumed_char += consumed_chars_base;
4799   coding->consumed = src_base - coding->source;
4800   coding->charbuf_used = charbuf - coding->charbuf;
4801 }
4802
4803 static void
4804 decode_coding_big5 (struct coding_system *coding)
4805 {
4806   const unsigned char *src = coding->source + coding->consumed;
4807   const unsigned char *src_end = coding->source + coding->src_bytes;
4808   const unsigned char *src_base;
4809   int *charbuf = coding->charbuf + coding->charbuf_used;
4810   /* We may produce one charset annotation in one loop and one more at
4811      the end.  */
4812   int *charbuf_end
4813     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4814   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4815   bool multibytep = coding->src_multibyte;
4816   struct charset *charset_roman, *charset_big5;
4817   Lisp_Object attrs, charset_list, val;
4818   ptrdiff_t char_offset = coding->produced_char;
4819   ptrdiff_t last_offset = char_offset;
4820   int last_id = charset_ascii;
4821   bool eol_dos
4822     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4823   int byte_after_cr = -1;
4824
4825   CODING_GET_INFO (coding, attrs, charset_list);
4826   val = charset_list;
4827   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4828   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4829
4830   while (1)
4831     {
4832       int c, c1;
4833       struct charset *charset;
4834
4835       src_base = src;
4836       consumed_chars_base = consumed_chars;
4837
4838       if (charbuf >= charbuf_end)
4839         {
4840           if (byte_after_cr >= 0)
4841             src_base--;
4842           break;
4843         }
4844
4845       if (byte_after_cr >= 0)
4846         c = byte_after_cr, byte_after_cr = -1;
4847       else
4848         ONE_MORE_BYTE (c);
4849
4850       if (c < 0)
4851         goto invalid_code;
4852       if (c < 0x80)
4853         {
4854           if (eol_dos && c == '\r')
4855             ONE_MORE_BYTE (byte_after_cr);
4856           charset = charset_roman;
4857         }
4858       else
4859         {
4860           /* BIG5 -> Big5 */
4861           if (c < 0xA1 || c > 0xFE)
4862             goto invalid_code;
4863           ONE_MORE_BYTE (c1);
4864           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4865             goto invalid_code;
4866           c = c << 8 | c1;
4867           charset = charset_big5;
4868         }
4869       if (charset->id != charset_ascii
4870           && last_id != charset->id)
4871         {
4872           if (last_id != charset_ascii)
4873             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4874           last_id = charset->id;
4875           last_offset = char_offset;
4876         }
4877       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4878       *charbuf++ = c;
4879       char_offset++;
4880       continue;
4881
4882     invalid_code:
4883       src = src_base;
4884       consumed_chars = consumed_chars_base;
4885       ONE_MORE_BYTE (c);
4886       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4887       char_offset++;
4888       coding->errors++;
4889     }
4890
4891  no_more_source:
4892   if (last_id != charset_ascii)
4893     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4894   coding->consumed_char += consumed_chars_base;
4895   coding->consumed = src_base - coding->source;
4896   coding->charbuf_used = charbuf - coding->charbuf;
4897 }
4898
4899 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4900    This function can encode charsets `ascii', `katakana-jisx0201',
4901    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4902    are sure that all these charsets are registered as official charset
4903    (i.e. do not have extended leading-codes).  Characters of other
4904    charsets are produced without any encoding.  */
4905
4906 static bool
4907 encode_coding_sjis (struct coding_system *coding)
4908 {
4909   bool multibytep = coding->dst_multibyte;
4910   int *charbuf = coding->charbuf;
4911   int *charbuf_end = charbuf + coding->charbuf_used;
4912   unsigned char *dst = coding->destination + coding->produced;
4913   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4914   int safe_room = 4;
4915   ptrdiff_t produced_chars = 0;
4916   Lisp_Object attrs, charset_list, val;
4917   bool ascii_compatible;
4918   struct charset *charset_kanji, *charset_kana;
4919   struct charset *charset_kanji2;
4920   int c;
4921
4922   CODING_GET_INFO (coding, attrs, charset_list);
4923   val = XCDR (charset_list);
4924   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4925   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4926   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4927
4928   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4929
4930   while (charbuf < charbuf_end)
4931     {
4932       ASSURE_DESTINATION (safe_room);
4933       c = *charbuf++;
4934       /* Now encode the character C.  */
4935       if (ASCII_CHAR_P (c) && ascii_compatible)
4936         EMIT_ONE_ASCII_BYTE (c);
4937       else if (CHAR_BYTE8_P (c))
4938         {
4939           c = CHAR_TO_BYTE8 (c);
4940           EMIT_ONE_BYTE (c);
4941         }
4942       else
4943         {
4944           unsigned code;
4945           struct charset *charset;
4946           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4947                                &code, charset);
4948
4949           if (!charset)
4950             {
4951               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4952                 {
4953                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4954                   charset = CHARSET_FROM_ID (charset_ascii);
4955                 }
4956               else
4957                 {
4958                   c = coding->default_char;
4959                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4960                                        charset_list, &code, charset);
4961                 }
4962             }
4963           if (code == CHARSET_INVALID_CODE (charset))
4964             emacs_abort ();
4965           if (charset == charset_kanji)
4966             {
4967               int c1, c2;
4968               JIS_TO_SJIS (code);
4969               c1 = code >> 8, c2 = code & 0xFF;
4970               EMIT_TWO_BYTES (c1, c2);
4971             }
4972           else if (charset == charset_kana)
4973             EMIT_ONE_BYTE (code | 0x80);
4974           else if (charset_kanji2 && charset == charset_kanji2)
4975             {
4976               int c1, c2;
4977
4978               c1 = code >> 8;
4979               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4980                   || c1 == 0x28
4981                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4982                 {
4983                   JIS_TO_SJIS2 (code);
4984                   c1 = code >> 8, c2 = code & 0xFF;
4985                   EMIT_TWO_BYTES (c1, c2);
4986                 }
4987               else
4988                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4989             }
4990           else
4991             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4992         }
4993     }
4994   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4995   coding->produced_char += produced_chars;
4996   coding->produced = dst - coding->destination;
4997   return 0;
4998 }
4999
5000 static bool
5001 encode_coding_big5 (struct coding_system *coding)
5002 {
5003   bool multibytep = coding->dst_multibyte;
5004   int *charbuf = coding->charbuf;
5005   int *charbuf_end = charbuf + coding->charbuf_used;
5006   unsigned char *dst = coding->destination + coding->produced;
5007   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5008   int safe_room = 4;
5009   ptrdiff_t produced_chars = 0;
5010   Lisp_Object attrs, charset_list, val;
5011   bool ascii_compatible;
5012   struct charset *charset_big5;
5013   int c;
5014
5015   CODING_GET_INFO (coding, attrs, charset_list);
5016   val = XCDR (charset_list);
5017   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5018   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5019
5020   while (charbuf < charbuf_end)
5021     {
5022       ASSURE_DESTINATION (safe_room);
5023       c = *charbuf++;
5024       /* Now encode the character C.  */
5025       if (ASCII_CHAR_P (c) && ascii_compatible)
5026         EMIT_ONE_ASCII_BYTE (c);
5027       else if (CHAR_BYTE8_P (c))
5028         {
5029           c = CHAR_TO_BYTE8 (c);
5030           EMIT_ONE_BYTE (c);
5031         }
5032       else
5033         {
5034           unsigned code;
5035           struct charset *charset;
5036           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5037                                &code, charset);
5038
5039           if (! charset)
5040             {
5041               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5042                 {
5043                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5044                   charset = CHARSET_FROM_ID (charset_ascii);
5045                 }
5046               else
5047                 {
5048                   c = coding->default_char;
5049                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5050                                        charset_list, &code, charset);
5051                 }
5052             }
5053           if (code == CHARSET_INVALID_CODE (charset))
5054             emacs_abort ();
5055           if (charset == charset_big5)
5056             {
5057               int c1, c2;
5058
5059               c1 = code >> 8, c2 = code & 0xFF;
5060               EMIT_TWO_BYTES (c1, c2);
5061             }
5062           else
5063             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5064         }
5065     }
5066   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5067   coding->produced_char += produced_chars;
5068   coding->produced = dst - coding->destination;
5069   return 0;
5070 }
5071
5072 \f
5073 /*** 10. CCL handlers ***/
5074
5075 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5076    Return true if a text is encoded in a coding system of which
5077    encoder/decoder are written in CCL program.  */
5078
5079 static bool
5080 detect_coding_ccl (struct coding_system *coding,
5081                    struct coding_detection_info *detect_info)
5082 {
5083   const unsigned char *src = coding->source, *src_base;
5084   const unsigned char *src_end = coding->source + coding->src_bytes;
5085   bool multibytep = coding->src_multibyte;
5086   ptrdiff_t consumed_chars = 0;
5087   int found = 0;
5088   unsigned char *valids;
5089   ptrdiff_t head_ascii = coding->head_ascii;
5090   Lisp_Object attrs;
5091
5092   detect_info->checked |= CATEGORY_MASK_CCL;
5093
5094   coding = &coding_categories[coding_category_ccl];
5095   valids = CODING_CCL_VALIDS (coding);
5096   attrs = CODING_ID_ATTRS (coding->id);
5097   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5098     src += head_ascii;
5099
5100   while (1)
5101     {
5102       int c;
5103
5104       src_base = src;
5105       ONE_MORE_BYTE (c);
5106       if (c < 0 || ! valids[c])
5107         break;
5108       if ((valids[c] > 1))
5109         found = CATEGORY_MASK_CCL;
5110     }
5111   detect_info->rejected |= CATEGORY_MASK_CCL;
5112   return 0;
5113
5114  no_more_source:
5115   detect_info->found |= found;
5116   return 1;
5117 }
5118
5119 static void
5120 decode_coding_ccl (struct coding_system *coding)
5121 {
5122   const unsigned char *src = coding->source + coding->consumed;
5123   const unsigned char *src_end = coding->source + coding->src_bytes;
5124   int *charbuf = coding->charbuf + coding->charbuf_used;
5125   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5126   ptrdiff_t consumed_chars = 0;
5127   bool multibytep = coding->src_multibyte;
5128   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5129   int source_charbuf[1024];
5130   int source_byteidx[1025];
5131   Lisp_Object attrs, charset_list;
5132
5133   CODING_GET_INFO (coding, attrs, charset_list);
5134
5135   while (1)
5136     {
5137       const unsigned char *p = src;
5138       ptrdiff_t offset;
5139       int i = 0;
5140
5141       if (multibytep)
5142         {
5143           while (i < 1024 && p < src_end)
5144             {
5145               source_byteidx[i] = p - src;
5146               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5147             }
5148           source_byteidx[i] = p - src;
5149         }
5150       else
5151         while (i < 1024 && p < src_end)
5152           source_charbuf[i++] = *p++;
5153
5154       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5155         ccl->last_block = 1;
5156       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5157       charset_map_loaded = 0;
5158       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5159                   charset_list);
5160       if (charset_map_loaded
5161           && (offset = coding_change_source (coding)))
5162         {
5163           p += offset;
5164           src += offset;
5165           src_end += offset;
5166         }
5167       charbuf += ccl->produced;
5168       if (multibytep)
5169         src += source_byteidx[ccl->consumed];
5170       else
5171         src += ccl->consumed;
5172       consumed_chars += ccl->consumed;
5173       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5174         break;
5175     }
5176
5177   switch (ccl->status)
5178     {
5179     case CCL_STAT_SUSPEND_BY_SRC:
5180       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5181       break;
5182     case CCL_STAT_SUSPEND_BY_DST:
5183       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5184       break;
5185     case CCL_STAT_QUIT:
5186     case CCL_STAT_INVALID_CMD:
5187       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5188       break;
5189     default:
5190       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5191       break;
5192     }
5193   coding->consumed_char += consumed_chars;
5194   coding->consumed = src - coding->source;
5195   coding->charbuf_used = charbuf - coding->charbuf;
5196 }
5197
5198 static bool
5199 encode_coding_ccl (struct coding_system *coding)
5200 {
5201   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5202   bool multibytep = coding->dst_multibyte;
5203   int *charbuf = coding->charbuf;
5204   int *charbuf_end = charbuf + coding->charbuf_used;
5205   unsigned char *dst = coding->destination + coding->produced;
5206   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5207   int destination_charbuf[1024];
5208   ptrdiff_t produced_chars = 0;
5209   int i;
5210   Lisp_Object attrs, charset_list;
5211
5212   CODING_GET_INFO (coding, attrs, charset_list);
5213   if (coding->consumed_char == coding->src_chars
5214       && coding->mode & CODING_MODE_LAST_BLOCK)
5215     ccl->last_block = 1;
5216
5217   do
5218     {
5219       ptrdiff_t offset;
5220
5221       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5222       charset_map_loaded = 0;
5223       ccl_driver (ccl, charbuf, destination_charbuf,
5224                   charbuf_end - charbuf, 1024, charset_list);
5225       if (charset_map_loaded
5226           && (offset = coding_change_destination (coding)))
5227         dst += offset;
5228       if (multibytep)
5229         {
5230           ASSURE_DESTINATION (ccl->produced * 2);
5231           for (i = 0; i < ccl->produced; i++)
5232             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5233         }
5234       else
5235         {
5236           ASSURE_DESTINATION (ccl->produced);
5237           for (i = 0; i < ccl->produced; i++)
5238             *dst++ = destination_charbuf[i] & 0xFF;
5239           produced_chars += ccl->produced;
5240         }
5241       charbuf += ccl->consumed;
5242       if (ccl->status == CCL_STAT_QUIT
5243           || ccl->status == CCL_STAT_INVALID_CMD)
5244         break;
5245     }
5246   while (charbuf < charbuf_end);
5247
5248   switch (ccl->status)
5249     {
5250     case CCL_STAT_SUSPEND_BY_SRC:
5251       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5252       break;
5253     case CCL_STAT_SUSPEND_BY_DST:
5254       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5255       break;
5256     case CCL_STAT_QUIT:
5257     case CCL_STAT_INVALID_CMD:
5258       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5259       break;
5260     default:
5261       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5262       break;
5263     }
5264
5265   coding->produced_char += produced_chars;
5266   coding->produced = dst - coding->destination;
5267   return 0;
5268 }
5269
5270 \f
5271 /*** 10, 11. no-conversion handlers ***/
5272
5273 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5274
5275 static void
5276 decode_coding_raw_text (struct coding_system *coding)
5277 {
5278   bool eol_dos
5279     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5280
5281   coding->chars_at_source = 1;
5282   coding->consumed_char = coding->src_chars;
5283   coding->consumed = coding->src_bytes;
5284   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5285     {
5286       coding->consumed_char--;
5287       coding->consumed--;
5288       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5289     }
5290   else
5291     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5292 }
5293
5294 static bool
5295 encode_coding_raw_text (struct coding_system *coding)
5296 {
5297   bool multibytep = coding->dst_multibyte;
5298   int *charbuf = coding->charbuf;
5299   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5300   unsigned char *dst = coding->destination + coding->produced;
5301   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5302   ptrdiff_t produced_chars = 0;
5303   int c;
5304
5305   if (multibytep)
5306     {
5307       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5308
5309       if (coding->src_multibyte)
5310         while (charbuf < charbuf_end)
5311           {
5312             ASSURE_DESTINATION (safe_room);
5313             c = *charbuf++;
5314             if (ASCII_CHAR_P (c))
5315               EMIT_ONE_ASCII_BYTE (c);
5316             else if (CHAR_BYTE8_P (c))
5317               {
5318                 c = CHAR_TO_BYTE8 (c);
5319                 EMIT_ONE_BYTE (c);
5320               }
5321             else
5322               {
5323                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5324
5325                 CHAR_STRING_ADVANCE (c, p1);
5326                 do
5327                   {
5328                     EMIT_ONE_BYTE (*p0);
5329                     p0++;
5330                   }
5331                 while (p0 < p1);
5332               }
5333           }
5334       else
5335         while (charbuf < charbuf_end)
5336           {
5337             ASSURE_DESTINATION (safe_room);
5338             c = *charbuf++;
5339             EMIT_ONE_BYTE (c);
5340           }
5341     }
5342   else
5343     {
5344       if (coding->src_multibyte)
5345         {
5346           int safe_room = MAX_MULTIBYTE_LENGTH;
5347
5348           while (charbuf < charbuf_end)
5349             {
5350               ASSURE_DESTINATION (safe_room);
5351               c = *charbuf++;
5352               if (ASCII_CHAR_P (c))
5353                 *dst++ = c;
5354               else if (CHAR_BYTE8_P (c))
5355                 *dst++ = CHAR_TO_BYTE8 (c);
5356               else
5357                 CHAR_STRING_ADVANCE (c, dst);
5358             }
5359         }
5360       else
5361         {
5362           ASSURE_DESTINATION (charbuf_end - charbuf);
5363           while (charbuf < charbuf_end && dst < dst_end)
5364             *dst++ = *charbuf++;
5365         }
5366       produced_chars = dst - (coding->destination + coding->produced);
5367     }
5368   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5369   coding->produced_char += produced_chars;
5370   coding->produced = dst - coding->destination;
5371   return 0;
5372 }
5373
5374 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5375    Return true if a text is encoded in a charset-based coding system.  */
5376
5377 static bool
5378 detect_coding_charset (struct coding_system *coding,
5379                        struct coding_detection_info *detect_info)
5380 {
5381   const unsigned char *src = coding->source, *src_base;
5382   const unsigned char *src_end = coding->source + coding->src_bytes;
5383   bool multibytep = coding->src_multibyte;
5384   ptrdiff_t consumed_chars = 0;
5385   Lisp_Object attrs, valids, name;
5386   int found = 0;
5387   ptrdiff_t head_ascii = coding->head_ascii;
5388   bool check_latin_extra = 0;
5389
5390   detect_info->checked |= CATEGORY_MASK_CHARSET;
5391
5392   coding = &coding_categories[coding_category_charset];
5393   attrs = CODING_ID_ATTRS (coding->id);
5394   valids = AREF (attrs, coding_attr_charset_valids);
5395   name = CODING_ID_NAME (coding->id);
5396   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5397                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5398       || strncmp (SSDATA (SYMBOL_NAME (name)),
5399                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5400     check_latin_extra = 1;
5401
5402   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5403     src += head_ascii;
5404
5405   while (1)
5406     {
5407       int c;
5408       Lisp_Object val;
5409       struct charset *charset;
5410       int dim, idx;
5411
5412       src_base = src;
5413       ONE_MORE_BYTE (c);
5414       if (c < 0)
5415         continue;
5416       val = AREF (valids, c);
5417       if (NILP (val))
5418         break;
5419       if (c >= 0x80)
5420         {
5421           if (c < 0xA0
5422               && check_latin_extra
5423               && (!VECTORP (Vlatin_extra_code_table)
5424                   || NILP (AREF (Vlatin_extra_code_table, c))))
5425             break;
5426           found = CATEGORY_MASK_CHARSET;
5427         }
5428       if (INTEGERP (val))
5429         {
5430           charset = CHARSET_FROM_ID (XFASTINT (val));
5431           dim = CHARSET_DIMENSION (charset);
5432           for (idx = 1; idx < dim; idx++)
5433             {
5434               if (src == src_end)
5435                 goto too_short;
5436               ONE_MORE_BYTE (c);
5437               if (c < charset->code_space[(dim - 1 - idx) * 4]
5438                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5439                 break;
5440             }
5441           if (idx < dim)
5442             break;
5443         }
5444       else
5445         {
5446           idx = 1;
5447           for (; CONSP (val); val = XCDR (val))
5448             {
5449               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5450               dim = CHARSET_DIMENSION (charset);
5451               while (idx < dim)
5452                 {
5453                   if (src == src_end)
5454                     goto too_short;
5455                   ONE_MORE_BYTE (c);
5456                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5457                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5458                     break;
5459                   idx++;
5460                 }
5461               if (idx == dim)
5462                 {
5463                   val = Qnil;
5464                   break;
5465                 }
5466             }
5467           if (CONSP (val))
5468             break;
5469         }
5470     }
5471  too_short:
5472   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5473   return 0;
5474
5475  no_more_source:
5476   detect_info->found |= found;
5477   return 1;
5478 }
5479
5480 static void
5481 decode_coding_charset (struct coding_system *coding)
5482 {
5483   const unsigned char *src = coding->source + coding->consumed;
5484   const unsigned char *src_end = coding->source + coding->src_bytes;
5485   const unsigned char *src_base;
5486   int *charbuf = coding->charbuf + coding->charbuf_used;
5487   /* We may produce one charset annotation in one loop and one more at
5488      the end.  */
5489   int *charbuf_end
5490     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5491   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5492   bool multibytep = coding->src_multibyte;
5493   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5494   Lisp_Object valids;
5495   ptrdiff_t char_offset = coding->produced_char;
5496   ptrdiff_t last_offset = char_offset;
5497   int last_id = charset_ascii;
5498   bool eol_dos
5499     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5500   int byte_after_cr = -1;
5501
5502   valids = AREF (attrs, coding_attr_charset_valids);
5503
5504   while (1)
5505     {
5506       int c;
5507       Lisp_Object val;
5508       struct charset *charset;
5509       int dim;
5510       int len = 1;
5511       unsigned code;
5512
5513       src_base = src;
5514       consumed_chars_base = consumed_chars;
5515
5516       if (charbuf >= charbuf_end)
5517         {
5518           if (byte_after_cr >= 0)
5519             src_base--;
5520           break;
5521         }
5522
5523       if (byte_after_cr >= 0)
5524         {
5525           c = byte_after_cr;
5526           byte_after_cr = -1;
5527         }
5528       else
5529         {
5530           ONE_MORE_BYTE (c);
5531           if (eol_dos && c == '\r')
5532             ONE_MORE_BYTE (byte_after_cr);
5533         }
5534       if (c < 0)
5535         goto invalid_code;
5536       code = c;
5537
5538       val = AREF (valids, c);
5539       if (! INTEGERP (val) && ! CONSP (val))
5540         goto invalid_code;
5541       if (INTEGERP (val))
5542         {
5543           charset = CHARSET_FROM_ID (XFASTINT (val));
5544           dim = CHARSET_DIMENSION (charset);
5545           while (len < dim)
5546             {
5547               ONE_MORE_BYTE (c);
5548               code = (code << 8) | c;
5549               len++;
5550             }
5551           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5552                               charset, code, c);
5553         }
5554       else
5555         {
5556           /* VAL is a list of charset IDs.  It is assured that the
5557              list is sorted by charset dimensions (smaller one
5558              comes first).  */
5559           while (CONSP (val))
5560             {
5561               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5562               dim = CHARSET_DIMENSION (charset);
5563               while (len < dim)
5564                 {
5565                   ONE_MORE_BYTE (c);
5566                   code = (code << 8) | c;
5567                   len++;
5568                 }
5569               CODING_DECODE_CHAR (coding, src, src_base,
5570                                   src_end, charset, code, c);
5571               if (c >= 0)
5572                 break;
5573               val = XCDR (val);
5574             }
5575         }
5576       if (c < 0)
5577         goto invalid_code;
5578       if (charset->id != charset_ascii
5579           && last_id != charset->id)
5580         {
5581           if (last_id != charset_ascii)
5582             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5583           last_id = charset->id;
5584           last_offset = char_offset;
5585         }
5586
5587       *charbuf++ = c;
5588       char_offset++;
5589       continue;
5590
5591     invalid_code:
5592       src = src_base;
5593       consumed_chars = consumed_chars_base;
5594       ONE_MORE_BYTE (c);
5595       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5596       char_offset++;
5597       coding->errors++;
5598     }
5599
5600  no_more_source:
5601   if (last_id != charset_ascii)
5602     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5603   coding->consumed_char += consumed_chars_base;
5604   coding->consumed = src_base - coding->source;
5605   coding->charbuf_used = charbuf - coding->charbuf;
5606 }
5607
5608 static bool
5609 encode_coding_charset (struct coding_system *coding)
5610 {
5611   bool multibytep = coding->dst_multibyte;
5612   int *charbuf = coding->charbuf;
5613   int *charbuf_end = charbuf + coding->charbuf_used;
5614   unsigned char *dst = coding->destination + coding->produced;
5615   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5616   int safe_room = MAX_MULTIBYTE_LENGTH;
5617   ptrdiff_t produced_chars = 0;
5618   Lisp_Object attrs, charset_list;
5619   bool ascii_compatible;
5620   int c;
5621
5622   CODING_GET_INFO (coding, attrs, charset_list);
5623   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5624
5625   while (charbuf < charbuf_end)
5626     {
5627       struct charset *charset;
5628       unsigned code;
5629
5630       ASSURE_DESTINATION (safe_room);
5631       c = *charbuf++;
5632       if (ascii_compatible && ASCII_CHAR_P (c))
5633         EMIT_ONE_ASCII_BYTE (c);
5634       else if (CHAR_BYTE8_P (c))
5635         {
5636           c = CHAR_TO_BYTE8 (c);
5637           EMIT_ONE_BYTE (c);
5638         }
5639       else
5640         {
5641           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5642                                &code, charset);
5643
5644           if (charset)
5645             {
5646               if (CHARSET_DIMENSION (charset) == 1)
5647                 EMIT_ONE_BYTE (code);
5648               else if (CHARSET_DIMENSION (charset) == 2)
5649                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5650               else if (CHARSET_DIMENSION (charset) == 3)
5651                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5652               else
5653                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5654                                  (code >> 8) & 0xFF, code & 0xFF);
5655             }
5656           else
5657             {
5658               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5659                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5660               else
5661                 c = coding->default_char;
5662               EMIT_ONE_BYTE (c);
5663             }
5664         }
5665     }
5666
5667   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5668   coding->produced_char += produced_chars;
5669   coding->produced = dst - coding->destination;
5670   return 0;
5671 }
5672
5673 \f
5674 /*** 7. C library functions ***/
5675
5676 /* Setup coding context CODING from information about CODING_SYSTEM.
5677    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5678    CODING_SYSTEM is invalid, signal an error.  */
5679
5680 void
5681 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5682 {
5683   Lisp_Object attrs;
5684   Lisp_Object eol_type;
5685   Lisp_Object coding_type;
5686   Lisp_Object val;
5687
5688   if (NILP (coding_system))
5689     coding_system = Qundecided;
5690
5691   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5692
5693   attrs = CODING_ID_ATTRS (coding->id);
5694   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5695
5696   coding->mode = 0;
5697   if (VECTORP (eol_type))
5698     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5699                             | CODING_REQUIRE_DETECTION_MASK);
5700   else if (! EQ (eol_type, Qunix))
5701     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5702                             | CODING_REQUIRE_ENCODING_MASK);
5703   else
5704     coding->common_flags = 0;
5705   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5706     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5707   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5708     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5709   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5710     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5711
5712   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5713   coding->max_charset_id = SCHARS (val) - 1;
5714   coding->safe_charsets = SDATA (val);
5715   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5716   coding->carryover_bytes = 0;
5717
5718   coding_type = CODING_ATTR_TYPE (attrs);
5719   if (EQ (coding_type, Qundecided))
5720     {
5721       coding->detector = NULL;
5722       coding->decoder = decode_coding_raw_text;
5723       coding->encoder = encode_coding_raw_text;
5724       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5725       coding->spec.undecided.inhibit_nbd
5726         = (encode_inhibit_flag
5727            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5728       coding->spec.undecided.inhibit_ied
5729         = (encode_inhibit_flag
5730            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5731       coding->spec.undecided.prefer_utf_8
5732         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5733     }
5734   else if (EQ (coding_type, Qiso_2022))
5735     {
5736       int i;
5737       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5738
5739       /* Invoke graphic register 0 to plane 0.  */
5740       CODING_ISO_INVOCATION (coding, 0) = 0;
5741       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5742       CODING_ISO_INVOCATION (coding, 1)
5743         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5744       /* Setup the initial status of designation.  */
5745       for (i = 0; i < 4; i++)
5746         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5747       /* Not single shifting initially.  */
5748       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5749       /* Beginning of buffer should also be regarded as bol. */
5750       CODING_ISO_BOL (coding) = 1;
5751       coding->detector = detect_coding_iso_2022;
5752       coding->decoder = decode_coding_iso_2022;
5753       coding->encoder = encode_coding_iso_2022;
5754       if (flags & CODING_ISO_FLAG_SAFE)
5755         coding->mode |= CODING_MODE_SAFE_ENCODING;
5756       coding->common_flags
5757         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5758             | CODING_REQUIRE_FLUSHING_MASK);
5759       if (flags & CODING_ISO_FLAG_COMPOSITION)
5760         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5761       if (flags & CODING_ISO_FLAG_DESIGNATION)
5762         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5763       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5764         {
5765           setup_iso_safe_charsets (attrs);
5766           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5767           coding->max_charset_id = SCHARS (val) - 1;
5768           coding->safe_charsets = SDATA (val);
5769         }
5770       CODING_ISO_FLAGS (coding) = flags;
5771       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5772       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5773       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5774       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5775     }
5776   else if (EQ (coding_type, Qcharset))
5777     {
5778       coding->detector = detect_coding_charset;
5779       coding->decoder = decode_coding_charset;
5780       coding->encoder = encode_coding_charset;
5781       coding->common_flags
5782         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5783     }
5784   else if (EQ (coding_type, Qutf_8))
5785     {
5786       val = AREF (attrs, coding_attr_utf_bom);
5787       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5788                                    : EQ (val, Qt) ? utf_with_bom
5789                                    : utf_without_bom);
5790       coding->detector = detect_coding_utf_8;
5791       coding->decoder = decode_coding_utf_8;
5792       coding->encoder = encode_coding_utf_8;
5793       coding->common_flags
5794         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5795       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5796         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5797     }
5798   else if (EQ (coding_type, Qutf_16))
5799     {
5800       val = AREF (attrs, coding_attr_utf_bom);
5801       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5802                                     : EQ (val, Qt) ? utf_with_bom
5803                                     : utf_without_bom);
5804       val = AREF (attrs, coding_attr_utf_16_endian);
5805       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5806                                        : utf_16_little_endian);
5807       CODING_UTF_16_SURROGATE (coding) = 0;
5808       coding->detector = detect_coding_utf_16;
5809       coding->decoder = decode_coding_utf_16;
5810       coding->encoder = encode_coding_utf_16;
5811       coding->common_flags
5812         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5813       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5814         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5815     }
5816   else if (EQ (coding_type, Qccl))
5817     {
5818       coding->detector = detect_coding_ccl;
5819       coding->decoder = decode_coding_ccl;
5820       coding->encoder = encode_coding_ccl;
5821       coding->common_flags
5822         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5823             | CODING_REQUIRE_FLUSHING_MASK);
5824     }
5825   else if (EQ (coding_type, Qemacs_mule))
5826     {
5827       coding->detector = detect_coding_emacs_mule;
5828       coding->decoder = decode_coding_emacs_mule;
5829       coding->encoder = encode_coding_emacs_mule;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5833           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5834         {
5835           Lisp_Object tail, safe_charsets;
5836           int max_charset_id = 0;
5837
5838           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5839                tail = XCDR (tail))
5840             if (max_charset_id < XFASTINT (XCAR (tail)))
5841               max_charset_id = XFASTINT (XCAR (tail));
5842           safe_charsets = make_uninit_string (max_charset_id + 1);
5843           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5844           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5845                tail = XCDR (tail))
5846             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5847           coding->max_charset_id = max_charset_id;
5848           coding->safe_charsets = SDATA (safe_charsets);
5849         }
5850       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5851       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5852     }
5853   else if (EQ (coding_type, Qshift_jis))
5854     {
5855       coding->detector = detect_coding_sjis;
5856       coding->decoder = decode_coding_sjis;
5857       coding->encoder = encode_coding_sjis;
5858       coding->common_flags
5859         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5860     }
5861   else if (EQ (coding_type, Qbig5))
5862     {
5863       coding->detector = detect_coding_big5;
5864       coding->decoder = decode_coding_big5;
5865       coding->encoder = encode_coding_big5;
5866       coding->common_flags
5867         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5868     }
5869   else                          /* EQ (coding_type, Qraw_text) */
5870     {
5871       coding->detector = NULL;
5872       coding->decoder = decode_coding_raw_text;
5873       coding->encoder = encode_coding_raw_text;
5874       if (! EQ (eol_type, Qunix))
5875         {
5876           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5877           if (! VECTORP (eol_type))
5878             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5879         }
5880
5881     }
5882
5883   return;
5884 }
5885
5886 /* Return a list of charsets supported by CODING.  */
5887
5888 Lisp_Object
5889 coding_charset_list (struct coding_system *coding)
5890 {
5891   Lisp_Object attrs, charset_list;
5892
5893   CODING_GET_INFO (coding, attrs, charset_list);
5894   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5895     {
5896       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5897
5898       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5899         charset_list = Viso_2022_charset_list;
5900     }
5901   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5902     {
5903       charset_list = Vemacs_mule_charset_list;
5904     }
5905   return charset_list;
5906 }
5907
5908
5909 /* Return a list of charsets supported by CODING-SYSTEM.  */
5910
5911 Lisp_Object
5912 coding_system_charset_list (Lisp_Object coding_system)
5913 {
5914   ptrdiff_t id;
5915   Lisp_Object attrs, charset_list;
5916
5917   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5918   attrs = CODING_ID_ATTRS (id);
5919
5920   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5921     {
5922       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5923
5924       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5925         charset_list = Viso_2022_charset_list;
5926       else
5927         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5928     }
5929   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5930     {
5931       charset_list = Vemacs_mule_charset_list;
5932     }
5933   else
5934     {
5935       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5936     }
5937   return charset_list;
5938 }
5939
5940
5941 /* Return raw-text or one of its subsidiaries that has the same
5942    eol_type as CODING-SYSTEM.  */
5943
5944 Lisp_Object
5945 raw_text_coding_system (Lisp_Object coding_system)
5946 {
5947   Lisp_Object spec, attrs;
5948   Lisp_Object eol_type, raw_text_eol_type;
5949
5950   if (NILP (coding_system))
5951     return Qraw_text;
5952   spec = CODING_SYSTEM_SPEC (coding_system);
5953   attrs = AREF (spec, 0);
5954
5955   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5956     return coding_system;
5957
5958   eol_type = AREF (spec, 2);
5959   if (VECTORP (eol_type))
5960     return Qraw_text;
5961   spec = CODING_SYSTEM_SPEC (Qraw_text);
5962   raw_text_eol_type = AREF (spec, 2);
5963   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5964           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5965           : AREF (raw_text_eol_type, 2));
5966 }
5967
5968
5969 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5970    the subsidiary that has the same eol-spec as PARENT (if it is not
5971    nil and specifies end-of-line format) or the system's setting
5972    (system_eol_type).  */
5973
5974 Lisp_Object
5975 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5976 {
5977   Lisp_Object spec, eol_type;
5978
5979   if (NILP (coding_system))
5980     coding_system = Qraw_text;
5981   spec = CODING_SYSTEM_SPEC (coding_system);
5982   eol_type = AREF (spec, 2);
5983   if (VECTORP (eol_type))
5984     {
5985       Lisp_Object parent_eol_type;
5986
5987       if (! NILP (parent))
5988         {
5989           Lisp_Object parent_spec;
5990
5991           parent_spec = CODING_SYSTEM_SPEC (parent);
5992           parent_eol_type = AREF (parent_spec, 2);
5993           if (VECTORP (parent_eol_type))
5994             parent_eol_type = system_eol_type;
5995         }
5996       else
5997         parent_eol_type = system_eol_type;
5998       if (EQ (parent_eol_type, Qunix))
5999         coding_system = AREF (eol_type, 0);
6000       else if (EQ (parent_eol_type, Qdos))
6001         coding_system = AREF (eol_type, 1);
6002       else if (EQ (parent_eol_type, Qmac))
6003         coding_system = AREF (eol_type, 2);
6004     }
6005   return coding_system;
6006 }
6007
6008
6009 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6010    decided for writing to a process.  If not, complement them, and
6011    return a new coding system.  */
6012
6013 Lisp_Object
6014 complement_process_encoding_system (Lisp_Object coding_system)
6015 {
6016   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6017   Lisp_Object spec, attrs;
6018   int i;
6019
6020   for (i = 0; i < 3; i++)
6021     {
6022       if (i == 1)
6023         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6024       else if (i == 2)
6025         coding_system = preferred_coding_system ();
6026       spec = CODING_SYSTEM_SPEC (coding_system);
6027       if (NILP (spec))
6028         continue;
6029       attrs = AREF (spec, 0);
6030       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6031         coding_base = CODING_ATTR_BASE_NAME (attrs);
6032       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6033         eol_base = coding_system;
6034       if (! NILP (coding_base) && ! NILP (eol_base))
6035         break;
6036     }
6037
6038   if (i > 0)
6039     /* The original CODING_SYSTEM didn't specify text-conversion or
6040        eol-conversion.  Be sure that we return a fully complemented
6041        coding system.  */
6042     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6043   return coding_system;
6044 }
6045
6046
6047 /* Emacs has a mechanism to automatically detect a coding system if it
6048    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6049    it's impossible to distinguish some coding systems accurately
6050    because they use the same range of codes.  So, at first, coding
6051    systems are categorized into 7, those are:
6052
6053    o coding-category-emacs-mule
6054
6055         The category for a coding system which has the same code range
6056         as Emacs' internal format.  Assigned the coding-system (Lisp
6057         symbol) `emacs-mule' by default.
6058
6059    o coding-category-sjis
6060
6061         The category for a coding system which has the same code range
6062         as SJIS.  Assigned the coding-system (Lisp
6063         symbol) `japanese-shift-jis' by default.
6064
6065    o coding-category-iso-7
6066
6067         The category for a coding system which has the same code range
6068         as ISO2022 of 7-bit environment.  This doesn't use any locking
6069         shift and single shift functions.  This can encode/decode all
6070         charsets.  Assigned the coding-system (Lisp symbol)
6071         `iso-2022-7bit' by default.
6072
6073    o coding-category-iso-7-tight
6074
6075         Same as coding-category-iso-7 except that this can
6076         encode/decode only the specified charsets.
6077
6078    o coding-category-iso-8-1
6079
6080         The category for a coding system which has the same code range
6081         as ISO2022 of 8-bit environment and graphic plane 1 used only
6082         for DIMENSION1 charset.  This doesn't use any locking shift
6083         and single shift functions.  Assigned the coding-system (Lisp
6084         symbol) `iso-latin-1' by default.
6085
6086    o coding-category-iso-8-2
6087
6088         The category for a coding system which has the same code range
6089         as ISO2022 of 8-bit environment and graphic plane 1 used only
6090         for DIMENSION2 charset.  This doesn't use any locking shift
6091         and single shift functions.  Assigned the coding-system (Lisp
6092         symbol) `japanese-iso-8bit' by default.
6093
6094    o coding-category-iso-7-else
6095
6096         The category for a coding system which has the same code range
6097         as ISO2022 of 7-bit environment but uses locking shift or
6098         single shift functions.  Assigned the coding-system (Lisp
6099         symbol) `iso-2022-7bit-lock' by default.
6100
6101    o coding-category-iso-8-else
6102
6103         The category for a coding system which has the same code range
6104         as ISO2022 of 8-bit environment but uses locking shift or
6105         single shift functions.  Assigned the coding-system (Lisp
6106         symbol) `iso-2022-8bit-ss2' by default.
6107
6108    o coding-category-big5
6109
6110         The category for a coding system which has the same code range
6111         as BIG5.  Assigned the coding-system (Lisp symbol)
6112         `cn-big5' by default.
6113
6114    o coding-category-utf-8
6115
6116         The category for a coding system which has the same code range
6117         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6118         symbol) `utf-8' by default.
6119
6120    o coding-category-utf-16-be
6121
6122         The category for a coding system in which a text has an
6123         Unicode signature (cf. Unicode Standard) in the order of BIG
6124         endian at the head.  Assigned the coding-system (Lisp symbol)
6125         `utf-16-be' by default.
6126
6127    o coding-category-utf-16-le
6128
6129         The category for a coding system in which a text has an
6130         Unicode signature (cf. Unicode Standard) in the order of
6131         LITTLE endian at the head.  Assigned the coding-system (Lisp
6132         symbol) `utf-16-le' by default.
6133
6134    o coding-category-ccl
6135
6136         The category for a coding system of which encoder/decoder is
6137         written in CCL programs.  The default value is nil, i.e., no
6138         coding system is assigned.
6139
6140    o coding-category-binary
6141
6142         The category for a coding system not categorized in any of the
6143         above.  Assigned the coding-system (Lisp symbol)
6144         `no-conversion' by default.
6145
6146    Each of them is a Lisp symbol and the value is an actual
6147    `coding-system's (this is also a Lisp symbol) assigned by a user.
6148    What Emacs does actually is to detect a category of coding system.
6149    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6150    decide only one possible category, it selects a category of the
6151    highest priority.  Priorities of categories are also specified by a
6152    user in a Lisp variable `coding-category-list'.
6153
6154 */
6155
6156 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6157                                            int eol_seen);
6158
6159
6160 /* Return the number of ASCII characters at the head of the source.
6161    By side effects, set coding->head_ascii and update
6162    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6163    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6164    reliable only when all the source bytes are ASCII.  */
6165
6166 static int
6167 check_ascii (struct coding_system *coding)
6168 {
6169   const unsigned char *src, *end;
6170   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6171   int eol_seen = coding->eol_seen;
6172
6173   coding_set_source (coding);
6174   src = coding->source;
6175   end = src + coding->src_bytes;
6176
6177   if (inhibit_eol_conversion
6178       || SYMBOLP (eol_type))
6179     {
6180       /* We don't have to check EOL format.  */
6181       while (src < end && !( *src & 0x80))
6182         {
6183           if (*src++ == '\n')
6184             eol_seen |= EOL_SEEN_LF;
6185         }
6186     }
6187   else
6188     {
6189       end--;                /* We look ahead one byte for "CR LF".  */
6190       while (src < end)
6191         {
6192           int c = *src;
6193
6194           if (c & 0x80)
6195             break;
6196           src++;
6197           if (c == '\r')
6198             {
6199               if (*src == '\n')
6200                 {
6201                   eol_seen |= EOL_SEEN_CRLF;
6202                   src++;
6203                 }
6204               else
6205                 eol_seen |= EOL_SEEN_CR;
6206             }
6207           else if (c == '\n')
6208             eol_seen |= EOL_SEEN_LF;
6209         }
6210       if (src == end)
6211         {
6212           int c = *src;
6213
6214           /* All bytes but the last one C are ASCII.  */
6215           if (! (c & 0x80))
6216             {
6217               if (c == '\r')
6218                 eol_seen |= EOL_SEEN_CR;
6219               else if (c  == '\n')
6220                 eol_seen |= EOL_SEEN_LF;
6221               src++;
6222             }
6223         }
6224     }
6225   coding->head_ascii = src - coding->source;
6226   coding->eol_seen = eol_seen;
6227   return (coding->head_ascii);
6228 }
6229
6230
6231 /* Return the number of characters at the source if all the bytes are
6232    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6233    effects, update coding->eol_seen.  The value of coding->eol_seen is
6234    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6235    the value is reliable only when all the source bytes are valid
6236    UTF-8.  */
6237
6238 static int
6239 check_utf_8 (struct coding_system *coding)
6240 {
6241   const unsigned char *src, *end;
6242   int eol_seen;
6243   int nchars = coding->head_ascii;
6244
6245   if (coding->head_ascii < 0)
6246     check_ascii (coding);
6247   else
6248     coding_set_source (coding);
6249   src = coding->source + coding->head_ascii;
6250   /* We look ahead one byte for CR LF.  */
6251   end = coding->source + coding->src_bytes - 1;
6252   eol_seen = coding->eol_seen;
6253   while (src < end)
6254     {
6255       int c = *src;
6256
6257       if (UTF_8_1_OCTET_P (*src))
6258         {
6259           src++;
6260           if (c < 0x20)
6261             {
6262               if (c == '\r')
6263                 {
6264                   if (*src == '\n')
6265                     {
6266                       eol_seen |= EOL_SEEN_CRLF;
6267                       src++;
6268                       nchars++;
6269                     }
6270                   else
6271                     eol_seen |= EOL_SEEN_CR;
6272                 }
6273               else if (c == '\n')
6274                 eol_seen |= EOL_SEEN_LF;
6275             }
6276         }
6277       else if (UTF_8_2_OCTET_LEADING_P (c))
6278         {
6279           if (c < 0xC2          /* overlong sequence */
6280               || src + 1 >= end
6281               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6282             return -1;
6283           src += 2;
6284         }
6285       else if (UTF_8_3_OCTET_LEADING_P (c))
6286         {
6287           if (src + 2 >= end
6288               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6289                     && UTF_8_EXTRA_OCTET_P (src[2])))
6290             return -1;
6291           c = (((c & 0xF) << 12)
6292                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6293           if (c < 0x800                       /* overlong sequence */
6294               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6295             return -1;
6296           src += 3;
6297         }
6298       else if (UTF_8_4_OCTET_LEADING_P (c))
6299         {
6300           if (src + 3 >= end
6301               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6302                     && UTF_8_EXTRA_OCTET_P (src[2])
6303                     && UTF_8_EXTRA_OCTET_P (src[3])))
6304             return -1;
6305           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6306                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6307           if (c < 0x10000       /* overlong sequence */
6308               || c >= 0x110000) /* non-Unicode character  */
6309             return -1;
6310           src += 4;
6311         }
6312       else
6313         return -1;
6314       nchars++;
6315     }
6316
6317   if (src == end)
6318     {
6319       if (! UTF_8_1_OCTET_P (*src))
6320         return -1;
6321       nchars++;
6322       if (*src == '\r')
6323         eol_seen |= EOL_SEEN_CR;
6324       else if (*src  == '\n')
6325         eol_seen |= EOL_SEEN_LF;
6326     }
6327   coding->eol_seen = eol_seen;
6328   return nchars;
6329 }
6330
6331
6332 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6333    SOURCE is encoded.  If CATEGORY is one of
6334    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6335    two-byte, else they are encoded by one-byte.
6336
6337    Return one of EOL_SEEN_XXX.  */
6338
6339 #define MAX_EOL_CHECK_COUNT 3
6340
6341 static int
6342 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6343             enum coding_category category)
6344 {
6345   const unsigned char *src = source, *src_end = src + src_bytes;
6346   unsigned char c;
6347   int total  = 0;
6348   int eol_seen = EOL_SEEN_NONE;
6349
6350   if ((1 << category) & CATEGORY_MASK_UTF_16)
6351     {
6352       bool msb = category == (coding_category_utf_16_le
6353                               | coding_category_utf_16_le_nosig);
6354       bool lsb = !msb;
6355
6356       while (src + 1 < src_end)
6357         {
6358           c = src[lsb];
6359           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6360             {
6361               int this_eol;
6362
6363               if (c == '\n')
6364                 this_eol = EOL_SEEN_LF;
6365               else if (src + 3 >= src_end
6366                        || src[msb + 2] != 0
6367                        || src[lsb + 2] != '\n')
6368                 this_eol = EOL_SEEN_CR;
6369               else
6370                 {
6371                   this_eol = EOL_SEEN_CRLF;
6372                   src += 2;
6373                 }
6374
6375               if (eol_seen == EOL_SEEN_NONE)
6376                 /* This is the first end-of-line.  */
6377                 eol_seen = this_eol;
6378               else if (eol_seen != this_eol)
6379                 {
6380                   /* The found type is different from what found before.
6381                      Allow for stray ^M characters in DOS EOL files.  */
6382                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6383                       || (eol_seen == EOL_SEEN_CRLF
6384                           && this_eol == EOL_SEEN_CR))
6385                     eol_seen = EOL_SEEN_CRLF;
6386                   else
6387                     {
6388                       eol_seen = EOL_SEEN_LF;
6389                       break;
6390                     }
6391                 }
6392               if (++total == MAX_EOL_CHECK_COUNT)
6393                 break;
6394             }
6395           src += 2;
6396         }
6397     }
6398   else
6399     while (src < src_end)
6400       {
6401         c = *src++;
6402         if (c == '\n' || c == '\r')
6403           {
6404             int this_eol;
6405
6406             if (c == '\n')
6407               this_eol = EOL_SEEN_LF;
6408             else if (src >= src_end || *src != '\n')
6409               this_eol = EOL_SEEN_CR;
6410             else
6411               this_eol = EOL_SEEN_CRLF, src++;
6412
6413             if (eol_seen == EOL_SEEN_NONE)
6414               /* This is the first end-of-line.  */
6415               eol_seen = this_eol;
6416             else if (eol_seen != this_eol)
6417               {
6418                 /* The found type is different from what found before.
6419                    Allow for stray ^M characters in DOS EOL files.  */
6420                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6421                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6422                   eol_seen = EOL_SEEN_CRLF;
6423                 else
6424                   {
6425                     eol_seen = EOL_SEEN_LF;
6426                     break;
6427                   }
6428               }
6429             if (++total == MAX_EOL_CHECK_COUNT)
6430               break;
6431           }
6432       }
6433   return eol_seen;
6434 }
6435
6436
6437 static Lisp_Object
6438 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6439 {
6440   Lisp_Object eol_type;
6441
6442   eol_type = CODING_ID_EOL_TYPE (coding->id);
6443   if (! VECTORP (eol_type))
6444     /* Already adjusted.  */
6445     return eol_type;
6446   if (eol_seen & EOL_SEEN_LF)
6447     {
6448       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6449       eol_type = Qunix;
6450     }
6451   else if (eol_seen & EOL_SEEN_CRLF)
6452     {
6453       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6454       eol_type = Qdos;
6455     }
6456   else if (eol_seen & EOL_SEEN_CR)
6457     {
6458       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6459       eol_type = Qmac;
6460     }
6461   return eol_type;
6462 }
6463
6464 /* Detect how a text specified in CODING is encoded.  If a coding
6465    system is detected, update fields of CODING by the detected coding
6466    system.  */
6467
6468 static void
6469 detect_coding (struct coding_system *coding)
6470 {
6471   const unsigned char *src, *src_end;
6472   unsigned int saved_mode = coding->mode;
6473   Lisp_Object found = Qnil;
6474   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6475
6476   coding->consumed = coding->consumed_char = 0;
6477   coding->produced = coding->produced_char = 0;
6478   coding_set_source (coding);
6479
6480   src_end = coding->source + coding->src_bytes;
6481
6482   coding->eol_seen = EOL_SEEN_NONE;
6483   /* If we have not yet decided the text encoding type, detect it
6484      now.  */
6485   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6486     {
6487       int c, i;
6488       struct coding_detection_info detect_info;
6489       bool null_byte_found = 0, eight_bit_found = 0;
6490       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6491                                        inhibit_null_byte_detection);
6492       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6493                                        inhibit_iso_escape_detection);
6494       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6495
6496       coding->head_ascii = 0;
6497       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6498       for (src = coding->source; src < src_end; src++)
6499         {
6500           c = *src;
6501           if (c & 0x80)
6502             {
6503               eight_bit_found = 1;
6504               if (null_byte_found)
6505                 break;
6506             }
6507           else if (c < 0x20)
6508             {
6509               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6510                   && ! inhibit_ied
6511                   && ! detect_info.checked)
6512                 {
6513                   if (detect_coding_iso_2022 (coding, &detect_info))
6514                     {
6515                       /* We have scanned the whole data.  */
6516                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6517                         {
6518                           /* We didn't find an 8-bit code.  We may
6519                              have found a null-byte, but it's very
6520                              rare that a binary file conforms to
6521                              ISO-2022.  */
6522                           src = src_end;
6523                           coding->head_ascii = src - coding->source;
6524                         }
6525                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6526                       break;
6527                     }
6528                 }
6529               else if (! c && !inhibit_nbd)
6530                 {
6531                   null_byte_found = 1;
6532                   if (eight_bit_found)
6533                     break;
6534                 }
6535               else if (! disable_ascii_optimization
6536                        && ! inhibit_eol_conversion)
6537                 {
6538                   if (c == '\r')
6539                     {
6540                       if (src < src_end && src[1] == '\n')
6541                         {
6542                           coding->eol_seen |= EOL_SEEN_CRLF;
6543                           src++;
6544                           if (! eight_bit_found)
6545                             coding->head_ascii++;
6546                         }
6547                       else
6548                         coding->eol_seen |= EOL_SEEN_CR;
6549                     }
6550                   else if (c == '\n')
6551                     {
6552                       coding->eol_seen |= EOL_SEEN_LF;
6553                     }
6554                 }
6555
6556               if (! eight_bit_found)
6557                 coding->head_ascii++;
6558             }
6559           else if (! eight_bit_found)
6560             coding->head_ascii++;
6561         }
6562
6563       if (null_byte_found || eight_bit_found
6564           || coding->head_ascii < coding->src_bytes
6565           || detect_info.found)
6566         {
6567           enum coding_category category;
6568           struct coding_system *this;
6569
6570           if (coding->head_ascii == coding->src_bytes)
6571             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6572             for (i = 0; i < coding_category_raw_text; i++)
6573               {
6574                 category = coding_priorities[i];
6575                 this = coding_categories + category;
6576                 if (detect_info.found & (1 << category))
6577                   break;
6578               }
6579           else
6580             {
6581               if (null_byte_found)
6582                 {
6583                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6584                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6585                 }
6586               else if (prefer_utf_8
6587                        && detect_coding_utf_8 (coding, &detect_info))
6588                 {
6589                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6590                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6591                 }
6592               for (i = 0; i < coding_category_raw_text; i++)
6593                 {
6594                   category = coding_priorities[i];
6595                   this = coding_categories + category;
6596                   /* Some of this->detector (e.g. detect_coding_sjis)
6597                      require this information.  */
6598                   coding->id = this->id;
6599                   if (this->id < 0)
6600                     {
6601                       /* No coding system of this category is defined.  */
6602                       detect_info.rejected |= (1 << category);
6603                     }
6604                   else if (category >= coding_category_raw_text)
6605                     continue;
6606                   else if (detect_info.checked & (1 << category))
6607                     {
6608                       if (detect_info.found & (1 << category))
6609                         break;
6610                     }
6611                   else if ((*(this->detector)) (coding, &detect_info)
6612                            && detect_info.found & (1 << category))
6613                     break;
6614                 }
6615             }
6616
6617           if (i < coding_category_raw_text)
6618             {
6619               if (category == coding_category_utf_8_auto)
6620                 {
6621                   Lisp_Object coding_systems;
6622
6623                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6624                                          coding_attr_utf_bom);
6625                   if (CONSP (coding_systems))
6626                     {
6627                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6628                         found = XCAR (coding_systems);
6629                       else
6630                         found = XCDR (coding_systems);
6631                     }
6632                   else
6633                     found = CODING_ID_NAME (this->id);
6634                 }
6635               else if (category == coding_category_utf_16_auto)
6636                 {
6637                   Lisp_Object coding_systems;
6638
6639                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6640                                          coding_attr_utf_bom);
6641                   if (CONSP (coding_systems))
6642                     {
6643                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6644                         found = XCAR (coding_systems);
6645                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6646                         found = XCDR (coding_systems);
6647                     }
6648                   else
6649                     found = CODING_ID_NAME (this->id);
6650                 }
6651               else
6652                 found = CODING_ID_NAME (this->id);
6653             }
6654           else if (null_byte_found)
6655             found = Qno_conversion;
6656           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6657                    == CATEGORY_MASK_ANY)
6658             found = Qraw_text;
6659           else if (detect_info.rejected)
6660             for (i = 0; i < coding_category_raw_text; i++)
6661               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6662                 {
6663                   this = coding_categories + coding_priorities[i];
6664                   found = CODING_ID_NAME (this->id);
6665                   break;
6666                 }
6667         }
6668     }
6669   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6670            == coding_category_utf_8_auto)
6671     {
6672       Lisp_Object coding_systems;
6673       struct coding_detection_info detect_info;
6674
6675       coding_systems
6676         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6677       detect_info.found = detect_info.rejected = 0;
6678       if (check_ascii (coding) == coding->src_bytes)
6679         {
6680           if (CONSP (coding_systems))
6681             found = XCDR (coding_systems);
6682         }
6683       else
6684         {
6685           if (CONSP (coding_systems)
6686               && detect_coding_utf_8 (coding, &detect_info))
6687             {
6688               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6689                 found = XCAR (coding_systems);
6690               else
6691                 found = XCDR (coding_systems);
6692             }
6693         }
6694     }
6695   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6696            == coding_category_utf_16_auto)
6697     {
6698       Lisp_Object coding_systems;
6699       struct coding_detection_info detect_info;
6700
6701       coding_systems
6702         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6703       detect_info.found = detect_info.rejected = 0;
6704       coding->head_ascii = 0;
6705       if (CONSP (coding_systems)
6706           && detect_coding_utf_16 (coding, &detect_info))
6707         {
6708           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6709             found = XCAR (coding_systems);
6710           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6711             found = XCDR (coding_systems);
6712         }
6713     }
6714
6715   if (! NILP (found))
6716     {
6717       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6718                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6719                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6720                            : EOL_SEEN_LF);
6721
6722       setup_coding_system (found, coding);
6723       if (specified_eol != EOL_SEEN_NONE)
6724         adjust_coding_eol_type (coding, specified_eol);
6725     }
6726
6727   coding->mode = saved_mode;
6728 }
6729
6730
6731 static void
6732 decode_eol (struct coding_system *coding)
6733 {
6734   Lisp_Object eol_type;
6735   unsigned char *p, *pbeg, *pend;
6736
6737   eol_type = CODING_ID_EOL_TYPE (coding->id);
6738   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6739     return;
6740
6741   if (NILP (coding->dst_object))
6742     pbeg = coding->destination;
6743   else
6744     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6745   pend = pbeg + coding->produced;
6746
6747   if (VECTORP (eol_type))
6748     {
6749       int eol_seen = EOL_SEEN_NONE;
6750
6751       for (p = pbeg; p < pend; p++)
6752         {
6753           if (*p == '\n')
6754             eol_seen |= EOL_SEEN_LF;
6755           else if (*p == '\r')
6756             {
6757               if (p + 1 < pend && *(p + 1) == '\n')
6758                 {
6759                   eol_seen |= EOL_SEEN_CRLF;
6760                   p++;
6761                 }
6762               else
6763                 eol_seen |= EOL_SEEN_CR;
6764             }
6765         }
6766       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6767       if ((eol_seen & EOL_SEEN_CRLF) != 0
6768           && (eol_seen & EOL_SEEN_CR) != 0
6769           && (eol_seen & EOL_SEEN_LF) == 0)
6770         eol_seen = EOL_SEEN_CRLF;
6771       else if (eol_seen != EOL_SEEN_NONE
6772           && eol_seen != EOL_SEEN_LF
6773           && eol_seen != EOL_SEEN_CRLF
6774           && eol_seen != EOL_SEEN_CR)
6775         eol_seen = EOL_SEEN_LF;
6776       if (eol_seen != EOL_SEEN_NONE)
6777         eol_type = adjust_coding_eol_type (coding, eol_seen);
6778     }
6779
6780   if (EQ (eol_type, Qmac))
6781     {
6782       for (p = pbeg; p < pend; p++)
6783         if (*p == '\r')
6784           *p = '\n';
6785     }
6786   else if (EQ (eol_type, Qdos))
6787     {
6788       ptrdiff_t n = 0;
6789
6790       if (NILP (coding->dst_object))
6791         {
6792           /* Start deleting '\r' from the tail to minimize the memory
6793              movement.  */
6794           for (p = pend - 2; p >= pbeg; p--)
6795             if (*p == '\r')
6796               {
6797                 memmove (p, p + 1, pend-- - p - 1);
6798                 n++;
6799               }
6800         }
6801       else
6802         {
6803           ptrdiff_t pos_byte = coding->dst_pos_byte;
6804           ptrdiff_t pos = coding->dst_pos;
6805           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6806
6807           while (pos < pos_end)
6808             {
6809               p = BYTE_POS_ADDR (pos_byte);
6810               if (*p == '\r' && p[1] == '\n')
6811                 {
6812                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6813                   n++;
6814                   pos_end--;
6815                 }
6816               pos++;
6817               if (coding->dst_multibyte)
6818                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6819               else
6820                 pos_byte++;
6821             }
6822         }
6823       coding->produced -= n;
6824       coding->produced_char -= n;
6825     }
6826 }
6827
6828
6829 /* Return a translation table (or list of them) from coding system
6830    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6831    not ENCODEP). */
6832
6833 static Lisp_Object
6834 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6835 {
6836   Lisp_Object standard, translation_table;
6837   Lisp_Object val;
6838
6839   if (NILP (Venable_character_translation))
6840     {
6841       if (max_lookup)
6842         *max_lookup = 0;
6843       return Qnil;
6844     }
6845   if (encodep)
6846     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6847       standard = Vstandard_translation_table_for_encode;
6848   else
6849     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6850       standard = Vstandard_translation_table_for_decode;
6851   if (NILP (translation_table))
6852     translation_table = standard;
6853   else
6854     {
6855       if (SYMBOLP (translation_table))
6856         translation_table = Fget (translation_table, Qtranslation_table);
6857       else if (CONSP (translation_table))
6858         {
6859           translation_table = Fcopy_sequence (translation_table);
6860           for (val = translation_table; CONSP (val); val = XCDR (val))
6861             if (SYMBOLP (XCAR (val)))
6862               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6863         }
6864       if (CHAR_TABLE_P (standard))
6865         {
6866           if (CONSP (translation_table))
6867             translation_table = nconc2 (translation_table,
6868                                         Fcons (standard, Qnil));
6869           else
6870             translation_table = Fcons (translation_table,
6871                                        Fcons (standard, Qnil));
6872         }
6873     }
6874
6875   if (max_lookup)
6876     {
6877       *max_lookup = 1;
6878       if (CHAR_TABLE_P (translation_table)
6879           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6880         {
6881           val = XCHAR_TABLE (translation_table)->extras[1];
6882           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6883             *max_lookup = XFASTINT (val);
6884         }
6885       else if (CONSP (translation_table))
6886         {
6887           Lisp_Object tail;
6888
6889           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6890             if (CHAR_TABLE_P (XCAR (tail))
6891                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6892               {
6893                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6894                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6895                   *max_lookup = XFASTINT (tailval);
6896               }
6897         }
6898     }
6899   return translation_table;
6900 }
6901
6902 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6903   do {                                                          \
6904     trans = Qnil;                                               \
6905     if (CHAR_TABLE_P (table))                                   \
6906       {                                                         \
6907         trans = CHAR_TABLE_REF (table, c);                      \
6908         if (CHARACTERP (trans))                                 \
6909           c = XFASTINT (trans), trans = Qnil;                   \
6910       }                                                         \
6911     else if (CONSP (table))                                     \
6912       {                                                         \
6913         Lisp_Object tail;                                       \
6914                                                                 \
6915         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6916           if (CHAR_TABLE_P (XCAR (tail)))                       \
6917             {                                                   \
6918               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6919               if (CHARACTERP (trans))                           \
6920                 c = XFASTINT (trans), trans = Qnil;             \
6921               else if (! NILP (trans))                          \
6922                 break;                                          \
6923             }                                                   \
6924       }                                                         \
6925   } while (0)
6926
6927
6928 /* Return a translation of character(s) at BUF according to TRANS.
6929    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6930    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6931    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6932    translation is found, and Qnil if not found..
6933    If BUF is too short to lookup characters in FROM, return Qt.  */
6934
6935 static Lisp_Object
6936 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6937 {
6938
6939   if (INTEGERP (trans))
6940     return trans;
6941   for (; CONSP (trans); trans = XCDR (trans))
6942     {
6943       Lisp_Object val = XCAR (trans);
6944       Lisp_Object from = XCAR (val);
6945       ptrdiff_t len = ASIZE (from);
6946       ptrdiff_t i;
6947
6948       for (i = 0; i < len; i++)
6949         {
6950           if (buf + i == buf_end)
6951             return Qt;
6952           if (XINT (AREF (from, i)) != buf[i])
6953             break;
6954         }
6955       if (i == len)
6956         return val;
6957     }
6958   return Qnil;
6959 }
6960
6961
6962 static int
6963 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6964                bool last_block)
6965 {
6966   unsigned char *dst = coding->destination + coding->produced;
6967   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6968   ptrdiff_t produced;
6969   ptrdiff_t produced_chars = 0;
6970   int carryover = 0;
6971
6972   if (! coding->chars_at_source)
6973     {
6974       /* Source characters are in coding->charbuf.  */
6975       int *buf = coding->charbuf;
6976       int *buf_end = buf + coding->charbuf_used;
6977
6978       if (EQ (coding->src_object, coding->dst_object))
6979         {
6980           coding_set_source (coding);
6981           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6982         }
6983
6984       while (buf < buf_end)
6985         {
6986           int c = *buf;
6987           ptrdiff_t i;
6988
6989           if (c >= 0)
6990             {
6991               ptrdiff_t from_nchars = 1, to_nchars = 1;
6992               Lisp_Object trans = Qnil;
6993
6994               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6995               if (! NILP (trans))
6996                 {
6997                   trans = get_translation (trans, buf, buf_end);
6998                   if (INTEGERP (trans))
6999                     c = XINT (trans);
7000                   else if (CONSP (trans))
7001                     {
7002                       from_nchars = ASIZE (XCAR (trans));
7003                       trans = XCDR (trans);
7004                       if (INTEGERP (trans))
7005                         c = XINT (trans);
7006                       else
7007                         {
7008                           to_nchars = ASIZE (trans);
7009                           c = XINT (AREF (trans, 0));
7010                         }
7011                     }
7012                   else if (EQ (trans, Qt) && ! last_block)
7013                     break;
7014                 }
7015
7016               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7017                 {
7018                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7019                        / MAX_MULTIBYTE_LENGTH)
7020                       < to_nchars)
7021                     memory_full (SIZE_MAX);
7022                   dst = alloc_destination (coding,
7023                                            buf_end - buf
7024                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7025                                            dst);
7026                   if (EQ (coding->src_object, coding->dst_object))
7027                     {
7028                       coding_set_source (coding);
7029                       dst_end = (((unsigned char *) coding->source)
7030                                  + coding->consumed);
7031                     }
7032                   else
7033                     dst_end = coding->destination + coding->dst_bytes;
7034                 }
7035
7036               for (i = 0; i < to_nchars; i++)
7037                 {
7038                   if (i > 0)
7039                     c = XINT (AREF (trans, i));
7040                   if (coding->dst_multibyte
7041                       || ! CHAR_BYTE8_P (c))
7042                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7043                   else
7044                     *dst++ = CHAR_TO_BYTE8 (c);
7045                 }
7046               produced_chars += to_nchars;
7047               buf += from_nchars;
7048             }
7049           else
7050             /* This is an annotation datum.  (-C) is the length.  */
7051             buf += -c;
7052         }
7053       carryover = buf_end - buf;
7054     }
7055   else
7056     {
7057       /* Source characters are at coding->source.  */
7058       const unsigned char *src = coding->source;
7059       const unsigned char *src_end = src + coding->consumed;
7060
7061       if (EQ (coding->dst_object, coding->src_object))
7062         dst_end = (unsigned char *) src;
7063       if (coding->src_multibyte != coding->dst_multibyte)
7064         {
7065           if (coding->src_multibyte)
7066             {
7067               bool multibytep = 1;
7068               ptrdiff_t consumed_chars = 0;
7069
7070               while (1)
7071                 {
7072                   const unsigned char *src_base = src;
7073                   int c;
7074
7075                   ONE_MORE_BYTE (c);
7076                   if (dst == dst_end)
7077                     {
7078                       if (EQ (coding->src_object, coding->dst_object))
7079                         dst_end = (unsigned char *) src;
7080                       if (dst == dst_end)
7081                         {
7082                           ptrdiff_t offset = src - coding->source;
7083
7084                           dst = alloc_destination (coding, src_end - src + 1,
7085                                                    dst);
7086                           dst_end = coding->destination + coding->dst_bytes;
7087                           coding_set_source (coding);
7088                           src = coding->source + offset;
7089                           src_end = coding->source + coding->consumed;
7090                           if (EQ (coding->src_object, coding->dst_object))
7091                             dst_end = (unsigned char *) src;
7092                         }
7093                     }
7094                   *dst++ = c;
7095                   produced_chars++;
7096                 }
7097             no_more_source:
7098               ;
7099             }
7100           else
7101             while (src < src_end)
7102               {
7103                 bool multibytep = 1;
7104                 int c = *src++;
7105
7106                 if (dst >= dst_end - 1)
7107                   {
7108                     if (EQ (coding->src_object, coding->dst_object))
7109                       dst_end = (unsigned char *) src;
7110                     if (dst >= dst_end - 1)
7111                       {
7112                         ptrdiff_t offset = src - coding->source;
7113                         ptrdiff_t more_bytes;
7114
7115                         if (EQ (coding->src_object, coding->dst_object))
7116                           more_bytes = ((src_end - src) / 2) + 2;
7117                         else
7118                           more_bytes = src_end - src + 2;
7119                         dst = alloc_destination (coding, more_bytes, dst);
7120                         dst_end = coding->destination + coding->dst_bytes;
7121                         coding_set_source (coding);
7122                         src = coding->source + offset;
7123                         src_end = coding->source + coding->consumed;
7124                         if (EQ (coding->src_object, coding->dst_object))
7125                           dst_end = (unsigned char *) src;
7126                       }
7127                   }
7128                 EMIT_ONE_BYTE (c);
7129               }
7130         }
7131       else
7132         {
7133           if (!EQ (coding->src_object, coding->dst_object))
7134             {
7135               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7136
7137               if (require > 0)
7138                 {
7139                   ptrdiff_t offset = src - coding->source;
7140
7141                   dst = alloc_destination (coding, require, dst);
7142                   coding_set_source (coding);
7143                   src = coding->source + offset;
7144                   src_end = coding->source + coding->consumed;
7145                 }
7146             }
7147           produced_chars = coding->consumed_char;
7148           while (src < src_end)
7149             *dst++ = *src++;
7150         }
7151     }
7152
7153   produced = dst - (coding->destination + coding->produced);
7154   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7155     insert_from_gap (produced_chars, produced, 0);
7156   coding->produced += produced;
7157   coding->produced_char += produced_chars;
7158   return carryover;
7159 }
7160
7161 /* Compose text in CODING->object according to the annotation data at
7162    CHARBUF.  CHARBUF is an array:
7163      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7164  */
7165
7166 static void
7167 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7168 {
7169   int len;
7170   ptrdiff_t to;
7171   enum composition_method method;
7172   Lisp_Object components;
7173
7174   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7175   to = pos + charbuf[2];
7176   method = (enum composition_method) (charbuf[4]);
7177
7178   if (method == COMPOSITION_RELATIVE)
7179     components = Qnil;
7180   else
7181     {
7182       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7183       int i, j;
7184
7185       if (method == COMPOSITION_WITH_RULE)
7186         len = charbuf[2] * 3 - 2;
7187       charbuf += MAX_ANNOTATION_LENGTH;
7188       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7189       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7190         {
7191           if (charbuf[i] >= 0)
7192             args[j] = make_number (charbuf[i]);
7193           else
7194             {
7195               i++;
7196               args[j] = make_number (charbuf[i] % 0x100);
7197             }
7198         }
7199       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7200     }
7201   compose_text (pos, to, components, Qnil, coding->dst_object);
7202 }
7203
7204
7205 /* Put `charset' property on text in CODING->object according to
7206    the annotation data at CHARBUF.  CHARBUF is an array:
7207      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7208  */
7209
7210 static void
7211 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7212 {
7213   ptrdiff_t from = pos - charbuf[2];
7214   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7215
7216   Fput_text_property (make_number (from), make_number (pos),
7217                       Qcharset, CHARSET_NAME (charset),
7218                       coding->dst_object);
7219 }
7220
7221
7222 #define CHARBUF_SIZE 0x4000
7223
7224 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7225   do {                                                                  \
7226     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
7227     coding->charbuf_size = CHARBUF_SIZE;                                \
7228   } while (0)
7229
7230
7231 static void
7232 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7233 {
7234   int *charbuf = coding->charbuf;
7235   int *charbuf_end = charbuf + coding->charbuf_used;
7236
7237   if (NILP (coding->dst_object))
7238     return;
7239
7240   while (charbuf < charbuf_end)
7241     {
7242       if (*charbuf >= 0)
7243         pos++, charbuf++;
7244       else
7245         {
7246           int len = -*charbuf;
7247
7248           if (len > 2)
7249             switch (charbuf[1])
7250               {
7251               case CODING_ANNOTATE_COMPOSITION_MASK:
7252                 produce_composition (coding, charbuf, pos);
7253                 break;
7254               case CODING_ANNOTATE_CHARSET_MASK:
7255                 produce_charset (coding, charbuf, pos);
7256                 break;
7257               }
7258           charbuf += len;
7259         }
7260     }
7261 }
7262
7263 /* Decode the data at CODING->src_object into CODING->dst_object.
7264    CODING->src_object is a buffer, a string, or nil.
7265    CODING->dst_object is a buffer.
7266
7267    If CODING->src_object is a buffer, it must be the current buffer.
7268    In this case, if CODING->src_pos is positive, it is a position of
7269    the source text in the buffer, otherwise, the source text is in the
7270    gap area of the buffer, and CODING->src_pos specifies the offset of
7271    the text from GPT (which must be the same as PT).  If this is the
7272    same buffer as CODING->dst_object, CODING->src_pos must be
7273    negative.
7274
7275    If CODING->src_object is a string, CODING->src_pos is an index to
7276    that string.
7277
7278    If CODING->src_object is nil, CODING->source must already point to
7279    the non-relocatable memory area.  In this case, CODING->src_pos is
7280    an offset from CODING->source.
7281
7282    The decoded data is inserted at the current point of the buffer
7283    CODING->dst_object.
7284 */
7285
7286 static void
7287 decode_coding (struct coding_system *coding)
7288 {
7289   Lisp_Object attrs;
7290   Lisp_Object undo_list;
7291   Lisp_Object translation_table;
7292   struct ccl_spec cclspec;
7293   int carryover;
7294   int i;
7295
7296   USE_SAFE_ALLOCA;
7297
7298   if (BUFFERP (coding->src_object)
7299       && coding->src_pos > 0
7300       && coding->src_pos < GPT
7301       && coding->src_pos + coding->src_chars > GPT)
7302     move_gap_both (coding->src_pos, coding->src_pos_byte);
7303
7304   undo_list = Qt;
7305   if (BUFFERP (coding->dst_object))
7306     {
7307       set_buffer_internal (XBUFFER (coding->dst_object));
7308       if (GPT != PT)
7309         move_gap_both (PT, PT_BYTE);
7310
7311       /* We must disable undo_list in order to record the whole insert
7312          transaction via record_insert at the end.  But doing so also
7313          disables the recording of the first change to the undo_list.
7314          Therefore we check for first change here and record it via
7315          record_first_change if needed.  */
7316       if (MODIFF <= SAVE_MODIFF)
7317         record_first_change ();
7318
7319       undo_list = BVAR (current_buffer, undo_list);
7320       bset_undo_list (current_buffer, Qt);
7321     }
7322
7323   coding->consumed = coding->consumed_char = 0;
7324   coding->produced = coding->produced_char = 0;
7325   coding->chars_at_source = 0;
7326   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7327   coding->errors = 0;
7328
7329   ALLOC_CONVERSION_WORK_AREA (coding);
7330
7331   attrs = CODING_ID_ATTRS (coding->id);
7332   translation_table = get_translation_table (attrs, 0, NULL);
7333
7334   carryover = 0;
7335   if (coding->decoder == decode_coding_ccl)
7336     {
7337       coding->spec.ccl = &cclspec;
7338       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7339     }
7340   do
7341     {
7342       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7343
7344       coding_set_source (coding);
7345       coding->annotated = 0;
7346       coding->charbuf_used = carryover;
7347       (*(coding->decoder)) (coding);
7348       coding_set_destination (coding);
7349       carryover = produce_chars (coding, translation_table, 0);
7350       if (coding->annotated)
7351         produce_annotation (coding, pos);
7352       for (i = 0; i < carryover; i++)
7353         coding->charbuf[i]
7354           = coding->charbuf[coding->charbuf_used - carryover + i];
7355     }
7356   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7357          || (coding->consumed < coding->src_bytes
7358              && (coding->result == CODING_RESULT_SUCCESS
7359                  || coding->result == CODING_RESULT_INVALID_SRC)));
7360
7361   if (carryover > 0)
7362     {
7363       coding_set_destination (coding);
7364       coding->charbuf_used = carryover;
7365       produce_chars (coding, translation_table, 1);
7366     }
7367
7368   coding->carryover_bytes = 0;
7369   if (coding->consumed < coding->src_bytes)
7370     {
7371       int nbytes = coding->src_bytes - coding->consumed;
7372       const unsigned char *src;
7373
7374       coding_set_source (coding);
7375       coding_set_destination (coding);
7376       src = coding->source + coding->consumed;
7377
7378       if (coding->mode & CODING_MODE_LAST_BLOCK)
7379         {
7380           /* Flush out unprocessed data as binary chars.  We are sure
7381              that the number of data is less than the size of
7382              coding->charbuf.  */
7383           coding->charbuf_used = 0;
7384           coding->chars_at_source = 0;
7385
7386           while (nbytes-- > 0)
7387             {
7388               int c = *src++;
7389
7390               if (c & 0x80)
7391                 c = BYTE8_TO_CHAR (c);
7392               coding->charbuf[coding->charbuf_used++] = c;
7393             }
7394           produce_chars (coding, Qnil, 1);
7395         }
7396       else
7397         {
7398           /* Record unprocessed bytes in coding->carryover.  We are
7399              sure that the number of data is less than the size of
7400              coding->carryover.  */
7401           unsigned char *p = coding->carryover;
7402
7403           if (nbytes > sizeof coding->carryover)
7404             nbytes = sizeof coding->carryover;
7405           coding->carryover_bytes = nbytes;
7406           while (nbytes-- > 0)
7407             *p++ = *src++;
7408         }
7409       coding->consumed = coding->src_bytes;
7410     }
7411
7412   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7413       && !inhibit_eol_conversion)
7414     decode_eol (coding);
7415   if (BUFFERP (coding->dst_object))
7416     {
7417       bset_undo_list (current_buffer, undo_list);
7418       record_insert (coding->dst_pos, coding->produced_char);
7419     }
7420
7421   SAFE_FREE ();
7422 }
7423
7424
7425 /* Extract an annotation datum from a composition starting at POS and
7426    ending before LIMIT of CODING->src_object (buffer or string), store
7427    the data in BUF, set *STOP to a starting position of the next
7428    composition (if any) or to LIMIT, and return the address of the
7429    next element of BUF.
7430
7431    If such an annotation is not found, set *STOP to a starting
7432    position of a composition after POS (if any) or to LIMIT, and
7433    return BUF.  */
7434
7435 static int *
7436 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7437                                struct coding_system *coding, int *buf,
7438                                ptrdiff_t *stop)
7439 {
7440   ptrdiff_t start, end;
7441   Lisp_Object prop;
7442
7443   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7444       || end > limit)
7445     *stop = limit;
7446   else if (start > pos)
7447     *stop = start;
7448   else
7449     {
7450       if (start == pos)
7451         {
7452           /* We found a composition.  Store the corresponding
7453              annotation data in BUF.  */
7454           int *head = buf;
7455           enum composition_method method = COMPOSITION_METHOD (prop);
7456           int nchars = COMPOSITION_LENGTH (prop);
7457
7458           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7459           if (method != COMPOSITION_RELATIVE)
7460             {
7461               Lisp_Object components;
7462               ptrdiff_t i, len, i_byte;
7463
7464               components = COMPOSITION_COMPONENTS (prop);
7465               if (VECTORP (components))
7466                 {
7467                   len = ASIZE (components);
7468                   for (i = 0; i < len; i++)
7469                     *buf++ = XINT (AREF (components, i));
7470                 }
7471               else if (STRINGP (components))
7472                 {
7473                   len = SCHARS (components);
7474                   i = i_byte = 0;
7475                   while (i < len)
7476                     {
7477                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7478                       buf++;
7479                     }
7480                 }
7481               else if (INTEGERP (components))
7482                 {
7483                   len = 1;
7484                   *buf++ = XINT (components);
7485                 }
7486               else if (CONSP (components))
7487                 {
7488                   for (len = 0; CONSP (components);
7489                        len++, components = XCDR (components))
7490                     *buf++ = XINT (XCAR (components));
7491                 }
7492               else
7493                 emacs_abort ();
7494               *head -= len;
7495             }
7496         }
7497
7498       if (find_composition (end, limit, &start, &end, &prop,
7499                             coding->src_object)
7500           && end <= limit)
7501         *stop = start;
7502       else
7503         *stop = limit;
7504     }
7505   return buf;
7506 }
7507
7508
7509 /* Extract an annotation datum from a text property `charset' at POS of
7510    CODING->src_object (buffer of string), store the data in BUF, set
7511    *STOP to the position where the value of `charset' property changes
7512    (limiting by LIMIT), and return the address of the next element of
7513    BUF.
7514
7515    If the property value is nil, set *STOP to the position where the
7516    property value is non-nil (limiting by LIMIT), and return BUF.  */
7517
7518 static int *
7519 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7520                            struct coding_system *coding, int *buf,
7521                            ptrdiff_t *stop)
7522 {
7523   Lisp_Object val, next;
7524   int id;
7525
7526   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7527   if (! NILP (val) && CHARSETP (val))
7528     id = XINT (CHARSET_SYMBOL_ID (val));
7529   else
7530     id = -1;
7531   ADD_CHARSET_DATA (buf, 0, id);
7532   next = Fnext_single_property_change (make_number (pos), Qcharset,
7533                                        coding->src_object,
7534                                        make_number (limit));
7535   *stop = XINT (next);
7536   return buf;
7537 }
7538
7539
7540 static void
7541 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7542                int max_lookup)
7543 {
7544   int *buf = coding->charbuf;
7545   int *buf_end = coding->charbuf + coding->charbuf_size;
7546   const unsigned char *src = coding->source + coding->consumed;
7547   const unsigned char *src_end = coding->source + coding->src_bytes;
7548   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7549   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7550   bool multibytep = coding->src_multibyte;
7551   Lisp_Object eol_type;
7552   int c;
7553   ptrdiff_t stop, stop_composition, stop_charset;
7554   int *lookup_buf = NULL;
7555
7556   if (! NILP (translation_table))
7557     lookup_buf = alloca (sizeof (int) * max_lookup);
7558
7559   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7560   if (VECTORP (eol_type))
7561     eol_type = Qunix;
7562
7563   /* Note: composition handling is not yet implemented.  */
7564   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7565
7566   if (NILP (coding->src_object))
7567     stop = stop_composition = stop_charset = end_pos;
7568   else
7569     {
7570       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7571         stop = stop_composition = pos;
7572       else
7573         stop = stop_composition = end_pos;
7574       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7575         stop = stop_charset = pos;
7576       else
7577         stop_charset = end_pos;
7578     }
7579
7580   /* Compensate for CRLF and conversion.  */
7581   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7582   while (buf < buf_end)
7583     {
7584       Lisp_Object trans;
7585
7586       if (pos == stop)
7587         {
7588           if (pos == end_pos)
7589             break;
7590           if (pos == stop_composition)
7591             buf = handle_composition_annotation (pos, end_pos, coding,
7592                                                  buf, &stop_composition);
7593           if (pos == stop_charset)
7594             buf = handle_charset_annotation (pos, end_pos, coding,
7595                                              buf, &stop_charset);
7596           stop = (stop_composition < stop_charset
7597                   ? stop_composition : stop_charset);
7598         }
7599
7600       if (! multibytep)
7601         {
7602           int bytes;
7603
7604           if (coding->encoder == encode_coding_raw_text
7605               || coding->encoder == encode_coding_ccl)
7606             c = *src++, pos++;
7607           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7608             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7609           else
7610             c = BYTE8_TO_CHAR (*src), src++, pos++;
7611         }
7612       else
7613         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7614       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7615         c = '\n';
7616       if (! EQ (eol_type, Qunix))
7617         {
7618           if (c == '\n')
7619             {
7620               if (EQ (eol_type, Qdos))
7621                 *buf++ = '\r';
7622               else
7623                 c = '\r';
7624             }
7625         }
7626
7627       trans = Qnil;
7628       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7629       if (NILP (trans))
7630         *buf++ = c;
7631       else
7632         {
7633           ptrdiff_t from_nchars = 1, to_nchars = 1;
7634           int *lookup_buf_end;
7635           const unsigned char *p = src;
7636           int i;
7637
7638           lookup_buf[0] = c;
7639           for (i = 1; i < max_lookup && p < src_end; i++)
7640             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7641           lookup_buf_end = lookup_buf + i;
7642           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7643           if (INTEGERP (trans))
7644             c = XINT (trans);
7645           else if (CONSP (trans))
7646             {
7647               from_nchars = ASIZE (XCAR (trans));
7648               trans = XCDR (trans);
7649               if (INTEGERP (trans))
7650                 c = XINT (trans);
7651               else
7652                 {
7653                   to_nchars = ASIZE (trans);
7654                   if (buf_end - buf < to_nchars)
7655                     break;
7656                   c = XINT (AREF (trans, 0));
7657                 }
7658             }
7659           else
7660             break;
7661           *buf++ = c;
7662           for (i = 1; i < to_nchars; i++)
7663             *buf++ = XINT (AREF (trans, i));
7664           for (i = 1; i < from_nchars; i++, pos++)
7665             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7666         }
7667     }
7668
7669   coding->consumed = src - coding->source;
7670   coding->consumed_char = pos - coding->src_pos;
7671   coding->charbuf_used = buf - coding->charbuf;
7672   coding->chars_at_source = 0;
7673 }
7674
7675
7676 /* Encode the text at CODING->src_object into CODING->dst_object.
7677    CODING->src_object is a buffer or a string.
7678    CODING->dst_object is a buffer or nil.
7679
7680    If CODING->src_object is a buffer, it must be the current buffer.
7681    In this case, if CODING->src_pos is positive, it is a position of
7682    the source text in the buffer, otherwise. the source text is in the
7683    gap area of the buffer, and coding->src_pos specifies the offset of
7684    the text from GPT (which must be the same as PT).  If this is the
7685    same buffer as CODING->dst_object, CODING->src_pos must be
7686    negative and CODING should not have `pre-write-conversion'.
7687
7688    If CODING->src_object is a string, CODING should not have
7689    `pre-write-conversion'.
7690
7691    If CODING->dst_object is a buffer, the encoded data is inserted at
7692    the current point of that buffer.
7693
7694    If CODING->dst_object is nil, the encoded data is placed at the
7695    memory area specified by CODING->destination.  */
7696
7697 static void
7698 encode_coding (struct coding_system *coding)
7699 {
7700   Lisp_Object attrs;
7701   Lisp_Object translation_table;
7702   int max_lookup;
7703   struct ccl_spec cclspec;
7704
7705   USE_SAFE_ALLOCA;
7706
7707   attrs = CODING_ID_ATTRS (coding->id);
7708   if (coding->encoder == encode_coding_raw_text)
7709     translation_table = Qnil, max_lookup = 0;
7710   else
7711     translation_table = get_translation_table (attrs, 1, &max_lookup);
7712
7713   if (BUFFERP (coding->dst_object))
7714     {
7715       set_buffer_internal (XBUFFER (coding->dst_object));
7716       coding->dst_multibyte
7717         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7718     }
7719
7720   coding->consumed = coding->consumed_char = 0;
7721   coding->produced = coding->produced_char = 0;
7722   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7723   coding->errors = 0;
7724
7725   ALLOC_CONVERSION_WORK_AREA (coding);
7726
7727   if (coding->encoder == encode_coding_ccl)
7728     {
7729       coding->spec.ccl = &cclspec;
7730       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7731     }
7732   do {
7733     coding_set_source (coding);
7734     consume_chars (coding, translation_table, max_lookup);
7735     coding_set_destination (coding);
7736     (*(coding->encoder)) (coding);
7737   } while (coding->consumed_char < coding->src_chars);
7738
7739   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7740     insert_from_gap (coding->produced_char, coding->produced, 0);
7741
7742   SAFE_FREE ();
7743 }
7744
7745
7746 /* Name (or base name) of work buffer for code conversion.  */
7747 static Lisp_Object Vcode_conversion_workbuf_name;
7748
7749 /* A working buffer used by the top level conversion.  Once it is
7750    created, it is never destroyed.  It has the name
7751    Vcode_conversion_workbuf_name.  The other working buffers are
7752    destroyed after the use is finished, and their names are modified
7753    versions of Vcode_conversion_workbuf_name.  */
7754 static Lisp_Object Vcode_conversion_reused_workbuf;
7755
7756 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7757 static bool reused_workbuf_in_use;
7758
7759
7760 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7761    multibyteness of returning buffer.  */
7762
7763 static Lisp_Object
7764 make_conversion_work_buffer (bool multibyte)
7765 {
7766   Lisp_Object name, workbuf;
7767   struct buffer *current;
7768
7769   if (reused_workbuf_in_use)
7770     {
7771       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7772       workbuf = Fget_buffer_create (name);
7773     }
7774   else
7775     {
7776       reused_workbuf_in_use = 1;
7777       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7778         Vcode_conversion_reused_workbuf
7779           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7780       workbuf = Vcode_conversion_reused_workbuf;
7781     }
7782   current = current_buffer;
7783   set_buffer_internal (XBUFFER (workbuf));
7784   /* We can't allow modification hooks to run in the work buffer.  For
7785      instance, directory_files_internal assumes that file decoding
7786      doesn't compile new regexps.  */
7787   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7788   Ferase_buffer ();
7789   bset_undo_list (current_buffer, Qt);
7790   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7791   set_buffer_internal (current);
7792   return workbuf;
7793 }
7794
7795
7796 static Lisp_Object
7797 code_conversion_restore (Lisp_Object arg)
7798 {
7799   Lisp_Object current, workbuf;
7800   struct gcpro gcpro1;
7801
7802   GCPRO1 (arg);
7803   current = XCAR (arg);
7804   workbuf = XCDR (arg);
7805   if (! NILP (workbuf))
7806     {
7807       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7808         reused_workbuf_in_use = 0;
7809       else
7810         Fkill_buffer (workbuf);
7811     }
7812   set_buffer_internal (XBUFFER (current));
7813   UNGCPRO;
7814   return Qnil;
7815 }
7816
7817 Lisp_Object
7818 code_conversion_save (bool with_work_buf, bool multibyte)
7819 {
7820   Lisp_Object workbuf = Qnil;
7821
7822   if (with_work_buf)
7823     workbuf = make_conversion_work_buffer (multibyte);
7824   record_unwind_protect (code_conversion_restore,
7825                          Fcons (Fcurrent_buffer (), workbuf));
7826   return workbuf;
7827 }
7828
7829 void
7830 decode_coding_gap (struct coding_system *coding,
7831                    ptrdiff_t chars, ptrdiff_t bytes)
7832 {
7833   ptrdiff_t count = SPECPDL_INDEX ();
7834   Lisp_Object attrs;
7835
7836   coding->src_object = Fcurrent_buffer ();
7837   coding->src_chars = chars;
7838   coding->src_bytes = bytes;
7839   coding->src_pos = -chars;
7840   coding->src_pos_byte = -bytes;
7841   coding->src_multibyte = chars < bytes;
7842   coding->dst_object = coding->src_object;
7843   coding->dst_pos = PT;
7844   coding->dst_pos_byte = PT_BYTE;
7845   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7846
7847   coding->head_ascii = -1;
7848   coding->detected_utf8_chars = -1;
7849   coding->eol_seen = EOL_SEEN_NONE;
7850   if (CODING_REQUIRE_DETECTION (coding))
7851     detect_coding (coding);
7852   attrs = CODING_ID_ATTRS (coding->id);
7853   if (! disable_ascii_optimization
7854       && ! coding->src_multibyte
7855       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7856       && NILP (CODING_ATTR_POST_READ (attrs))
7857       && NILP (get_translation_table (attrs, 0, NULL)))
7858     {
7859       chars = coding->head_ascii;
7860       if (chars < 0)
7861         chars = check_ascii (coding);
7862       if (chars != bytes)
7863         {
7864           /* There exists a non-ASCII byte.  */
7865           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7866             {
7867               if (coding->detected_utf8_chars >= 0)
7868                 chars = coding->detected_utf8_chars;
7869               else
7870                 chars = check_utf_8 (coding);
7871               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7872                   && coding->head_ascii == 0
7873                   && coding->source[0] == UTF_8_BOM_1
7874                   && coding->source[1] == UTF_8_BOM_2
7875                   && coding->source[2] == UTF_8_BOM_3)
7876                 {
7877                   chars--;
7878                   bytes -= 3;
7879                   coding->src_bytes -= 3;
7880                 }
7881             }
7882           else
7883             chars = -1;
7884         }
7885       if (chars >= 0)
7886         {
7887           Lisp_Object eol_type;
7888
7889           eol_type = CODING_ID_EOL_TYPE (coding->id);
7890           if (VECTORP (eol_type))
7891             {
7892               if (coding->eol_seen != EOL_SEEN_NONE)
7893                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7894             }
7895           if (EQ (eol_type, Qmac))
7896             {
7897               unsigned char *src_end = GAP_END_ADDR;
7898               unsigned char *src = src_end - coding->src_bytes;
7899
7900               while (src < src_end)
7901                 {
7902                   if (*src++ == '\r')
7903                     src[-1] = '\n';
7904                 }
7905             }
7906           else if (EQ (eol_type, Qdos))
7907             {
7908               unsigned char *src = GAP_END_ADDR;
7909               unsigned char *src_beg = src - coding->src_bytes;
7910               unsigned char *dst = src;
7911               ptrdiff_t diff;
7912
7913               while (src_beg < src)
7914                 {
7915                   *--dst = *--src;
7916                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7917                     src--;
7918                 }
7919               diff = dst - src;
7920               bytes -= diff;
7921               chars -= diff;
7922             }
7923           coding->produced = bytes;
7924           coding->produced_char = chars;
7925           insert_from_gap (chars, bytes, 1);
7926           return;
7927         }
7928     }
7929   code_conversion_save (0, 0);
7930
7931   coding->mode |= CODING_MODE_LAST_BLOCK;
7932   current_buffer->text->inhibit_shrinking = 1;
7933   decode_coding (coding);
7934   current_buffer->text->inhibit_shrinking = 0;
7935
7936   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7937     {
7938       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7939       Lisp_Object val;
7940
7941       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7942       val = call1 (CODING_ATTR_POST_READ (attrs),
7943                    make_number (coding->produced_char));
7944       CHECK_NATNUM (val);
7945       coding->produced_char += Z - prev_Z;
7946       coding->produced += Z_BYTE - prev_Z_BYTE;
7947     }
7948
7949   unbind_to (count, Qnil);
7950 }
7951
7952
7953 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7954    SRC_OBJECT into DST_OBJECT by coding context CODING.
7955
7956    SRC_OBJECT is a buffer, a string, or Qnil.
7957
7958    If it is a buffer, the text is at point of the buffer.  FROM and TO
7959    are positions in the buffer.
7960
7961    If it is a string, the text is at the beginning of the string.
7962    FROM and TO are indices to the string.
7963
7964    If it is nil, the text is at coding->source.  FROM and TO are
7965    indices to coding->source.
7966
7967    DST_OBJECT is a buffer, Qt, or Qnil.
7968
7969    If it is a buffer, the decoded text is inserted at point of the
7970    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7971    is deleted.
7972
7973    If it is Qt, a string is made from the decoded text, and
7974    set in CODING->dst_object.
7975
7976    If it is Qnil, the decoded text is stored at CODING->destination.
7977    The caller must allocate CODING->dst_bytes bytes at
7978    CODING->destination by xmalloc.  If the decoded text is longer than
7979    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7980  */
7981
7982 void
7983 decode_coding_object (struct coding_system *coding,
7984                       Lisp_Object src_object,
7985                       ptrdiff_t from, ptrdiff_t from_byte,
7986                       ptrdiff_t to, ptrdiff_t to_byte,
7987                       Lisp_Object dst_object)
7988 {
7989   ptrdiff_t count = SPECPDL_INDEX ();
7990   unsigned char *destination IF_LINT (= NULL);
7991   ptrdiff_t dst_bytes IF_LINT (= 0);
7992   ptrdiff_t chars = to - from;
7993   ptrdiff_t bytes = to_byte - from_byte;
7994   Lisp_Object attrs;
7995   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7996   bool need_marker_adjustment = 0;
7997   Lisp_Object old_deactivate_mark;
7998
7999   old_deactivate_mark = Vdeactivate_mark;
8000
8001   if (NILP (dst_object))
8002     {
8003       destination = coding->destination;
8004       dst_bytes = coding->dst_bytes;
8005     }
8006
8007   coding->src_object = src_object;
8008   coding->src_chars = chars;
8009   coding->src_bytes = bytes;
8010   coding->src_multibyte = chars < bytes;
8011
8012   if (STRINGP (src_object))
8013     {
8014       coding->src_pos = from;
8015       coding->src_pos_byte = from_byte;
8016     }
8017   else if (BUFFERP (src_object))
8018     {
8019       set_buffer_internal (XBUFFER (src_object));
8020       if (from != GPT)
8021         move_gap_both (from, from_byte);
8022       if (EQ (src_object, dst_object))
8023         {
8024           struct Lisp_Marker *tail;
8025
8026           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8027             {
8028               tail->need_adjustment
8029                 = tail->charpos == (tail->insertion_type ? from : to);
8030               need_marker_adjustment |= tail->need_adjustment;
8031             }
8032           saved_pt = PT, saved_pt_byte = PT_BYTE;
8033           TEMP_SET_PT_BOTH (from, from_byte);
8034           current_buffer->text->inhibit_shrinking = 1;
8035           del_range_both (from, from_byte, to, to_byte, 1);
8036           coding->src_pos = -chars;
8037           coding->src_pos_byte = -bytes;
8038         }
8039       else
8040         {
8041           coding->src_pos = from;
8042           coding->src_pos_byte = from_byte;
8043         }
8044     }
8045
8046   if (CODING_REQUIRE_DETECTION (coding))
8047     detect_coding (coding);
8048   attrs = CODING_ID_ATTRS (coding->id);
8049
8050   if (EQ (dst_object, Qt)
8051       || (! NILP (CODING_ATTR_POST_READ (attrs))
8052           && NILP (dst_object)))
8053     {
8054       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8055       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8056       coding->dst_pos = BEG;
8057       coding->dst_pos_byte = BEG_BYTE;
8058     }
8059   else if (BUFFERP (dst_object))
8060     {
8061       code_conversion_save (0, 0);
8062       coding->dst_object = dst_object;
8063       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8064       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8065       coding->dst_multibyte
8066         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8067     }
8068   else
8069     {
8070       code_conversion_save (0, 0);
8071       coding->dst_object = Qnil;
8072       /* Most callers presume this will return a multibyte result, and they
8073          won't use `binary' or `raw-text' anyway, so let's not worry about
8074          CODING_FOR_UNIBYTE.  */
8075       coding->dst_multibyte = 1;
8076     }
8077
8078   decode_coding (coding);
8079
8080   if (BUFFERP (coding->dst_object))
8081     set_buffer_internal (XBUFFER (coding->dst_object));
8082
8083   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8084     {
8085       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8086       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8087       Lisp_Object val;
8088
8089       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8090       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8091               old_deactivate_mark);
8092       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8093                         make_number (coding->produced_char));
8094       UNGCPRO;
8095       CHECK_NATNUM (val);
8096       coding->produced_char += Z - prev_Z;
8097       coding->produced += Z_BYTE - prev_Z_BYTE;
8098     }
8099
8100   if (EQ (dst_object, Qt))
8101     {
8102       coding->dst_object = Fbuffer_string ();
8103     }
8104   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8105     {
8106       set_buffer_internal (XBUFFER (coding->dst_object));
8107       if (dst_bytes < coding->produced)
8108         {
8109           eassert (coding->produced > 0);
8110           destination = xrealloc (destination, coding->produced);
8111           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8112             move_gap_both (BEGV, BEGV_BYTE);
8113           memcpy (destination, BEGV_ADDR, coding->produced);
8114           coding->destination = destination;
8115         }
8116     }
8117
8118   if (saved_pt >= 0)
8119     {
8120       /* This is the case of:
8121          (BUFFERP (src_object) && EQ (src_object, dst_object))
8122          As we have moved PT while replacing the original buffer
8123          contents, we must recover it now.  */
8124       set_buffer_internal (XBUFFER (src_object));
8125       current_buffer->text->inhibit_shrinking = 0;
8126       if (saved_pt < from)
8127         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8128       else if (saved_pt < from + chars)
8129         TEMP_SET_PT_BOTH (from, from_byte);
8130       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8131         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8132                           saved_pt_byte + (coding->produced - bytes));
8133       else
8134         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8135                           saved_pt_byte + (coding->produced - bytes));
8136
8137       if (need_marker_adjustment)
8138         {
8139           struct Lisp_Marker *tail;
8140
8141           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8142             if (tail->need_adjustment)
8143               {
8144                 tail->need_adjustment = 0;
8145                 if (tail->insertion_type)
8146                   {
8147                     tail->bytepos = from_byte;
8148                     tail->charpos = from;
8149                   }
8150                 else
8151                   {
8152                     tail->bytepos = from_byte + coding->produced;
8153                     tail->charpos
8154                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8155                          ? tail->bytepos : from + coding->produced_char);
8156                   }
8157               }
8158         }
8159     }
8160
8161   Vdeactivate_mark = old_deactivate_mark;
8162   unbind_to (count, coding->dst_object);
8163 }
8164
8165
8166 void
8167 encode_coding_object (struct coding_system *coding,
8168                       Lisp_Object src_object,
8169                       ptrdiff_t from, ptrdiff_t from_byte,
8170                       ptrdiff_t to, ptrdiff_t to_byte,
8171                       Lisp_Object dst_object)
8172 {
8173   ptrdiff_t count = SPECPDL_INDEX ();
8174   ptrdiff_t chars = to - from;
8175   ptrdiff_t bytes = to_byte - from_byte;
8176   Lisp_Object attrs;
8177   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8178   bool need_marker_adjustment = 0;
8179   bool kill_src_buffer = 0;
8180   Lisp_Object old_deactivate_mark;
8181
8182   old_deactivate_mark = Vdeactivate_mark;
8183
8184   coding->src_object = src_object;
8185   coding->src_chars = chars;
8186   coding->src_bytes = bytes;
8187   coding->src_multibyte = chars < bytes;
8188
8189   attrs = CODING_ID_ATTRS (coding->id);
8190
8191   if (EQ (src_object, dst_object))
8192     {
8193       struct Lisp_Marker *tail;
8194
8195       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8196         {
8197           tail->need_adjustment
8198             = tail->charpos == (tail->insertion_type ? from : to);
8199           need_marker_adjustment |= tail->need_adjustment;
8200         }
8201     }
8202
8203   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8204     {
8205       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8206       set_buffer_internal (XBUFFER (coding->src_object));
8207       if (STRINGP (src_object))
8208         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8209       else if (BUFFERP (src_object))
8210         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8211       else
8212         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8213
8214       if (EQ (src_object, dst_object))
8215         {
8216           set_buffer_internal (XBUFFER (src_object));
8217           saved_pt = PT, saved_pt_byte = PT_BYTE;
8218           del_range_both (from, from_byte, to, to_byte, 1);
8219           set_buffer_internal (XBUFFER (coding->src_object));
8220         }
8221
8222       {
8223         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8224
8225         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8226                 old_deactivate_mark);
8227         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8228                     make_number (BEG), make_number (Z));
8229         UNGCPRO;
8230       }
8231       if (XBUFFER (coding->src_object) != current_buffer)
8232         kill_src_buffer = 1;
8233       coding->src_object = Fcurrent_buffer ();
8234       if (BEG != GPT)
8235         move_gap_both (BEG, BEG_BYTE);
8236       coding->src_chars = Z - BEG;
8237       coding->src_bytes = Z_BYTE - BEG_BYTE;
8238       coding->src_pos = BEG;
8239       coding->src_pos_byte = BEG_BYTE;
8240       coding->src_multibyte = Z < Z_BYTE;
8241     }
8242   else if (STRINGP (src_object))
8243     {
8244       code_conversion_save (0, 0);
8245       coding->src_pos = from;
8246       coding->src_pos_byte = from_byte;
8247     }
8248   else if (BUFFERP (src_object))
8249     {
8250       code_conversion_save (0, 0);
8251       set_buffer_internal (XBUFFER (src_object));
8252       if (EQ (src_object, dst_object))
8253         {
8254           saved_pt = PT, saved_pt_byte = PT_BYTE;
8255           coding->src_object = del_range_1 (from, to, 1, 1);
8256           coding->src_pos = 0;
8257           coding->src_pos_byte = 0;
8258         }
8259       else
8260         {
8261           if (from < GPT && to >= GPT)
8262             move_gap_both (from, from_byte);
8263           coding->src_pos = from;
8264           coding->src_pos_byte = from_byte;
8265         }
8266     }
8267   else
8268     code_conversion_save (0, 0);
8269
8270   if (BUFFERP (dst_object))
8271     {
8272       coding->dst_object = dst_object;
8273       if (EQ (src_object, dst_object))
8274         {
8275           coding->dst_pos = from;
8276           coding->dst_pos_byte = from_byte;
8277         }
8278       else
8279         {
8280           struct buffer *current = current_buffer;
8281
8282           set_buffer_temp (XBUFFER (dst_object));
8283           coding->dst_pos = PT;
8284           coding->dst_pos_byte = PT_BYTE;
8285           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8286           set_buffer_temp (current);
8287         }
8288       coding->dst_multibyte
8289         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8290     }
8291   else if (EQ (dst_object, Qt))
8292     {
8293       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8294       coding->dst_object = Qnil;
8295       coding->destination = xmalloc (dst_bytes);
8296       coding->dst_bytes = dst_bytes;
8297       coding->dst_multibyte = 0;
8298     }
8299   else
8300     {
8301       coding->dst_object = Qnil;
8302       coding->dst_multibyte = 0;
8303     }
8304
8305   encode_coding (coding);
8306
8307   if (EQ (dst_object, Qt))
8308     {
8309       if (BUFFERP (coding->dst_object))
8310         coding->dst_object = Fbuffer_string ();
8311       else
8312         {
8313           coding->dst_object
8314             = make_unibyte_string ((char *) coding->destination,
8315                                    coding->produced);
8316           xfree (coding->destination);
8317         }
8318     }
8319
8320   if (saved_pt >= 0)
8321     {
8322       /* This is the case of:
8323          (BUFFERP (src_object) && EQ (src_object, dst_object))
8324          As we have moved PT while replacing the original buffer
8325          contents, we must recover it now.  */
8326       set_buffer_internal (XBUFFER (src_object));
8327       if (saved_pt < from)
8328         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8329       else if (saved_pt < from + chars)
8330         TEMP_SET_PT_BOTH (from, from_byte);
8331       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8332         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8333                           saved_pt_byte + (coding->produced - bytes));
8334       else
8335         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8336                           saved_pt_byte + (coding->produced - bytes));
8337
8338       if (need_marker_adjustment)
8339         {
8340           struct Lisp_Marker *tail;
8341
8342           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8343             if (tail->need_adjustment)
8344               {
8345                 tail->need_adjustment = 0;
8346                 if (tail->insertion_type)
8347                   {
8348                     tail->bytepos = from_byte;
8349                     tail->charpos = from;
8350                   }
8351                 else
8352                   {
8353                     tail->bytepos = from_byte + coding->produced;
8354                     tail->charpos
8355                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8356                          ? tail->bytepos : from + coding->produced_char);
8357                   }
8358               }
8359         }
8360     }
8361
8362   if (kill_src_buffer)
8363     Fkill_buffer (coding->src_object);
8364
8365   Vdeactivate_mark = old_deactivate_mark;
8366   unbind_to (count, Qnil);
8367 }
8368
8369
8370 Lisp_Object
8371 preferred_coding_system (void)
8372 {
8373   int id = coding_categories[coding_priorities[0]].id;
8374
8375   return CODING_ID_NAME (id);
8376 }
8377
8378 #if defined (WINDOWSNT) || defined (CYGWIN)
8379
8380 Lisp_Object
8381 from_unicode (Lisp_Object str)
8382 {
8383   CHECK_STRING (str);
8384   if (!STRING_MULTIBYTE (str) &&
8385       SBYTES (str) & 1)
8386     {
8387       str = Fsubstring (str, make_number (0), make_number (-1));
8388     }
8389
8390   return code_convert_string_norecord (str, Qutf_16le, 0);
8391 }
8392
8393 Lisp_Object
8394 from_unicode_buffer (const wchar_t* wstr)
8395 {
8396     return from_unicode (
8397         make_unibyte_string (
8398             (char*) wstr,
8399             /* we get one of the two final 0 bytes for free. */
8400             1 + sizeof (wchar_t) * wcslen (wstr)));
8401 }
8402
8403 wchar_t *
8404 to_unicode (Lisp_Object str, Lisp_Object *buf)
8405 {
8406   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8407   /* We need to make another copy (in addition to the one made by
8408      code_convert_string_norecord) to ensure that the final string is
8409      _doubly_ zero terminated --- that is, that the string is
8410      terminated by two zero bytes and one utf-16le null character.
8411      Because strings are already terminated with a single zero byte,
8412      we just add one additional zero. */
8413   str = make_uninit_string (SBYTES (*buf) + 1);
8414   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8415   SDATA (str) [SBYTES (*buf)] = '\0';
8416   *buf = str;
8417   return WCSDATA (*buf);
8418 }
8419
8420 #endif /* WINDOWSNT || CYGWIN */
8421
8422 \f
8423 #ifdef emacs
8424 /*** 8. Emacs Lisp library functions ***/
8425
8426 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8427        doc: /* Return t if OBJECT is nil or a coding-system.
8428 See the documentation of `define-coding-system' for information
8429 about coding-system objects.  */)
8430   (Lisp_Object object)
8431 {
8432   if (NILP (object)
8433       || CODING_SYSTEM_ID (object) >= 0)
8434     return Qt;
8435   if (! SYMBOLP (object)
8436       || NILP (Fget (object, Qcoding_system_define_form)))
8437     return Qnil;
8438   return Qt;
8439 }
8440
8441 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8442        Sread_non_nil_coding_system, 1, 1, 0,
8443        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8444   (Lisp_Object prompt)
8445 {
8446   Lisp_Object val;
8447   do
8448     {
8449       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8450                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8451     }
8452   while (SCHARS (val) == 0);
8453   return (Fintern (val, Qnil));
8454 }
8455
8456 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8457        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8458 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8459 Ignores case when completing coding systems (all Emacs coding systems
8460 are lower-case).  */)
8461   (Lisp_Object prompt, Lisp_Object default_coding_system)
8462 {
8463   Lisp_Object val;
8464   ptrdiff_t count = SPECPDL_INDEX ();
8465
8466   if (SYMBOLP (default_coding_system))
8467     default_coding_system = SYMBOL_NAME (default_coding_system);
8468   specbind (Qcompletion_ignore_case, Qt);
8469   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8470                           Qt, Qnil, Qcoding_system_history,
8471                           default_coding_system, Qnil);
8472   unbind_to (count, Qnil);
8473   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8474 }
8475
8476 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8477        1, 1, 0,
8478        doc: /* Check validity of CODING-SYSTEM.
8479 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8480 It is valid if it is nil or a symbol defined as a coding system by the
8481 function `define-coding-system'.  */)
8482   (Lisp_Object coding_system)
8483 {
8484   Lisp_Object define_form;
8485
8486   define_form = Fget (coding_system, Qcoding_system_define_form);
8487   if (! NILP (define_form))
8488     {
8489       Fput (coding_system, Qcoding_system_define_form, Qnil);
8490       safe_eval (define_form);
8491     }
8492   if (!NILP (Fcoding_system_p (coding_system)))
8493     return coding_system;
8494   xsignal1 (Qcoding_system_error, coding_system);
8495 }
8496
8497 \f
8498 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8499    HIGHEST, return the coding system of the highest
8500    priority among the detected coding systems.  Otherwise return a
8501    list of detected coding systems sorted by their priorities.  If
8502    MULTIBYTEP, it is assumed that the bytes are in correct
8503    multibyte form but contains only ASCII and eight-bit chars.
8504    Otherwise, the bytes are raw bytes.
8505
8506    CODING-SYSTEM controls the detection as below:
8507
8508    If it is nil, detect both text-format and eol-format.  If the
8509    text-format part of CODING-SYSTEM is already specified
8510    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8511    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8512    detect only text-format.  */
8513
8514 Lisp_Object
8515 detect_coding_system (const unsigned char *src,
8516                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8517                       bool highest, bool multibytep,
8518                       Lisp_Object coding_system)
8519 {
8520   const unsigned char *src_end = src + src_bytes;
8521   Lisp_Object attrs, eol_type;
8522   Lisp_Object val = Qnil;
8523   struct coding_system coding;
8524   ptrdiff_t id;
8525   struct coding_detection_info detect_info;
8526   enum coding_category base_category;
8527   bool null_byte_found = 0, eight_bit_found = 0;
8528
8529   if (NILP (coding_system))
8530     coding_system = Qundecided;
8531   setup_coding_system (coding_system, &coding);
8532   attrs = CODING_ID_ATTRS (coding.id);
8533   eol_type = CODING_ID_EOL_TYPE (coding.id);
8534   coding_system = CODING_ATTR_BASE_NAME (attrs);
8535
8536   coding.source = src;
8537   coding.src_chars = src_chars;
8538   coding.src_bytes = src_bytes;
8539   coding.src_multibyte = multibytep;
8540   coding.consumed = 0;
8541   coding.mode |= CODING_MODE_LAST_BLOCK;
8542   coding.head_ascii = 0;
8543
8544   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8545
8546   /* At first, detect text-format if necessary.  */
8547   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8548   if (base_category == coding_category_undecided)
8549     {
8550       enum coding_category category IF_LINT (= 0);
8551       struct coding_system *this IF_LINT (= NULL);
8552       int c, i;
8553       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8554                                        inhibit_null_byte_detection);
8555       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8556                                        inhibit_iso_escape_detection);
8557       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8558
8559       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8560       for (; src < src_end; src++)
8561         {
8562           c = *src;
8563           if (c & 0x80)
8564             {
8565               eight_bit_found = 1;
8566               if (null_byte_found)
8567                 break;
8568             }
8569           else if (c < 0x20)
8570             {
8571               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8572                   && ! inhibit_ied
8573                   && ! detect_info.checked)
8574                 {
8575                   if (detect_coding_iso_2022 (&coding, &detect_info))
8576                     {
8577                       /* We have scanned the whole data.  */
8578                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8579                         {
8580                           /* We didn't find an 8-bit code.  We may
8581                              have found a null-byte, but it's very
8582                              rare that a binary file confirm to
8583                              ISO-2022.  */
8584                           src = src_end;
8585                           coding.head_ascii = src - coding.source;
8586                         }
8587                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8588                       break;
8589                     }
8590                 }
8591               else if (! c && !inhibit_nbd)
8592                 {
8593                   null_byte_found = 1;
8594                   if (eight_bit_found)
8595                     break;
8596                 }
8597               if (! eight_bit_found)
8598                 coding.head_ascii++;
8599             }
8600           else if (! eight_bit_found)
8601             coding.head_ascii++;
8602         }
8603
8604       if (null_byte_found || eight_bit_found
8605           || coding.head_ascii < coding.src_bytes
8606           || detect_info.found)
8607         {
8608           if (coding.head_ascii == coding.src_bytes)
8609             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8610             for (i = 0; i < coding_category_raw_text; i++)
8611               {
8612                 category = coding_priorities[i];
8613                 this = coding_categories + category;
8614                 if (detect_info.found & (1 << category))
8615                   break;
8616               }
8617           else
8618             {
8619               if (null_byte_found)
8620                 {
8621                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8622                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8623                 }
8624               else if (prefer_utf_8
8625                        && detect_coding_utf_8 (&coding, &detect_info))
8626                 {
8627                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8628                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8629                 }
8630               for (i = 0; i < coding_category_raw_text; i++)
8631                 {
8632                   category = coding_priorities[i];
8633                   this = coding_categories + category;
8634
8635                   if (this->id < 0)
8636                     {
8637                       /* No coding system of this category is defined.  */
8638                       detect_info.rejected |= (1 << category);
8639                     }
8640                   else if (category >= coding_category_raw_text)
8641                     continue;
8642                   else if (detect_info.checked & (1 << category))
8643                     {
8644                       if (highest
8645                           && (detect_info.found & (1 << category)))
8646                         break;
8647                     }
8648                   else if ((*(this->detector)) (&coding, &detect_info)
8649                            && highest
8650                            && (detect_info.found & (1 << category)))
8651                     {
8652                       if (category == coding_category_utf_16_auto)
8653                         {
8654                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8655                             category = coding_category_utf_16_le;
8656                           else
8657                             category = coding_category_utf_16_be;
8658                         }
8659                       break;
8660                     }
8661                 }
8662             }
8663         }
8664
8665       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8666           || null_byte_found)
8667         {
8668           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8669           id = CODING_SYSTEM_ID (Qno_conversion);
8670           val = Fcons (make_number (id), Qnil);
8671         }
8672       else if (! detect_info.rejected && ! detect_info.found)
8673         {
8674           detect_info.found = CATEGORY_MASK_ANY;
8675           id = coding_categories[coding_category_undecided].id;
8676           val = Fcons (make_number (id), Qnil);
8677         }
8678       else if (highest)
8679         {
8680           if (detect_info.found)
8681             {
8682               detect_info.found = 1 << category;
8683               val = Fcons (make_number (this->id), Qnil);
8684             }
8685           else
8686             for (i = 0; i < coding_category_raw_text; i++)
8687               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8688                 {
8689                   detect_info.found = 1 << coding_priorities[i];
8690                   id = coding_categories[coding_priorities[i]].id;
8691                   val = Fcons (make_number (id), Qnil);
8692                   break;
8693                 }
8694         }
8695       else
8696         {
8697           int mask = detect_info.rejected | detect_info.found;
8698           int found = 0;
8699
8700           for (i = coding_category_raw_text - 1; i >= 0; i--)
8701             {
8702               category = coding_priorities[i];
8703               if (! (mask & (1 << category)))
8704                 {
8705                   found |= 1 << category;
8706                   id = coding_categories[category].id;
8707                   if (id >= 0)
8708                     val = Fcons (make_number (id), val);
8709                 }
8710             }
8711           for (i = coding_category_raw_text - 1; i >= 0; i--)
8712             {
8713               category = coding_priorities[i];
8714               if (detect_info.found & (1 << category))
8715                 {
8716                   id = coding_categories[category].id;
8717                   val = Fcons (make_number (id), val);
8718                 }
8719             }
8720           detect_info.found |= found;
8721         }
8722     }
8723   else if (base_category == coding_category_utf_8_auto)
8724     {
8725       if (detect_coding_utf_8 (&coding, &detect_info))
8726         {
8727           struct coding_system *this;
8728
8729           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8730             this = coding_categories + coding_category_utf_8_sig;
8731           else
8732             this = coding_categories + coding_category_utf_8_nosig;
8733           val = Fcons (make_number (this->id), Qnil);
8734         }
8735     }
8736   else if (base_category == coding_category_utf_16_auto)
8737     {
8738       if (detect_coding_utf_16 (&coding, &detect_info))
8739         {
8740           struct coding_system *this;
8741
8742           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8743             this = coding_categories + coding_category_utf_16_le;
8744           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8745             this = coding_categories + coding_category_utf_16_be;
8746           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8747             this = coding_categories + coding_category_utf_16_be_nosig;
8748           else
8749             this = coding_categories + coding_category_utf_16_le_nosig;
8750           val = Fcons (make_number (this->id), Qnil);
8751         }
8752     }
8753   else
8754     {
8755       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8756       val = Fcons (make_number (coding.id), Qnil);
8757     }
8758
8759   /* Then, detect eol-format if necessary.  */
8760   {
8761     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8762     Lisp_Object tail;
8763
8764     if (VECTORP (eol_type))
8765       {
8766         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8767           {
8768             if (null_byte_found)
8769               normal_eol = EOL_SEEN_LF;
8770             else
8771               normal_eol = detect_eol (coding.source, src_bytes,
8772                                        coding_category_raw_text);
8773           }
8774         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8775                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8776           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8777                                       coding_category_utf_16_be);
8778         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8779                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8780           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8781                                       coding_category_utf_16_le);
8782       }
8783     else
8784       {
8785         if (EQ (eol_type, Qunix))
8786           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8787         else if (EQ (eol_type, Qdos))
8788           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8789         else
8790           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8791       }
8792
8793     for (tail = val; CONSP (tail); tail = XCDR (tail))
8794       {
8795         enum coding_category category;
8796         int this_eol;
8797
8798         id = XINT (XCAR (tail));
8799         attrs = CODING_ID_ATTRS (id);
8800         category = XINT (CODING_ATTR_CATEGORY (attrs));
8801         eol_type = CODING_ID_EOL_TYPE (id);
8802         if (VECTORP (eol_type))
8803           {
8804             if (category == coding_category_utf_16_be
8805                 || category == coding_category_utf_16_be_nosig)
8806               this_eol = utf_16_be_eol;
8807             else if (category == coding_category_utf_16_le
8808                      || category == coding_category_utf_16_le_nosig)
8809               this_eol = utf_16_le_eol;
8810             else
8811               this_eol = normal_eol;
8812
8813             if (this_eol == EOL_SEEN_LF)
8814               XSETCAR (tail, AREF (eol_type, 0));
8815             else if (this_eol == EOL_SEEN_CRLF)
8816               XSETCAR (tail, AREF (eol_type, 1));
8817             else if (this_eol == EOL_SEEN_CR)
8818               XSETCAR (tail, AREF (eol_type, 2));
8819             else
8820               XSETCAR (tail, CODING_ID_NAME (id));
8821           }
8822         else
8823           XSETCAR (tail, CODING_ID_NAME (id));
8824       }
8825   }
8826
8827   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8828 }
8829
8830
8831 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8832        2, 3, 0,
8833        doc: /* Detect coding system of the text in the region between START and END.
8834 Return a list of possible coding systems ordered by priority.
8835 The coding systems to try and their priorities follows what
8836 the function `coding-system-priority-list' (which see) returns.
8837
8838 If only ASCII characters are found (except for such ISO-2022 control
8839 characters as ESC), it returns a list of single element `undecided'
8840 or its subsidiary coding system according to a detected end-of-line
8841 format.
8842
8843 If optional argument HIGHEST is non-nil, return the coding system of
8844 highest priority.  */)
8845   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8846 {
8847   ptrdiff_t from, to;
8848   ptrdiff_t from_byte, to_byte;
8849
8850   validate_region (&start, &end);
8851   from = XINT (start), to = XINT (end);
8852   from_byte = CHAR_TO_BYTE (from);
8853   to_byte = CHAR_TO_BYTE (to);
8854
8855   if (from < GPT && to >= GPT)
8856     move_gap_both (to, to_byte);
8857
8858   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8859                                to - from, to_byte - from_byte,
8860                                !NILP (highest),
8861                                !NILP (BVAR (current_buffer
8862                                       , enable_multibyte_characters)),
8863                                Qnil);
8864 }
8865
8866 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8867        1, 2, 0,
8868        doc: /* Detect coding system of the text in STRING.
8869 Return a list of possible coding systems ordered by priority.
8870 The coding systems to try and their priorities follows what
8871 the function `coding-system-priority-list' (which see) returns.
8872
8873 If only ASCII characters are found (except for such ISO-2022 control
8874 characters as ESC), it returns a list of single element `undecided'
8875 or its subsidiary coding system according to a detected end-of-line
8876 format.
8877
8878 If optional argument HIGHEST is non-nil, return the coding system of
8879 highest priority.  */)
8880   (Lisp_Object string, Lisp_Object highest)
8881 {
8882   CHECK_STRING (string);
8883
8884   return detect_coding_system (SDATA (string),
8885                                SCHARS (string), SBYTES (string),
8886                                !NILP (highest), STRING_MULTIBYTE (string),
8887                                Qnil);
8888 }
8889
8890
8891 static bool
8892 char_encodable_p (int c, Lisp_Object attrs)
8893 {
8894   Lisp_Object tail;
8895   struct charset *charset;
8896   Lisp_Object translation_table;
8897
8898   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8899   if (! NILP (translation_table))
8900     c = translate_char (translation_table, c);
8901   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8902        CONSP (tail); tail = XCDR (tail))
8903     {
8904       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8905       if (CHAR_CHARSET_P (c, charset))
8906         break;
8907     }
8908   return (! NILP (tail));
8909 }
8910
8911
8912 /* Return a list of coding systems that safely encode the text between
8913    START and END.  If EXCLUDE is non-nil, it is a list of coding
8914    systems not to check.  The returned list doesn't contain any such
8915    coding systems.  In any case, if the text contains only ASCII or is
8916    unibyte, return t.  */
8917
8918 DEFUN ("find-coding-systems-region-internal",
8919        Ffind_coding_systems_region_internal,
8920        Sfind_coding_systems_region_internal, 2, 3, 0,
8921        doc: /* Internal use only.  */)
8922   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8923 {
8924   Lisp_Object coding_attrs_list, safe_codings;
8925   ptrdiff_t start_byte, end_byte;
8926   const unsigned char *p, *pbeg, *pend;
8927   int c;
8928   Lisp_Object tail, elt, work_table;
8929
8930   if (STRINGP (start))
8931     {
8932       if (!STRING_MULTIBYTE (start)
8933           || SCHARS (start) == SBYTES (start))
8934         return Qt;
8935       start_byte = 0;
8936       end_byte = SBYTES (start);
8937     }
8938   else
8939     {
8940       CHECK_NUMBER_COERCE_MARKER (start);
8941       CHECK_NUMBER_COERCE_MARKER (end);
8942       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8943         args_out_of_range (start, end);
8944       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8945         return Qt;
8946       start_byte = CHAR_TO_BYTE (XINT (start));
8947       end_byte = CHAR_TO_BYTE (XINT (end));
8948       if (XINT (end) - XINT (start) == end_byte - start_byte)
8949         return Qt;
8950
8951       if (XINT (start) < GPT && XINT (end) > GPT)
8952         {
8953           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8954             move_gap_both (XINT (start), start_byte);
8955           else
8956             move_gap_both (XINT (end), end_byte);
8957         }
8958     }
8959
8960   coding_attrs_list = Qnil;
8961   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8962     if (NILP (exclude)
8963         || NILP (Fmemq (XCAR (tail), exclude)))
8964       {
8965         Lisp_Object attrs;
8966
8967         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8968         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
8969           {
8970             ASET (attrs, coding_attr_trans_tbl,
8971                   get_translation_table (attrs, 1, NULL));
8972             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8973           }
8974       }
8975
8976   if (STRINGP (start))
8977     p = pbeg = SDATA (start);
8978   else
8979     p = pbeg = BYTE_POS_ADDR (start_byte);
8980   pend = p + (end_byte - start_byte);
8981
8982   while (p < pend && ASCII_BYTE_P (*p)) p++;
8983   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8984
8985   work_table = Fmake_char_table (Qnil, Qnil);
8986   while (p < pend)
8987     {
8988       if (ASCII_BYTE_P (*p))
8989         p++;
8990       else
8991         {
8992           c = STRING_CHAR_ADVANCE (p);
8993           if (!NILP (char_table_ref (work_table, c)))
8994             /* This character was already checked.  Ignore it.  */
8995             continue;
8996
8997           charset_map_loaded = 0;
8998           for (tail = coding_attrs_list; CONSP (tail);)
8999             {
9000               elt = XCAR (tail);
9001               if (NILP (elt))
9002                 tail = XCDR (tail);
9003               else if (char_encodable_p (c, elt))
9004                 tail = XCDR (tail);
9005               else if (CONSP (XCDR (tail)))
9006                 {
9007                   XSETCAR (tail, XCAR (XCDR (tail)));
9008                   XSETCDR (tail, XCDR (XCDR (tail)));
9009                 }
9010               else
9011                 {
9012                   XSETCAR (tail, Qnil);
9013                   tail = XCDR (tail);
9014                 }
9015             }
9016           if (charset_map_loaded)
9017             {
9018               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9019
9020               if (STRINGP (start))
9021                 pbeg = SDATA (start);
9022               else
9023                 pbeg = BYTE_POS_ADDR (start_byte);
9024               p = pbeg + p_offset;
9025               pend = pbeg + pend_offset;
9026             }
9027           char_table_set (work_table, c, Qt);
9028         }
9029     }
9030
9031   safe_codings = list2 (Qraw_text, Qno_conversion);
9032   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9033     if (! NILP (XCAR (tail)))
9034       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9035
9036   return safe_codings;
9037 }
9038
9039
9040 DEFUN ("unencodable-char-position", Funencodable_char_position,
9041        Sunencodable_char_position, 3, 5, 0,
9042        doc: /*
9043 Return position of first un-encodable character in a region.
9044 START and END specify the region and CODING-SYSTEM specifies the
9045 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9046
9047 If optional 4th argument COUNT is non-nil, it specifies at most how
9048 many un-encodable characters to search.  In this case, the value is a
9049 list of positions.
9050
9051 If optional 5th argument STRING is non-nil, it is a string to search
9052 for un-encodable characters.  In that case, START and END are indexes
9053 to the string.  */)
9054   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9055 {
9056   EMACS_INT n;
9057   struct coding_system coding;
9058   Lisp_Object attrs, charset_list, translation_table;
9059   Lisp_Object positions;
9060   ptrdiff_t from, to;
9061   const unsigned char *p, *stop, *pend;
9062   bool ascii_compatible;
9063
9064   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9065   attrs = CODING_ID_ATTRS (coding.id);
9066   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9067     return Qnil;
9068   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9069   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9070   translation_table = get_translation_table (attrs, 1, NULL);
9071
9072   if (NILP (string))
9073     {
9074       validate_region (&start, &end);
9075       from = XINT (start);
9076       to = XINT (end);
9077       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9078           || (ascii_compatible
9079               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9080         return Qnil;
9081       p = CHAR_POS_ADDR (from);
9082       pend = CHAR_POS_ADDR (to);
9083       if (from < GPT && to >= GPT)
9084         stop = GPT_ADDR;
9085       else
9086         stop = pend;
9087     }
9088   else
9089     {
9090       CHECK_STRING (string);
9091       CHECK_NATNUM (start);
9092       CHECK_NATNUM (end);
9093       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9094         args_out_of_range_3 (string, start, end);
9095       from = XINT (start);
9096       to = XINT (end);
9097       if (! STRING_MULTIBYTE (string))
9098         return Qnil;
9099       p = SDATA (string) + string_char_to_byte (string, from);
9100       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9101       if (ascii_compatible && (to - from) == (pend - p))
9102         return Qnil;
9103     }
9104
9105   if (NILP (count))
9106     n = 1;
9107   else
9108     {
9109       CHECK_NATNUM (count);
9110       n = XINT (count);
9111     }
9112
9113   positions = Qnil;
9114   charset_map_loaded = 0;
9115   while (1)
9116     {
9117       int c;
9118
9119       if (ascii_compatible)
9120         while (p < stop && ASCII_BYTE_P (*p))
9121           p++, from++;
9122       if (p >= stop)
9123         {
9124           if (p >= pend)
9125             break;
9126           stop = pend;
9127           p = GAP_END_ADDR;
9128         }
9129
9130       c = STRING_CHAR_ADVANCE (p);
9131       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9132           && ! char_charset (translate_char (translation_table, c),
9133                              charset_list, NULL))
9134         {
9135           positions = Fcons (make_number (from), positions);
9136           n--;
9137           if (n == 0)
9138             break;
9139         }
9140
9141       from++;
9142       if (charset_map_loaded && NILP (string))
9143         {
9144           p = CHAR_POS_ADDR (from);
9145           pend = CHAR_POS_ADDR (to);
9146           if (from < GPT && to >= GPT)
9147             stop = GPT_ADDR;
9148           else
9149             stop = pend;
9150           charset_map_loaded = 0;
9151         }
9152     }
9153
9154   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9155 }
9156
9157
9158 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9159        Scheck_coding_systems_region, 3, 3, 0,
9160        doc: /* Check if the region is encodable by coding systems.
9161
9162 START and END are buffer positions specifying the region.
9163 CODING-SYSTEM-LIST is a list of coding systems to check.
9164
9165 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9166 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9167 whole region, POS0, POS1, ... are buffer positions where non-encodable
9168 characters are found.
9169
9170 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9171 value is nil.
9172
9173 START may be a string.  In that case, check if the string is
9174 encodable, and the value contains indices to the string instead of
9175 buffer positions.  END is ignored.
9176
9177 If the current buffer (or START if it is a string) is unibyte, the value
9178 is nil.  */)
9179   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9180 {
9181   Lisp_Object list;
9182   ptrdiff_t start_byte, end_byte;
9183   ptrdiff_t pos;
9184   const unsigned char *p, *pbeg, *pend;
9185   int c;
9186   Lisp_Object tail, elt, attrs;
9187
9188   if (STRINGP (start))
9189     {
9190       if (!STRING_MULTIBYTE (start)
9191           || SCHARS (start) == SBYTES (start))
9192         return Qnil;
9193       start_byte = 0;
9194       end_byte = SBYTES (start);
9195       pos = 0;
9196     }
9197   else
9198     {
9199       CHECK_NUMBER_COERCE_MARKER (start);
9200       CHECK_NUMBER_COERCE_MARKER (end);
9201       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9202         args_out_of_range (start, end);
9203       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9204         return Qnil;
9205       start_byte = CHAR_TO_BYTE (XINT (start));
9206       end_byte = CHAR_TO_BYTE (XINT (end));
9207       if (XINT (end) - XINT (start) == end_byte - start_byte)
9208         return Qnil;
9209
9210       if (XINT (start) < GPT && XINT (end) > GPT)
9211         {
9212           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9213             move_gap_both (XINT (start), start_byte);
9214           else
9215             move_gap_both (XINT (end), end_byte);
9216         }
9217       pos = XINT (start);
9218     }
9219
9220   list = Qnil;
9221   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9222     {
9223       elt = XCAR (tail);
9224       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9225       ASET (attrs, coding_attr_trans_tbl,
9226             get_translation_table (attrs, 1, NULL));
9227       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9228     }
9229
9230   if (STRINGP (start))
9231     p = pbeg = SDATA (start);
9232   else
9233     p = pbeg = BYTE_POS_ADDR (start_byte);
9234   pend = p + (end_byte - start_byte);
9235
9236   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9237   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9238
9239   while (p < pend)
9240     {
9241       if (ASCII_BYTE_P (*p))
9242         p++;
9243       else
9244         {
9245           c = STRING_CHAR_ADVANCE (p);
9246
9247           charset_map_loaded = 0;
9248           for (tail = list; CONSP (tail); tail = XCDR (tail))
9249             {
9250               elt = XCDR (XCAR (tail));
9251               if (! char_encodable_p (c, XCAR (elt)))
9252                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9253             }
9254           if (charset_map_loaded)
9255             {
9256               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9257
9258               if (STRINGP (start))
9259                 pbeg = SDATA (start);
9260               else
9261                 pbeg = BYTE_POS_ADDR (start_byte);
9262               p = pbeg + p_offset;
9263               pend = pbeg + pend_offset;
9264             }
9265         }
9266       pos++;
9267     }
9268
9269   tail = list;
9270   list = Qnil;
9271   for (; CONSP (tail); tail = XCDR (tail))
9272     {
9273       elt = XCAR (tail);
9274       if (CONSP (XCDR (XCDR (elt))))
9275         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9276                       list);
9277     }
9278
9279   return list;
9280 }
9281
9282
9283 static Lisp_Object
9284 code_convert_region (Lisp_Object start, Lisp_Object end,
9285                      Lisp_Object coding_system, Lisp_Object dst_object,
9286                      bool encodep, bool norecord)
9287 {
9288   struct coding_system coding;
9289   ptrdiff_t from, from_byte, to, to_byte;
9290   Lisp_Object src_object;
9291
9292   if (NILP (coding_system))
9293     coding_system = Qno_conversion;
9294   else
9295     CHECK_CODING_SYSTEM (coding_system);
9296   src_object = Fcurrent_buffer ();
9297   if (NILP (dst_object))
9298     dst_object = src_object;
9299   else if (! EQ (dst_object, Qt))
9300     CHECK_BUFFER (dst_object);
9301
9302   validate_region (&start, &end);
9303   from = XFASTINT (start);
9304   from_byte = CHAR_TO_BYTE (from);
9305   to = XFASTINT (end);
9306   to_byte = CHAR_TO_BYTE (to);
9307
9308   setup_coding_system (coding_system, &coding);
9309   coding.mode |= CODING_MODE_LAST_BLOCK;
9310
9311   if (encodep)
9312     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9313                           dst_object);
9314   else
9315     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9316                           dst_object);
9317   if (! norecord)
9318     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9319
9320   return (BUFFERP (dst_object)
9321           ? make_number (coding.produced_char)
9322           : coding.dst_object);
9323 }
9324
9325
9326 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9327        3, 4, "r\nzCoding system: ",
9328        doc: /* Decode the current region from the specified coding system.
9329 When called from a program, takes four arguments:
9330         START, END, CODING-SYSTEM, and DESTINATION.
9331 START and END are buffer positions.
9332
9333 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9334 If nil, the region between START and END is replaced by the decoded text.
9335 If buffer, the decoded text is inserted in that buffer after point (point
9336 does not move).
9337 In those cases, the length of the decoded text is returned.
9338 If DESTINATION is t, the decoded text is returned.
9339
9340 This function sets `last-coding-system-used' to the precise coding system
9341 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9342 not fully specified.)  */)
9343   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9344 {
9345   return code_convert_region (start, end, coding_system, destination, 0, 0);
9346 }
9347
9348 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9349        3, 4, "r\nzCoding system: ",
9350        doc: /* Encode the current region by specified coding system.
9351 When called from a program, takes four arguments:
9352         START, END, CODING-SYSTEM and DESTINATION.
9353 START and END are buffer positions.
9354
9355 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9356 If nil, the region between START and END is replace by the encoded text.
9357 If buffer, the encoded text is inserted in that buffer after point (point
9358 does not move).
9359 In those cases, the length of the encoded text is returned.
9360 If DESTINATION is t, the encoded text is returned.
9361
9362 This function sets `last-coding-system-used' to the precise coding system
9363 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9364 not fully specified.)  */)
9365   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9366 {
9367   return code_convert_region (start, end, coding_system, destination, 1, 0);
9368 }
9369
9370 Lisp_Object
9371 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9372                      Lisp_Object dst_object, bool encodep, bool nocopy,
9373                      bool norecord)
9374 {
9375   struct coding_system coding;
9376   ptrdiff_t chars, bytes;
9377
9378   CHECK_STRING (string);
9379   if (NILP (coding_system))
9380     {
9381       if (! norecord)
9382         Vlast_coding_system_used = Qno_conversion;
9383       if (NILP (dst_object))
9384         return (nocopy ? Fcopy_sequence (string) : string);
9385     }
9386
9387   if (NILP (coding_system))
9388     coding_system = Qno_conversion;
9389   else
9390     CHECK_CODING_SYSTEM (coding_system);
9391   if (NILP (dst_object))
9392     dst_object = Qt;
9393   else if (! EQ (dst_object, Qt))
9394     CHECK_BUFFER (dst_object);
9395
9396   setup_coding_system (coding_system, &coding);
9397   coding.mode |= CODING_MODE_LAST_BLOCK;
9398   chars = SCHARS (string);
9399   bytes = SBYTES (string);
9400   if (encodep)
9401     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9402   else
9403     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9404   if (! norecord)
9405     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9406
9407   return (BUFFERP (dst_object)
9408           ? make_number (coding.produced_char)
9409           : coding.dst_object);
9410 }
9411
9412
9413 /* Encode or decode STRING according to CODING_SYSTEM.
9414    Do not set Vlast_coding_system_used.
9415
9416    This function is called only from macros DECODE_FILE and
9417    ENCODE_FILE, thus we ignore character composition.  */
9418
9419 Lisp_Object
9420 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9421                               bool encodep)
9422 {
9423   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9424 }
9425
9426
9427 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9428        2, 4, 0,
9429        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9430
9431 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9432 if the decoding operation is trivial.
9433
9434 Optional fourth arg BUFFER non-nil means that the decoded text is
9435 inserted in that buffer after point (point does not move).  In this
9436 case, the return value is the length of the decoded text.
9437
9438 This function sets `last-coding-system-used' to the precise coding system
9439 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9440 not fully specified.)  */)
9441   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9442 {
9443   return code_convert_string (string, coding_system, buffer,
9444                               0, ! NILP (nocopy), 0);
9445 }
9446
9447 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9448        2, 4, 0,
9449        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9450
9451 Optional third arg NOCOPY non-nil means it is OK to return STRING
9452 itself if the encoding operation is trivial.
9453
9454 Optional fourth arg BUFFER non-nil means that the encoded text is
9455 inserted in that buffer after point (point does not move).  In this
9456 case, the return value is the length of the encoded text.
9457
9458 This function sets `last-coding-system-used' to the precise coding system
9459 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9460 not fully specified.)  */)
9461   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9462 {
9463   return code_convert_string (string, coding_system, buffer,
9464                               1, ! NILP (nocopy), 0);
9465 }
9466
9467 \f
9468 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9469        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9470 Return the corresponding character.  */)
9471   (Lisp_Object code)
9472 {
9473   Lisp_Object spec, attrs, val;
9474   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9475   EMACS_INT ch;
9476   int c;
9477
9478   CHECK_NATNUM (code);
9479   ch = XFASTINT (code);
9480   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9481   attrs = AREF (spec, 0);
9482
9483   if (ASCII_BYTE_P (ch)
9484       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9485     return code;
9486
9487   val = CODING_ATTR_CHARSET_LIST (attrs);
9488   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9489   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9490   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9491
9492   if (ch <= 0x7F)
9493     {
9494       c = ch;
9495       charset = charset_roman;
9496     }
9497   else if (ch >= 0xA0 && ch < 0xDF)
9498     {
9499       c = ch - 0x80;
9500       charset = charset_kana;
9501     }
9502   else
9503     {
9504       EMACS_INT c1 = ch >> 8;
9505       int c2 = ch & 0xFF;
9506
9507       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9508           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9509         error ("Invalid code: %"pI"d", ch);
9510       c = ch;
9511       SJIS_TO_JIS (c);
9512       charset = charset_kanji;
9513     }
9514   c = DECODE_CHAR (charset, c);
9515   if (c < 0)
9516     error ("Invalid code: %"pI"d", ch);
9517   return make_number (c);
9518 }
9519
9520
9521 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9522        doc: /* Encode a Japanese character CH to shift_jis encoding.
9523 Return the corresponding code in SJIS.  */)
9524   (Lisp_Object ch)
9525 {
9526   Lisp_Object spec, attrs, charset_list;
9527   int c;
9528   struct charset *charset;
9529   unsigned code;
9530
9531   CHECK_CHARACTER (ch);
9532   c = XFASTINT (ch);
9533   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9534   attrs = AREF (spec, 0);
9535
9536   if (ASCII_CHAR_P (c)
9537       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9538     return ch;
9539
9540   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9541   charset = char_charset (c, charset_list, &code);
9542   if (code == CHARSET_INVALID_CODE (charset))
9543     error ("Can't encode by shift_jis encoding: %c", c);
9544   JIS_TO_SJIS (code);
9545
9546   return make_number (code);
9547 }
9548
9549 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9550        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9551 Return the corresponding character.  */)
9552   (Lisp_Object code)
9553 {
9554   Lisp_Object spec, attrs, val;
9555   struct charset *charset_roman, *charset_big5, *charset;
9556   EMACS_INT ch;
9557   int c;
9558
9559   CHECK_NATNUM (code);
9560   ch = XFASTINT (code);
9561   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9562   attrs = AREF (spec, 0);
9563
9564   if (ASCII_BYTE_P (ch)
9565       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9566     return code;
9567
9568   val = CODING_ATTR_CHARSET_LIST (attrs);
9569   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9570   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9571
9572   if (ch <= 0x7F)
9573     {
9574       c = ch;
9575       charset = charset_roman;
9576     }
9577   else
9578     {
9579       EMACS_INT b1 = ch >> 8;
9580       int b2 = ch & 0x7F;
9581       if (b1 < 0xA1 || b1 > 0xFE
9582           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9583         error ("Invalid code: %"pI"d", ch);
9584       c = ch;
9585       charset = charset_big5;
9586     }
9587   c = DECODE_CHAR (charset, c);
9588   if (c < 0)
9589     error ("Invalid code: %"pI"d", ch);
9590   return make_number (c);
9591 }
9592
9593 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9594        doc: /* Encode the Big5 character CH to BIG5 coding system.
9595 Return the corresponding character code in Big5.  */)
9596   (Lisp_Object ch)
9597 {
9598   Lisp_Object spec, attrs, charset_list;
9599   struct charset *charset;
9600   int c;
9601   unsigned code;
9602
9603   CHECK_CHARACTER (ch);
9604   c = XFASTINT (ch);
9605   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9606   attrs = AREF (spec, 0);
9607   if (ASCII_CHAR_P (c)
9608       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9609     return ch;
9610
9611   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9612   charset = char_charset (c, charset_list, &code);
9613   if (code == CHARSET_INVALID_CODE (charset))
9614     error ("Can't encode by Big5 encoding: %c", c);
9615
9616   return make_number (code);
9617 }
9618
9619 \f
9620 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9621        Sset_terminal_coding_system_internal, 1, 2, 0,
9622        doc: /* Internal use only.  */)
9623   (Lisp_Object coding_system, Lisp_Object terminal)
9624 {
9625   struct terminal *term = get_terminal (terminal, 1);
9626   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9627   CHECK_SYMBOL (coding_system);
9628   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9629   /* We had better not send unsafe characters to terminal.  */
9630   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9631   /* Character composition should be disabled.  */
9632   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9633   terminal_coding->src_multibyte = 1;
9634   terminal_coding->dst_multibyte = 0;
9635   tset_charset_list
9636     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9637             ? coding_charset_list (terminal_coding)
9638             : Fcons (make_number (charset_ascii), Qnil)));
9639   return Qnil;
9640 }
9641
9642 DEFUN ("set-safe-terminal-coding-system-internal",
9643        Fset_safe_terminal_coding_system_internal,
9644        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9645        doc: /* Internal use only.  */)
9646   (Lisp_Object coding_system)
9647 {
9648   CHECK_SYMBOL (coding_system);
9649   setup_coding_system (Fcheck_coding_system (coding_system),
9650                        &safe_terminal_coding);
9651   /* Character composition should be disabled.  */
9652   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9653   safe_terminal_coding.src_multibyte = 1;
9654   safe_terminal_coding.dst_multibyte = 0;
9655   return Qnil;
9656 }
9657
9658 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9659        Sterminal_coding_system, 0, 1, 0,
9660        doc: /* Return coding system specified for terminal output on the given terminal.
9661 TERMINAL may be a terminal object, a frame, or nil for the selected
9662 frame's terminal device.  */)
9663   (Lisp_Object terminal)
9664 {
9665   struct coding_system *terminal_coding
9666     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9667   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9668
9669   /* For backward compatibility, return nil if it is `undecided'.  */
9670   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9671 }
9672
9673 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9674        Sset_keyboard_coding_system_internal, 1, 2, 0,
9675        doc: /* Internal use only.  */)
9676   (Lisp_Object coding_system, Lisp_Object terminal)
9677 {
9678   struct terminal *t = get_terminal (terminal, 1);
9679   CHECK_SYMBOL (coding_system);
9680   if (NILP (coding_system))
9681     coding_system = Qno_conversion;
9682   else
9683     Fcheck_coding_system (coding_system);
9684   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9685   /* Character composition should be disabled.  */
9686   TERMINAL_KEYBOARD_CODING (t)->common_flags
9687     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9688   return Qnil;
9689 }
9690
9691 DEFUN ("keyboard-coding-system",
9692        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9693        doc: /* Return coding system specified for decoding keyboard input.  */)
9694   (Lisp_Object terminal)
9695 {
9696   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9697                          (get_terminal (terminal, 1))->id);
9698 }
9699
9700 \f
9701 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9702        Sfind_operation_coding_system,  1, MANY, 0,
9703        doc: /* Choose a coding system for an operation based on the target name.
9704 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9705 DECODING-SYSTEM is the coding system to use for decoding
9706 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9707 for encoding (in case OPERATION does encoding).
9708
9709 The first argument OPERATION specifies an I/O primitive:
9710   For file I/O, `insert-file-contents' or `write-region'.
9711   For process I/O, `call-process', `call-process-region', or `start-process'.
9712   For network I/O, `open-network-stream'.
9713
9714 The remaining arguments should be the same arguments that were passed
9715 to the primitive.  Depending on which primitive, one of those arguments
9716 is selected as the TARGET.  For example, if OPERATION does file I/O,
9717 whichever argument specifies the file name is TARGET.
9718
9719 TARGET has a meaning which depends on OPERATION:
9720   For file I/O, TARGET is a file name (except for the special case below).
9721   For process I/O, TARGET is a process name.
9722   For network I/O, TARGET is a service name or a port number.
9723
9724 This function looks up what is specified for TARGET in
9725 `file-coding-system-alist', `process-coding-system-alist',
9726 or `network-coding-system-alist' depending on OPERATION.
9727 They may specify a coding system, a cons of coding systems,
9728 or a function symbol to call.
9729 In the last case, we call the function with one argument,
9730 which is a list of all the arguments given to this function.
9731 If the function can't decide a coding system, it can return
9732 `undecided' so that the normal code-detection is performed.
9733
9734 If OPERATION is `insert-file-contents', the argument corresponding to
9735 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9736 file name to look up, and BUFFER is a buffer that contains the file's
9737 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9738 function to call for FILENAME, that function should examine the
9739 contents of BUFFER instead of reading the file.
9740
9741 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9742   (ptrdiff_t nargs, Lisp_Object *args)
9743 {
9744   Lisp_Object operation, target_idx, target, val;
9745   register Lisp_Object chain;
9746
9747   if (nargs < 2)
9748     error ("Too few arguments");
9749   operation = args[0];
9750   if (!SYMBOLP (operation)
9751       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9752     error ("Invalid first argument");
9753   if (nargs <= 1 + XFASTINT (target_idx))
9754     error ("Too few arguments for operation `%s'",
9755            SDATA (SYMBOL_NAME (operation)));
9756   target = args[XFASTINT (target_idx) + 1];
9757   if (!(STRINGP (target)
9758         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9759             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9760         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9761     error ("Invalid argument %"pI"d of operation `%s'",
9762            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9763   if (CONSP (target))
9764     target = XCAR (target);
9765
9766   chain = ((EQ (operation, Qinsert_file_contents)
9767             || EQ (operation, Qwrite_region))
9768            ? Vfile_coding_system_alist
9769            : (EQ (operation, Qopen_network_stream)
9770               ? Vnetwork_coding_system_alist
9771               : Vprocess_coding_system_alist));
9772   if (NILP (chain))
9773     return Qnil;
9774
9775   for (; CONSP (chain); chain = XCDR (chain))
9776     {
9777       Lisp_Object elt;
9778
9779       elt = XCAR (chain);
9780       if (CONSP (elt)
9781           && ((STRINGP (target)
9782                && STRINGP (XCAR (elt))
9783                && fast_string_match (XCAR (elt), target) >= 0)
9784               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9785         {
9786           val = XCDR (elt);
9787           /* Here, if VAL is both a valid coding system and a valid
9788              function symbol, we return VAL as a coding system.  */
9789           if (CONSP (val))
9790             return val;
9791           if (! SYMBOLP (val))
9792             return Qnil;
9793           if (! NILP (Fcoding_system_p (val)))
9794             return Fcons (val, val);
9795           if (! NILP (Ffboundp (val)))
9796             {
9797               /* We use call1 rather than safe_call1
9798                  so as to get bug reports about functions called here
9799                  which don't handle the current interface.  */
9800               val = call1 (val, Flist (nargs, args));
9801               if (CONSP (val))
9802                 return val;
9803               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9804                 return Fcons (val, val);
9805             }
9806           return Qnil;
9807         }
9808     }
9809   return Qnil;
9810 }
9811
9812 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9813        Sset_coding_system_priority, 0, MANY, 0,
9814        doc: /* Assign higher priority to the coding systems given as arguments.
9815 If multiple coding systems belong to the same category,
9816 all but the first one are ignored.
9817
9818 usage: (set-coding-system-priority &rest coding-systems)  */)
9819   (ptrdiff_t nargs, Lisp_Object *args)
9820 {
9821   ptrdiff_t i, j;
9822   bool changed[coding_category_max];
9823   enum coding_category priorities[coding_category_max];
9824
9825   memset (changed, 0, sizeof changed);
9826
9827   for (i = j = 0; i < nargs; i++)
9828     {
9829       enum coding_category category;
9830       Lisp_Object spec, attrs;
9831
9832       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9833       attrs = AREF (spec, 0);
9834       category = XINT (CODING_ATTR_CATEGORY (attrs));
9835       if (changed[category])
9836         /* Ignore this coding system because a coding system of the
9837            same category already had a higher priority.  */
9838         continue;
9839       changed[category] = 1;
9840       priorities[j++] = category;
9841       if (coding_categories[category].id >= 0
9842           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9843         setup_coding_system (args[i], &coding_categories[category]);
9844       Fset (AREF (Vcoding_category_table, category), args[i]);
9845     }
9846
9847   /* Now we have decided top J priorities.  Reflect the order of the
9848      original priorities to the remaining priorities.  */
9849
9850   for (i = j, j = 0; i < coding_category_max; i++, j++)
9851     {
9852       while (j < coding_category_max
9853              && changed[coding_priorities[j]])
9854         j++;
9855       if (j == coding_category_max)
9856         emacs_abort ();
9857       priorities[i] = coding_priorities[j];
9858     }
9859
9860   memcpy (coding_priorities, priorities, sizeof priorities);
9861
9862   /* Update `coding-category-list'.  */
9863   Vcoding_category_list = Qnil;
9864   for (i = coding_category_max; i-- > 0; )
9865     Vcoding_category_list
9866       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9867                Vcoding_category_list);
9868
9869   return Qnil;
9870 }
9871
9872 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9873        Scoding_system_priority_list, 0, 1, 0,
9874        doc: /* Return a list of coding systems ordered by their priorities.
9875 The list contains a subset of coding systems; i.e. coding systems
9876 assigned to each coding category (see `coding-category-list').
9877
9878 HIGHESTP non-nil means just return the highest priority one.  */)
9879   (Lisp_Object highestp)
9880 {
9881   int i;
9882   Lisp_Object val;
9883
9884   for (i = 0, val = Qnil; i < coding_category_max; i++)
9885     {
9886       enum coding_category category = coding_priorities[i];
9887       int id = coding_categories[category].id;
9888       Lisp_Object attrs;
9889
9890       if (id < 0)
9891         continue;
9892       attrs = CODING_ID_ATTRS (id);
9893       if (! NILP (highestp))
9894         return CODING_ATTR_BASE_NAME (attrs);
9895       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9896     }
9897   return Fnreverse (val);
9898 }
9899
9900 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9901
9902 static Lisp_Object
9903 make_subsidiaries (Lisp_Object base)
9904 {
9905   Lisp_Object subsidiaries;
9906   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9907   char *buf = alloca (base_name_len + 6);
9908   int i;
9909
9910   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9911   subsidiaries = make_uninit_vector (3);
9912   for (i = 0; i < 3; i++)
9913     {
9914       strcpy (buf + base_name_len, suffixes[i]);
9915       ASET (subsidiaries, i, intern (buf));
9916     }
9917   return subsidiaries;
9918 }
9919
9920
9921 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9922        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9923        doc: /* For internal use only.
9924 usage: (define-coding-system-internal ...)  */)
9925   (ptrdiff_t nargs, Lisp_Object *args)
9926 {
9927   Lisp_Object name;
9928   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9929   Lisp_Object attrs;            /* Vector of attributes.  */
9930   Lisp_Object eol_type;
9931   Lisp_Object aliases;
9932   Lisp_Object coding_type, charset_list, safe_charsets;
9933   enum coding_category category;
9934   Lisp_Object tail, val;
9935   int max_charset_id = 0;
9936   int i;
9937
9938   if (nargs < coding_arg_max)
9939     goto short_args;
9940
9941   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9942
9943   name = args[coding_arg_name];
9944   CHECK_SYMBOL (name);
9945   ASET (attrs, coding_attr_base_name, name);
9946
9947   val = args[coding_arg_mnemonic];
9948   if (! STRINGP (val))
9949     CHECK_CHARACTER (val);
9950   ASET (attrs, coding_attr_mnemonic, val);
9951
9952   coding_type = args[coding_arg_coding_type];
9953   CHECK_SYMBOL (coding_type);
9954   ASET (attrs, coding_attr_type, coding_type);
9955
9956   charset_list = args[coding_arg_charset_list];
9957   if (SYMBOLP (charset_list))
9958     {
9959       if (EQ (charset_list, Qiso_2022))
9960         {
9961           if (! EQ (coding_type, Qiso_2022))
9962             error ("Invalid charset-list");
9963           charset_list = Viso_2022_charset_list;
9964         }
9965       else if (EQ (charset_list, Qemacs_mule))
9966         {
9967           if (! EQ (coding_type, Qemacs_mule))
9968             error ("Invalid charset-list");
9969           charset_list = Vemacs_mule_charset_list;
9970         }
9971       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9972         {
9973           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9974             error ("Invalid charset-list");
9975           if (max_charset_id < XFASTINT (XCAR (tail)))
9976             max_charset_id = XFASTINT (XCAR (tail));
9977         }
9978     }
9979   else
9980     {
9981       charset_list = Fcopy_sequence (charset_list);
9982       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9983         {
9984           struct charset *charset;
9985
9986           val = XCAR (tail);
9987           CHECK_CHARSET_GET_CHARSET (val, charset);
9988           if (EQ (coding_type, Qiso_2022)
9989               ? CHARSET_ISO_FINAL (charset) < 0
9990               : EQ (coding_type, Qemacs_mule)
9991               ? CHARSET_EMACS_MULE_ID (charset) < 0
9992               : 0)
9993             error ("Can't handle charset `%s'",
9994                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9995
9996           XSETCAR (tail, make_number (charset->id));
9997           if (max_charset_id < charset->id)
9998             max_charset_id = charset->id;
9999         }
10000     }
10001   ASET (attrs, coding_attr_charset_list, charset_list);
10002
10003   safe_charsets = make_uninit_string (max_charset_id + 1);
10004   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10005   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10006     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10007   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10008
10009   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10010
10011   val = args[coding_arg_decode_translation_table];
10012   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10013     CHECK_SYMBOL (val);
10014   ASET (attrs, coding_attr_decode_tbl, val);
10015
10016   val = args[coding_arg_encode_translation_table];
10017   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10018     CHECK_SYMBOL (val);
10019   ASET (attrs, coding_attr_encode_tbl, val);
10020
10021   val = args[coding_arg_post_read_conversion];
10022   CHECK_SYMBOL (val);
10023   ASET (attrs, coding_attr_post_read, val);
10024
10025   val = args[coding_arg_pre_write_conversion];
10026   CHECK_SYMBOL (val);
10027   ASET (attrs, coding_attr_pre_write, val);
10028
10029   val = args[coding_arg_default_char];
10030   if (NILP (val))
10031     ASET (attrs, coding_attr_default_char, make_number (' '));
10032   else
10033     {
10034       CHECK_CHARACTER (val);
10035       ASET (attrs, coding_attr_default_char, val);
10036     }
10037
10038   val = args[coding_arg_for_unibyte];
10039   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10040
10041   val = args[coding_arg_plist];
10042   CHECK_LIST (val);
10043   ASET (attrs, coding_attr_plist, val);
10044
10045   if (EQ (coding_type, Qcharset))
10046     {
10047       /* Generate a lisp vector of 256 elements.  Each element is nil,
10048          integer, or a list of charset IDs.
10049
10050          If Nth element is nil, the byte code N is invalid in this
10051          coding system.
10052
10053          If Nth element is a number NUM, N is the first byte of a
10054          charset whose ID is NUM.
10055
10056          If Nth element is a list of charset IDs, N is the first byte
10057          of one of them.  The list is sorted by dimensions of the
10058          charsets.  A charset of smaller dimension comes first. */
10059       val = Fmake_vector (make_number (256), Qnil);
10060
10061       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10062         {
10063           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10064           int dim = CHARSET_DIMENSION (charset);
10065           int idx = (dim - 1) * 4;
10066
10067           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10068             ASET (attrs, coding_attr_ascii_compat, Qt);
10069
10070           for (i = charset->code_space[idx];
10071                i <= charset->code_space[idx + 1]; i++)
10072             {
10073               Lisp_Object tmp, tmp2;
10074               int dim2;
10075
10076               tmp = AREF (val, i);
10077               if (NILP (tmp))
10078                 tmp = XCAR (tail);
10079               else if (NUMBERP (tmp))
10080                 {
10081                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10082                   if (dim < dim2)
10083                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
10084                   else
10085                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
10086                 }
10087               else
10088                 {
10089                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10090                     {
10091                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10092                       if (dim < dim2)
10093                         break;
10094                     }
10095                   if (NILP (tmp2))
10096                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
10097                   else
10098                     {
10099                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10100                       XSETCAR (tmp2, XCAR (tail));
10101                     }
10102                 }
10103               ASET (val, i, tmp);
10104             }
10105         }
10106       ASET (attrs, coding_attr_charset_valids, val);
10107       category = coding_category_charset;
10108     }
10109   else if (EQ (coding_type, Qccl))
10110     {
10111       Lisp_Object valids;
10112
10113       if (nargs < coding_arg_ccl_max)
10114         goto short_args;
10115
10116       val = args[coding_arg_ccl_decoder];
10117       CHECK_CCL_PROGRAM (val);
10118       if (VECTORP (val))
10119         val = Fcopy_sequence (val);
10120       ASET (attrs, coding_attr_ccl_decoder, val);
10121
10122       val = args[coding_arg_ccl_encoder];
10123       CHECK_CCL_PROGRAM (val);
10124       if (VECTORP (val))
10125         val = Fcopy_sequence (val);
10126       ASET (attrs, coding_attr_ccl_encoder, val);
10127
10128       val = args[coding_arg_ccl_valids];
10129       valids = Fmake_string (make_number (256), make_number (0));
10130       for (tail = val; CONSP (tail); tail = XCDR (tail))
10131         {
10132           int from, to;
10133
10134           val = XCAR (tail);
10135           if (INTEGERP (val))
10136             {
10137               if (! (0 <= XINT (val) && XINT (val) <= 255))
10138                 args_out_of_range_3 (val, make_number (0), make_number (255));
10139               from = to = XINT (val);
10140             }
10141           else
10142             {
10143               CHECK_CONS (val);
10144               CHECK_NATNUM_CAR (val);
10145               CHECK_NUMBER_CDR (val);
10146               if (XINT (XCAR (val)) > 255)
10147                 args_out_of_range_3 (XCAR (val),
10148                                      make_number (0), make_number (255));
10149               from = XINT (XCAR (val));
10150               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10151                 args_out_of_range_3 (XCDR (val),
10152                                      XCAR (val), make_number (255));
10153               to = XINT (XCDR (val));
10154             }
10155           for (i = from; i <= to; i++)
10156             SSET (valids, i, 1);
10157         }
10158       ASET (attrs, coding_attr_ccl_valids, valids);
10159
10160       category = coding_category_ccl;
10161     }
10162   else if (EQ (coding_type, Qutf_16))
10163     {
10164       Lisp_Object bom, endian;
10165
10166       ASET (attrs, coding_attr_ascii_compat, Qnil);
10167
10168       if (nargs < coding_arg_utf16_max)
10169         goto short_args;
10170
10171       bom = args[coding_arg_utf16_bom];
10172       if (! NILP (bom) && ! EQ (bom, Qt))
10173         {
10174           CHECK_CONS (bom);
10175           val = XCAR (bom);
10176           CHECK_CODING_SYSTEM (val);
10177           val = XCDR (bom);
10178           CHECK_CODING_SYSTEM (val);
10179         }
10180       ASET (attrs, coding_attr_utf_bom, bom);
10181
10182       endian = args[coding_arg_utf16_endian];
10183       CHECK_SYMBOL (endian);
10184       if (NILP (endian))
10185         endian = Qbig;
10186       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10187         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10188       ASET (attrs, coding_attr_utf_16_endian, endian);
10189
10190       category = (CONSP (bom)
10191                   ? coding_category_utf_16_auto
10192                   : NILP (bom)
10193                   ? (EQ (endian, Qbig)
10194                      ? coding_category_utf_16_be_nosig
10195                      : coding_category_utf_16_le_nosig)
10196                   : (EQ (endian, Qbig)
10197                      ? coding_category_utf_16_be
10198                      : coding_category_utf_16_le));
10199     }
10200   else if (EQ (coding_type, Qiso_2022))
10201     {
10202       Lisp_Object initial, reg_usage, request, flags;
10203
10204       if (nargs < coding_arg_iso2022_max)
10205         goto short_args;
10206
10207       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10208       CHECK_VECTOR (initial);
10209       for (i = 0; i < 4; i++)
10210         {
10211           val = AREF (initial, i);
10212           if (! NILP (val))
10213             {
10214               struct charset *charset;
10215
10216               CHECK_CHARSET_GET_CHARSET (val, charset);
10217               ASET (initial, i, make_number (CHARSET_ID (charset)));
10218               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10219                 ASET (attrs, coding_attr_ascii_compat, Qt);
10220             }
10221           else
10222             ASET (initial, i, make_number (-1));
10223         }
10224
10225       reg_usage = args[coding_arg_iso2022_reg_usage];
10226       CHECK_CONS (reg_usage);
10227       CHECK_NUMBER_CAR (reg_usage);
10228       CHECK_NUMBER_CDR (reg_usage);
10229
10230       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10231       for (tail = request; CONSP (tail); tail = XCDR (tail))
10232         {
10233           int id;
10234           Lisp_Object tmp1;
10235
10236           val = XCAR (tail);
10237           CHECK_CONS (val);
10238           tmp1 = XCAR (val);
10239           CHECK_CHARSET_GET_ID (tmp1, id);
10240           CHECK_NATNUM_CDR (val);
10241           if (XINT (XCDR (val)) >= 4)
10242             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10243           XSETCAR (val, make_number (id));
10244         }
10245
10246       flags = args[coding_arg_iso2022_flags];
10247       CHECK_NATNUM (flags);
10248       i = XINT (flags) & INT_MAX;
10249       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10250         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10251       flags = make_number (i);
10252
10253       ASET (attrs, coding_attr_iso_initial, initial);
10254       ASET (attrs, coding_attr_iso_usage, reg_usage);
10255       ASET (attrs, coding_attr_iso_request, request);
10256       ASET (attrs, coding_attr_iso_flags, flags);
10257       setup_iso_safe_charsets (attrs);
10258
10259       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10260         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10261                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10262                     ? coding_category_iso_7_else
10263                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10264                     ? coding_category_iso_7
10265                     : coding_category_iso_7_tight);
10266       else
10267         {
10268           int id = XINT (AREF (initial, 1));
10269
10270           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10271                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10272                        || id < 0)
10273                       ? coding_category_iso_8_else
10274                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10275                       ? coding_category_iso_8_1
10276                       : coding_category_iso_8_2);
10277         }
10278       if (category != coding_category_iso_8_1
10279           && category != coding_category_iso_8_2)
10280         ASET (attrs, coding_attr_ascii_compat, Qnil);
10281     }
10282   else if (EQ (coding_type, Qemacs_mule))
10283     {
10284       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10285         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10286       ASET (attrs, coding_attr_ascii_compat, Qt);
10287       category = coding_category_emacs_mule;
10288     }
10289   else if (EQ (coding_type, Qshift_jis))
10290     {
10291
10292       struct charset *charset;
10293
10294       if (XINT (Flength (charset_list)) != 3
10295           && XINT (Flength (charset_list)) != 4)
10296         error ("There should be three or four charsets");
10297
10298       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10299       if (CHARSET_DIMENSION (charset) != 1)
10300         error ("Dimension of charset %s is not one",
10301                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10302       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10303         ASET (attrs, coding_attr_ascii_compat, Qt);
10304
10305       charset_list = XCDR (charset_list);
10306       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10307       if (CHARSET_DIMENSION (charset) != 1)
10308         error ("Dimension of charset %s is not one",
10309                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10310
10311       charset_list = XCDR (charset_list);
10312       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10313       if (CHARSET_DIMENSION (charset) != 2)
10314         error ("Dimension of charset %s is not two",
10315                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10316
10317       charset_list = XCDR (charset_list);
10318       if (! NILP (charset_list))
10319         {
10320           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10321           if (CHARSET_DIMENSION (charset) != 2)
10322             error ("Dimension of charset %s is not two",
10323                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10324         }
10325
10326       category = coding_category_sjis;
10327       Vsjis_coding_system = name;
10328     }
10329   else if (EQ (coding_type, Qbig5))
10330     {
10331       struct charset *charset;
10332
10333       if (XINT (Flength (charset_list)) != 2)
10334         error ("There should be just two charsets");
10335
10336       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10337       if (CHARSET_DIMENSION (charset) != 1)
10338         error ("Dimension of charset %s is not one",
10339                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10340       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10341         ASET (attrs, coding_attr_ascii_compat, Qt);
10342
10343       charset_list = XCDR (charset_list);
10344       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10345       if (CHARSET_DIMENSION (charset) != 2)
10346         error ("Dimension of charset %s is not two",
10347                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10348
10349       category = coding_category_big5;
10350       Vbig5_coding_system = name;
10351     }
10352   else if (EQ (coding_type, Qraw_text))
10353     {
10354       category = coding_category_raw_text;
10355       ASET (attrs, coding_attr_ascii_compat, Qt);
10356     }
10357   else if (EQ (coding_type, Qutf_8))
10358     {
10359       Lisp_Object bom;
10360
10361       if (nargs < coding_arg_utf8_max)
10362         goto short_args;
10363
10364       bom = args[coding_arg_utf8_bom];
10365       if (! NILP (bom) && ! EQ (bom, Qt))
10366         {
10367           CHECK_CONS (bom);
10368           val = XCAR (bom);
10369           CHECK_CODING_SYSTEM (val);
10370           val = XCDR (bom);
10371           CHECK_CODING_SYSTEM (val);
10372         }
10373       ASET (attrs, coding_attr_utf_bom, bom);
10374       if (NILP (bom))
10375         ASET (attrs, coding_attr_ascii_compat, Qt);
10376
10377       category = (CONSP (bom) ? coding_category_utf_8_auto
10378                   : NILP (bom) ? coding_category_utf_8_nosig
10379                   : coding_category_utf_8_sig);
10380     }
10381   else if (EQ (coding_type, Qundecided))
10382     {
10383       if (nargs < coding_arg_undecided_max)
10384         goto short_args;
10385       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10386             args[coding_arg_undecided_inhibit_null_byte_detection]);
10387       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10388             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10389       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10390             args[coding_arg_undecided_prefer_utf_8]);
10391       category = coding_category_undecided;
10392     }
10393   else
10394     error ("Invalid coding system type: %s",
10395            SDATA (SYMBOL_NAME (coding_type)));
10396
10397   ASET (attrs, coding_attr_category, make_number (category));
10398   ASET (attrs, coding_attr_plist,
10399         Fcons (QCcategory,
10400                Fcons (AREF (Vcoding_category_table, category),
10401                       CODING_ATTR_PLIST (attrs))));
10402   ASET (attrs, coding_attr_plist,
10403         Fcons (QCascii_compatible_p,
10404                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10405                       CODING_ATTR_PLIST (attrs))));
10406
10407   eol_type = args[coding_arg_eol_type];
10408   if (! NILP (eol_type)
10409       && ! EQ (eol_type, Qunix)
10410       && ! EQ (eol_type, Qdos)
10411       && ! EQ (eol_type, Qmac))
10412     error ("Invalid eol-type");
10413
10414   aliases = Fcons (name, Qnil);
10415
10416   if (NILP (eol_type))
10417     {
10418       eol_type = make_subsidiaries (name);
10419       for (i = 0; i < 3; i++)
10420         {
10421           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10422
10423           this_name = AREF (eol_type, i);
10424           this_aliases = Fcons (this_name, Qnil);
10425           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10426           this_spec = make_uninit_vector (3);
10427           ASET (this_spec, 0, attrs);
10428           ASET (this_spec, 1, this_aliases);
10429           ASET (this_spec, 2, this_eol_type);
10430           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10431           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10432           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10433           if (NILP (val))
10434             Vcoding_system_alist
10435               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10436                        Vcoding_system_alist);
10437         }
10438     }
10439
10440   spec_vec = make_uninit_vector (3);
10441   ASET (spec_vec, 0, attrs);
10442   ASET (spec_vec, 1, aliases);
10443   ASET (spec_vec, 2, eol_type);
10444
10445   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10446   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10447   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10448   if (NILP (val))
10449     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10450                                   Vcoding_system_alist);
10451
10452   {
10453     int id = coding_categories[category].id;
10454
10455     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10456       setup_coding_system (name, &coding_categories[category]);
10457   }
10458
10459   return Qnil;
10460
10461  short_args:
10462   return Fsignal (Qwrong_number_of_arguments,
10463                   Fcons (intern ("define-coding-system-internal"),
10464                          make_number (nargs)));
10465 }
10466
10467
10468 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10469        3, 3, 0,
10470        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10471   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10472 {
10473   Lisp_Object spec, attrs;
10474
10475   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10476   attrs = AREF (spec, 0);
10477   if (EQ (prop, QCmnemonic))
10478     {
10479       if (! STRINGP (val))
10480         CHECK_CHARACTER (val);
10481       ASET (attrs, coding_attr_mnemonic, val);
10482     }
10483   else if (EQ (prop, QCdefault_char))
10484     {
10485       if (NILP (val))
10486         val = make_number (' ');
10487       else
10488         CHECK_CHARACTER (val);
10489       ASET (attrs, coding_attr_default_char, val);
10490     }
10491   else if (EQ (prop, QCdecode_translation_table))
10492     {
10493       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10494         CHECK_SYMBOL (val);
10495       ASET (attrs, coding_attr_decode_tbl, val);
10496     }
10497   else if (EQ (prop, QCencode_translation_table))
10498     {
10499       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10500         CHECK_SYMBOL (val);
10501       ASET (attrs, coding_attr_encode_tbl, val);
10502     }
10503   else if (EQ (prop, QCpost_read_conversion))
10504     {
10505       CHECK_SYMBOL (val);
10506       ASET (attrs, coding_attr_post_read, val);
10507     }
10508   else if (EQ (prop, QCpre_write_conversion))
10509     {
10510       CHECK_SYMBOL (val);
10511       ASET (attrs, coding_attr_pre_write, val);
10512     }
10513   else if (EQ (prop, QCascii_compatible_p))
10514     {
10515       ASET (attrs, coding_attr_ascii_compat, val);
10516     }
10517
10518   ASET (attrs, coding_attr_plist,
10519         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10520   return val;
10521 }
10522
10523
10524 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10525        Sdefine_coding_system_alias, 2, 2, 0,
10526        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10527   (Lisp_Object alias, Lisp_Object coding_system)
10528 {
10529   Lisp_Object spec, aliases, eol_type, val;
10530
10531   CHECK_SYMBOL (alias);
10532   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10533   aliases = AREF (spec, 1);
10534   /* ALIASES should be a list of length more than zero, and the first
10535      element is a base coding system.  Append ALIAS at the tail of the
10536      list.  */
10537   while (!NILP (XCDR (aliases)))
10538     aliases = XCDR (aliases);
10539   XSETCDR (aliases, Fcons (alias, Qnil));
10540
10541   eol_type = AREF (spec, 2);
10542   if (VECTORP (eol_type))
10543     {
10544       Lisp_Object subsidiaries;
10545       int i;
10546
10547       subsidiaries = make_subsidiaries (alias);
10548       for (i = 0; i < 3; i++)
10549         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10550                                      AREF (eol_type, i));
10551     }
10552
10553   Fputhash (alias, spec, Vcoding_system_hash_table);
10554   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10555   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10556   if (NILP (val))
10557     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10558                                   Vcoding_system_alist);
10559
10560   return Qnil;
10561 }
10562
10563 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10564        1, 1, 0,
10565        doc: /* Return the base of CODING-SYSTEM.
10566 Any alias or subsidiary coding system is not a base coding system.  */)
10567   (Lisp_Object coding_system)
10568 {
10569   Lisp_Object spec, attrs;
10570
10571   if (NILP (coding_system))
10572     return (Qno_conversion);
10573   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10574   attrs = AREF (spec, 0);
10575   return CODING_ATTR_BASE_NAME (attrs);
10576 }
10577
10578 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10579        1, 1, 0,
10580        doc: "Return the property list of CODING-SYSTEM.")
10581   (Lisp_Object coding_system)
10582 {
10583   Lisp_Object spec, attrs;
10584
10585   if (NILP (coding_system))
10586     coding_system = Qno_conversion;
10587   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10588   attrs = AREF (spec, 0);
10589   return CODING_ATTR_PLIST (attrs);
10590 }
10591
10592
10593 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10594        1, 1, 0,
10595        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10596   (Lisp_Object coding_system)
10597 {
10598   Lisp_Object spec;
10599
10600   if (NILP (coding_system))
10601     coding_system = Qno_conversion;
10602   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10603   return AREF (spec, 1);
10604 }
10605
10606 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10607        Scoding_system_eol_type, 1, 1, 0,
10608        doc: /* Return eol-type of CODING-SYSTEM.
10609 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10610
10611 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10612 and CR respectively.
10613
10614 A vector value indicates that a format of end-of-line should be
10615 detected automatically.  Nth element of the vector is the subsidiary
10616 coding system whose eol-type is N.  */)
10617   (Lisp_Object coding_system)
10618 {
10619   Lisp_Object spec, eol_type;
10620   int n;
10621
10622   if (NILP (coding_system))
10623     coding_system = Qno_conversion;
10624   if (! CODING_SYSTEM_P (coding_system))
10625     return Qnil;
10626   spec = CODING_SYSTEM_SPEC (coding_system);
10627   eol_type = AREF (spec, 2);
10628   if (VECTORP (eol_type))
10629     return Fcopy_sequence (eol_type);
10630   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10631   return make_number (n);
10632 }
10633
10634 #endif /* emacs */
10635
10636 \f
10637 /*** 9. Post-amble ***/
10638
10639 void
10640 init_coding_once (void)
10641 {
10642   int i;
10643
10644   for (i = 0; i < coding_category_max; i++)
10645     {
10646       coding_categories[i].id = -1;
10647       coding_priorities[i] = i;
10648     }
10649
10650   /* ISO2022 specific initialize routine.  */
10651   for (i = 0; i < 0x20; i++)
10652     iso_code_class[i] = ISO_control_0;
10653   for (i = 0x21; i < 0x7F; i++)
10654     iso_code_class[i] = ISO_graphic_plane_0;
10655   for (i = 0x80; i < 0xA0; i++)
10656     iso_code_class[i] = ISO_control_1;
10657   for (i = 0xA1; i < 0xFF; i++)
10658     iso_code_class[i] = ISO_graphic_plane_1;
10659   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10660   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10661   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10662   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10663   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10664   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10665   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10666   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10667   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10668
10669   for (i = 0; i < 256; i++)
10670     {
10671       emacs_mule_bytes[i] = 1;
10672     }
10673   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10674   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10675   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10676   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10677 }
10678
10679 #ifdef emacs
10680
10681 void
10682 syms_of_coding (void)
10683 {
10684   staticpro (&Vcoding_system_hash_table);
10685   {
10686     Lisp_Object args[2];
10687     args[0] = QCtest;
10688     args[1] = Qeq;
10689     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10690   }
10691
10692   staticpro (&Vsjis_coding_system);
10693   Vsjis_coding_system = Qnil;
10694
10695   staticpro (&Vbig5_coding_system);
10696   Vbig5_coding_system = Qnil;
10697
10698   staticpro (&Vcode_conversion_reused_workbuf);
10699   Vcode_conversion_reused_workbuf = Qnil;
10700
10701   staticpro (&Vcode_conversion_workbuf_name);
10702   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10703
10704   reused_workbuf_in_use = 0;
10705
10706   DEFSYM (Qcharset, "charset");
10707   DEFSYM (Qtarget_idx, "target-idx");
10708   DEFSYM (Qcoding_system_history, "coding-system-history");
10709   Fset (Qcoding_system_history, Qnil);
10710
10711   /* Target FILENAME is the first argument.  */
10712   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10713   /* Target FILENAME is the third argument.  */
10714   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10715
10716   DEFSYM (Qcall_process, "call-process");
10717   /* Target PROGRAM is the first argument.  */
10718   Fput (Qcall_process, Qtarget_idx, make_number (0));
10719
10720   DEFSYM (Qcall_process_region, "call-process-region");
10721   /* Target PROGRAM is the third argument.  */
10722   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10723
10724   DEFSYM (Qstart_process, "start-process");
10725   /* Target PROGRAM is the third argument.  */
10726   Fput (Qstart_process, Qtarget_idx, make_number (2));
10727
10728   DEFSYM (Qopen_network_stream, "open-network-stream");
10729   /* Target SERVICE is the fourth argument.  */
10730   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10731
10732   DEFSYM (Qcoding_system, "coding-system");
10733   DEFSYM (Qcoding_aliases, "coding-aliases");
10734
10735   DEFSYM (Qeol_type, "eol-type");
10736   DEFSYM (Qunix, "unix");
10737   DEFSYM (Qdos, "dos");
10738   DEFSYM (Qmac, "mac");
10739
10740   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10741   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10742   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10743   DEFSYM (Qdefault_char, "default-char");
10744   DEFSYM (Qundecided, "undecided");
10745   DEFSYM (Qno_conversion, "no-conversion");
10746   DEFSYM (Qraw_text, "raw-text");
10747
10748   DEFSYM (Qiso_2022, "iso-2022");
10749
10750   DEFSYM (Qutf_8, "utf-8");
10751   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10752
10753 #if defined (WINDOWSNT) || defined (CYGWIN)
10754   /* No, not utf-16-le: that one has a BOM.  */
10755   DEFSYM (Qutf_16le, "utf-16le");
10756 #endif
10757
10758   DEFSYM (Qutf_16, "utf-16");
10759   DEFSYM (Qbig, "big");
10760   DEFSYM (Qlittle, "little");
10761
10762   DEFSYM (Qshift_jis, "shift-jis");
10763   DEFSYM (Qbig5, "big5");
10764
10765   DEFSYM (Qcoding_system_p, "coding-system-p");
10766
10767   DEFSYM (Qcoding_system_error, "coding-system-error");
10768   Fput (Qcoding_system_error, Qerror_conditions,
10769         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10770   Fput (Qcoding_system_error, Qerror_message,
10771         build_pure_c_string ("Invalid coding system"));
10772
10773   /* Intern this now in case it isn't already done.
10774      Setting this variable twice is harmless.
10775      But don't staticpro it here--that is done in alloc.c.  */
10776   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10777
10778   DEFSYM (Qtranslation_table, "translation-table");
10779   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10780   DEFSYM (Qtranslation_table_id, "translation-table-id");
10781   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10782   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10783
10784   DEFSYM (Qvalid_codes, "valid-codes");
10785
10786   DEFSYM (Qemacs_mule, "emacs-mule");
10787
10788   DEFSYM (QCcategory, ":category");
10789   DEFSYM (QCmnemonic, ":mnemonic");
10790   DEFSYM (QCdefault_char, ":default-char");
10791   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10792   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10793   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10794   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10795   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10796
10797   Vcoding_category_table
10798     = Fmake_vector (make_number (coding_category_max), Qnil);
10799   staticpro (&Vcoding_category_table);
10800   /* Followings are target of code detection.  */
10801   ASET (Vcoding_category_table, coding_category_iso_7,
10802         intern_c_string ("coding-category-iso-7"));
10803   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10804         intern_c_string ("coding-category-iso-7-tight"));
10805   ASET (Vcoding_category_table, coding_category_iso_8_1,
10806         intern_c_string ("coding-category-iso-8-1"));
10807   ASET (Vcoding_category_table, coding_category_iso_8_2,
10808         intern_c_string ("coding-category-iso-8-2"));
10809   ASET (Vcoding_category_table, coding_category_iso_7_else,
10810         intern_c_string ("coding-category-iso-7-else"));
10811   ASET (Vcoding_category_table, coding_category_iso_8_else,
10812         intern_c_string ("coding-category-iso-8-else"));
10813   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10814         intern_c_string ("coding-category-utf-8-auto"));
10815   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10816         intern_c_string ("coding-category-utf-8"));
10817   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10818         intern_c_string ("coding-category-utf-8-sig"));
10819   ASET (Vcoding_category_table, coding_category_utf_16_be,
10820         intern_c_string ("coding-category-utf-16-be"));
10821   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10822         intern_c_string ("coding-category-utf-16-auto"));
10823   ASET (Vcoding_category_table, coding_category_utf_16_le,
10824         intern_c_string ("coding-category-utf-16-le"));
10825   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10826         intern_c_string ("coding-category-utf-16-be-nosig"));
10827   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10828         intern_c_string ("coding-category-utf-16-le-nosig"));
10829   ASET (Vcoding_category_table, coding_category_charset,
10830         intern_c_string ("coding-category-charset"));
10831   ASET (Vcoding_category_table, coding_category_sjis,
10832         intern_c_string ("coding-category-sjis"));
10833   ASET (Vcoding_category_table, coding_category_big5,
10834         intern_c_string ("coding-category-big5"));
10835   ASET (Vcoding_category_table, coding_category_ccl,
10836         intern_c_string ("coding-category-ccl"));
10837   ASET (Vcoding_category_table, coding_category_emacs_mule,
10838         intern_c_string ("coding-category-emacs-mule"));
10839   /* Followings are NOT target of code detection.  */
10840   ASET (Vcoding_category_table, coding_category_raw_text,
10841         intern_c_string ("coding-category-raw-text"));
10842   ASET (Vcoding_category_table, coding_category_undecided,
10843         intern_c_string ("coding-category-undecided"));
10844
10845   DEFSYM (Qinsufficient_source, "insufficient-source");
10846   DEFSYM (Qinvalid_source, "invalid-source");
10847   DEFSYM (Qinterrupted, "interrupted");
10848   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10849
10850   defsubr (&Scoding_system_p);
10851   defsubr (&Sread_coding_system);
10852   defsubr (&Sread_non_nil_coding_system);
10853   defsubr (&Scheck_coding_system);
10854   defsubr (&Sdetect_coding_region);
10855   defsubr (&Sdetect_coding_string);
10856   defsubr (&Sfind_coding_systems_region_internal);
10857   defsubr (&Sunencodable_char_position);
10858   defsubr (&Scheck_coding_systems_region);
10859   defsubr (&Sdecode_coding_region);
10860   defsubr (&Sencode_coding_region);
10861   defsubr (&Sdecode_coding_string);
10862   defsubr (&Sencode_coding_string);
10863   defsubr (&Sdecode_sjis_char);
10864   defsubr (&Sencode_sjis_char);
10865   defsubr (&Sdecode_big5_char);
10866   defsubr (&Sencode_big5_char);
10867   defsubr (&Sset_terminal_coding_system_internal);
10868   defsubr (&Sset_safe_terminal_coding_system_internal);
10869   defsubr (&Sterminal_coding_system);
10870   defsubr (&Sset_keyboard_coding_system_internal);
10871   defsubr (&Skeyboard_coding_system);
10872   defsubr (&Sfind_operation_coding_system);
10873   defsubr (&Sset_coding_system_priority);
10874   defsubr (&Sdefine_coding_system_internal);
10875   defsubr (&Sdefine_coding_system_alias);
10876   defsubr (&Scoding_system_put);
10877   defsubr (&Scoding_system_base);
10878   defsubr (&Scoding_system_plist);
10879   defsubr (&Scoding_system_aliases);
10880   defsubr (&Scoding_system_eol_type);
10881   defsubr (&Scoding_system_priority_list);
10882
10883   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10884                doc: /* List of coding systems.
10885
10886 Do not alter the value of this variable manually.  This variable should be
10887 updated by the functions `define-coding-system' and
10888 `define-coding-system-alias'.  */);
10889   Vcoding_system_list = Qnil;
10890
10891   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10892                doc: /* Alist of coding system names.
10893 Each element is one element list of coding system name.
10894 This variable is given to `completing-read' as COLLECTION argument.
10895
10896 Do not alter the value of this variable manually.  This variable should be
10897 updated by the functions `make-coding-system' and
10898 `define-coding-system-alias'.  */);
10899   Vcoding_system_alist = Qnil;
10900
10901   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10902                doc: /* List of coding-categories (symbols) ordered by priority.
10903
10904 On detecting a coding system, Emacs tries code detection algorithms
10905 associated with each coding-category one by one in this order.  When
10906 one algorithm agrees with a byte sequence of source text, the coding
10907 system bound to the corresponding coding-category is selected.
10908
10909 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10910   {
10911     int i;
10912
10913     Vcoding_category_list = Qnil;
10914     for (i = coding_category_max - 1; i >= 0; i--)
10915       Vcoding_category_list
10916         = Fcons (AREF (Vcoding_category_table, i),
10917                  Vcoding_category_list);
10918   }
10919
10920   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10921                doc: /* Specify the coding system for read operations.
10922 It is useful to bind this variable with `let', but do not set it globally.
10923 If the value is a coding system, it is used for decoding on read operation.
10924 If not, an appropriate element is used from one of the coding system alists.
10925 There are three such tables: `file-coding-system-alist',
10926 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10927   Vcoding_system_for_read = Qnil;
10928
10929   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10930                doc: /* Specify the coding system for write operations.
10931 Programs bind this variable with `let', but you should not set it globally.
10932 If the value is a coding system, it is used for encoding of output,
10933 when writing it to a file and when sending it to a file or subprocess.
10934
10935 If this does not specify a coding system, an appropriate element
10936 is used from one of the coding system alists.
10937 There are three such tables: `file-coding-system-alist',
10938 `process-coding-system-alist', and `network-coding-system-alist'.
10939 For output to files, if the above procedure does not specify a coding system,
10940 the value of `buffer-file-coding-system' is used.  */);
10941   Vcoding_system_for_write = Qnil;
10942
10943   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10944                doc: /*
10945 Coding system used in the latest file or process I/O.  */);
10946   Vlast_coding_system_used = Qnil;
10947
10948   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10949                doc: /*
10950 Error status of the last code conversion.
10951
10952 When an error was detected in the last code conversion, this variable
10953 is set to one of the following symbols.
10954   `insufficient-source'
10955   `inconsistent-eol'
10956   `invalid-source'
10957   `interrupted'
10958   `insufficient-memory'
10959 When no error was detected, the value doesn't change.  So, to check
10960 the error status of a code conversion by this variable, you must
10961 explicitly set this variable to nil before performing code
10962 conversion.  */);
10963   Vlast_code_conversion_error = Qnil;
10964
10965   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10966                doc: /*
10967 *Non-nil means always inhibit code conversion of end-of-line format.
10968 See info node `Coding Systems' and info node `Text and Binary' concerning
10969 such conversion.  */);
10970   inhibit_eol_conversion = 0;
10971
10972   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10973                doc: /*
10974 Non-nil means process buffer inherits coding system of process output.
10975 Bind it to t if the process output is to be treated as if it were a file
10976 read from some filesystem.  */);
10977   inherit_process_coding_system = 0;
10978
10979   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10980                doc: /*
10981 Alist to decide a coding system to use for a file I/O operation.
10982 The format is ((PATTERN . VAL) ...),
10983 where PATTERN is a regular expression matching a file name,
10984 VAL is a coding system, a cons of coding systems, or a function symbol.
10985 If VAL is a coding system, it is used for both decoding and encoding
10986 the file contents.
10987 If VAL is a cons of coding systems, the car part is used for decoding,
10988 and the cdr part is used for encoding.
10989 If VAL is a function symbol, the function must return a coding system
10990 or a cons of coding systems which are used as above.  The function is
10991 called with an argument that is a list of the arguments with which
10992 `find-operation-coding-system' was called.  If the function can't decide
10993 a coding system, it can return `undecided' so that the normal
10994 code-detection is performed.
10995
10996 See also the function `find-operation-coding-system'
10997 and the variable `auto-coding-alist'.  */);
10998   Vfile_coding_system_alist = Qnil;
10999
11000   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11001                doc: /*
11002 Alist to decide a coding system to use for a process I/O operation.
11003 The format is ((PATTERN . VAL) ...),
11004 where PATTERN is a regular expression matching a program name,
11005 VAL is a coding system, a cons of coding systems, or a function symbol.
11006 If VAL is a coding system, it is used for both decoding what received
11007 from the program and encoding what sent to the program.
11008 If VAL is a cons of coding systems, the car part is used for decoding,
11009 and the cdr part is used for encoding.
11010 If VAL is a function symbol, the function must return a coding system
11011 or a cons of coding systems which are used as above.
11012
11013 See also the function `find-operation-coding-system'.  */);
11014   Vprocess_coding_system_alist = Qnil;
11015
11016   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11017                doc: /*
11018 Alist to decide a coding system to use for a network I/O operation.
11019 The format is ((PATTERN . VAL) ...),
11020 where PATTERN is a regular expression matching a network service name
11021 or is a port number to connect to,
11022 VAL is a coding system, a cons of coding systems, or a function symbol.
11023 If VAL is a coding system, it is used for both decoding what received
11024 from the network stream and encoding what sent to the network stream.
11025 If VAL is a cons of coding systems, the car part is used for decoding,
11026 and the cdr part is used for encoding.
11027 If VAL is a function symbol, the function must return a coding system
11028 or a cons of coding systems which are used as above.
11029
11030 See also the function `find-operation-coding-system'.  */);
11031   Vnetwork_coding_system_alist = Qnil;
11032
11033   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11034                doc: /* Coding system to use with system messages.
11035 Also used for decoding keyboard input on X Window system.  */);
11036   Vlocale_coding_system = Qnil;
11037
11038   /* The eol mnemonics are reset in startup.el system-dependently.  */
11039   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11040                doc: /*
11041 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11042   eol_mnemonic_unix = build_pure_c_string (":");
11043
11044   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11045                doc: /*
11046 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11047   eol_mnemonic_dos = build_pure_c_string ("\\");
11048
11049   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11050                doc: /*
11051 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11052   eol_mnemonic_mac = build_pure_c_string ("/");
11053
11054   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11055                doc: /*
11056 *String displayed in mode line when end-of-line format is not yet determined.  */);
11057   eol_mnemonic_undecided = build_pure_c_string (":");
11058
11059   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11060                doc: /*
11061 *Non-nil enables character translation while encoding and decoding.  */);
11062   Venable_character_translation = Qt;
11063
11064   DEFVAR_LISP ("standard-translation-table-for-decode",
11065                Vstandard_translation_table_for_decode,
11066                doc: /* Table for translating characters while decoding.  */);
11067   Vstandard_translation_table_for_decode = Qnil;
11068
11069   DEFVAR_LISP ("standard-translation-table-for-encode",
11070                Vstandard_translation_table_for_encode,
11071                doc: /* Table for translating characters while encoding.  */);
11072   Vstandard_translation_table_for_encode = Qnil;
11073
11074   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11075                doc: /* Alist of charsets vs revision numbers.
11076 While encoding, if a charset (car part of an element) is found,
11077 designate it with the escape sequence identifying revision (cdr part
11078 of the element).  */);
11079   Vcharset_revision_table = Qnil;
11080
11081   DEFVAR_LISP ("default-process-coding-system",
11082                Vdefault_process_coding_system,
11083                doc: /* Cons of coding systems used for process I/O by default.
11084 The car part is used for decoding a process output,
11085 the cdr part is used for encoding a text to be sent to a process.  */);
11086   Vdefault_process_coding_system = Qnil;
11087
11088   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11089                doc: /*
11090 Table of extra Latin codes in the range 128..159 (inclusive).
11091 This is a vector of length 256.
11092 If Nth element is non-nil, the existence of code N in a file
11093 \(or output of subprocess) doesn't prevent it to be detected as
11094 a coding system of ISO 2022 variant which has a flag
11095 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11096 or reading output of a subprocess.
11097 Only 128th through 159th elements have a meaning.  */);
11098   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11099
11100   DEFVAR_LISP ("select-safe-coding-system-function",
11101                Vselect_safe_coding_system_function,
11102                doc: /*
11103 Function to call to select safe coding system for encoding a text.
11104
11105 If set, this function is called to force a user to select a proper
11106 coding system which can encode the text in the case that a default
11107 coding system used in each operation can't encode the text.  The
11108 function should take care that the buffer is not modified while
11109 the coding system is being selected.
11110
11111 The default value is `select-safe-coding-system' (which see).  */);
11112   Vselect_safe_coding_system_function = Qnil;
11113
11114   DEFVAR_BOOL ("coding-system-require-warning",
11115                coding_system_require_warning,
11116                doc: /* Internal use only.
11117 If non-nil, on writing a file, `select-safe-coding-system-function' is
11118 called even if `coding-system-for-write' is non-nil.  The command
11119 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11120   coding_system_require_warning = 0;
11121
11122
11123   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11124                inhibit_iso_escape_detection,
11125                doc: /*
11126 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11127
11128 When Emacs reads text, it tries to detect how the text is encoded.
11129 This code detection is sensitive to escape sequences.  If Emacs sees
11130 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11131 of the ISO2022 encodings, and decodes text by the corresponding coding
11132 system (e.g. `iso-2022-7bit').
11133
11134 However, there may be a case that you want to read escape sequences in
11135 a file as is.  In such a case, you can set this variable to non-nil.
11136 Then the code detection will ignore any escape sequences, and no text is
11137 detected as encoded in some ISO-2022 encoding.  The result is that all
11138 escape sequences become visible in a buffer.
11139
11140 The default value is nil, and it is strongly recommended not to change
11141 it.  That is because many Emacs Lisp source files that contain
11142 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11143 in Emacs's distribution, and they won't be decoded correctly on
11144 reading if you suppress escape sequence detection.
11145
11146 The other way to read escape sequences in a file without decoding is
11147 to explicitly specify some coding system that doesn't use ISO-2022
11148 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11149   inhibit_iso_escape_detection = 0;
11150
11151   DEFVAR_BOOL ("inhibit-null-byte-detection",
11152                inhibit_null_byte_detection,
11153                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11154 By default, Emacs treats it as binary data, and does not attempt to
11155 decode it.  The effect is as if you specified `no-conversion' for
11156 reading that text.
11157
11158 Set this to non-nil when a regular text happens to include null bytes.
11159 Examples are Index nodes of Info files and null-byte delimited output
11160 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11161 decode text as usual.  */);
11162   inhibit_null_byte_detection = 0;
11163
11164   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11165                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11166 Internal use only.  Removed after the experimental optimizer gets stable. */);
11167   disable_ascii_optimization = 0;
11168
11169   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11170                doc: /* Char table for translating self-inserting characters.
11171 This is applied to the result of input methods, not their input.
11172 See also `keyboard-translate-table'.
11173
11174 Use of this variable for character code unification was rendered
11175 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11176 internal character representation.  */);
11177     Vtranslation_table_for_input = Qnil;
11178
11179   {
11180     Lisp_Object args[coding_arg_undecided_max];
11181     Lisp_Object plist[16];
11182     int i;
11183
11184     for (i = 0; i < coding_arg_undecided_max; i++)
11185       args[i] = Qnil;
11186
11187     plist[0] = intern_c_string (":name");
11188     plist[1] = args[coding_arg_name] = Qno_conversion;
11189     plist[2] = intern_c_string (":mnemonic");
11190     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11191     plist[4] = intern_c_string (":coding-type");
11192     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11193     plist[6] = intern_c_string (":ascii-compatible-p");
11194     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11195     plist[8] = intern_c_string (":default-char");
11196     plist[9] = args[coding_arg_default_char] = make_number (0);
11197     plist[10] = intern_c_string (":for-unibyte");
11198     plist[11] = args[coding_arg_for_unibyte] = Qt;
11199     plist[12] = intern_c_string (":docstring");
11200     plist[13] = build_pure_c_string ("Do no conversion.\n\
11201 \n\
11202 When you visit a file with this coding, the file is read into a\n\
11203 unibyte buffer as is, thus each byte of a file is treated as a\n\
11204 character.");
11205     plist[14] = intern_c_string (":eol-type");
11206     plist[15] = args[coding_arg_eol_type] = Qunix;
11207     args[coding_arg_plist] = Flist (16, plist);
11208     Fdefine_coding_system_internal (coding_arg_max, args);
11209
11210     plist[1] = args[coding_arg_name] = Qundecided;
11211     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11212     plist[5] = args[coding_arg_coding_type] = Qundecided;
11213     /* This is already set.
11214        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11215     plist[8] = intern_c_string (":charset-list");
11216     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11217     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11218     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11219     plist[15] = args[coding_arg_eol_type] = Qnil;
11220     args[coding_arg_plist] = Flist (16, plist);
11221     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11222   }
11223
11224   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11225
11226   {
11227     int i;
11228
11229     for (i = 0; i < coding_category_max; i++)
11230       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11231   }
11232 #if defined (DOS_NT)
11233   system_eol_type = Qdos;
11234 #else
11235   system_eol_type = Qunix;
11236 #endif
11237   staticpro (&system_eol_type);
11238 }
11239
11240 char *
11241 emacs_strerror (int error_number)
11242 {
11243   char *str;
11244
11245   synchronize_system_messages_locale ();
11246   str = strerror (error_number);
11247
11248   if (! NILP (Vlocale_coding_system))
11249     {
11250       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11251                                                       Vlocale_coding_system,
11252                                                       0);
11253       str = SSDATA (dec);
11254     }
11255
11256   return str;
11257 }
11258
11259 #endif /* emacs */