src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165   /* Also declare C and SRC_BASE here; ONE_MORE_BYTE uses them.  */
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* Reject on a non-conforming byte; remember strong evidence.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully; report what was found.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216   /* C and CONSUMED_CHARS, used by ONE_MORE_BYTE, must be declared too.  */
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270   /* Stop early enough that one iteration cannot overrun DST_END.  */
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding systems emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding system to be used to encode text for terminal display when
 355    terminal coding system is nil.  NOTE(review): this spot used to
 356    announce "the following three variables" for handing coding systems
 357    between Emacs Lisp and C routines; only one is defined -- confirm.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377 /* Value of safe_charsets[CHARSET_ID], or -1 if CHARSET_ID exceeds
 378    max_charset_id or the entry is 255.  Evaluates CHARSET_ID repeatedly.  */
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386 /* Accessors for the members of CODING->spec.iso_2022.  */
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415 /* Every 1-byte code of ISO2022 is classified into one of the
 416    following classes.  NOTE(review): 0x7F is described under both
 417    ISO_control_0 and ISO_0x20_or_0x7F below -- confirm which applies.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 4 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overrides
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487 /* NOTE(review): the next four flags are not described here -- confirm.  */
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495 /* Bits 0x20000..0x80000 are not assigned in this list.  */
 496 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 497
 498 /* A character to be produced on output if encoding of the original
 499    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 501
 502 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 503 #define CODING_UTF_8_BOM(coding)        \
 504   ((coding)->spec.utf_8_bom)
 505
 506 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 507 #define CODING_UTF_16_BOM(coding)       \
 508   ((coding)->spec.utf_16.bom)
 509
 510 #define CODING_UTF_16_ENDIAN(coding)    \
 511   ((coding)->spec.utf_16.endian)
 512
 513 #define CODING_UTF_16_SURROGATE(coding) \
 514   ((coding)->spec.utf_16.surrogate)
 515
 516
 517 /* CCL section: each macro reads one coding_attr_ccl_* slot from the attributes of CODING's id.  */
 518 #define CODING_CCL_DECODER(coding)      \
 519   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 520 #define CODING_CCL_ENCODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 522 #define CODING_CCL_VALIDS(coding)                                          \
 523   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 524
 525 /* Index for each coding category in `coding_categories'.  The
 526    CATEGORY_MASK_XXX flags are formed as 1 << these values.  */
 527 enum coding_category
 528   {
 529     coding_category_iso_7,
 530     coding_category_iso_7_tight,
 531     coding_category_iso_8_1,
 532     coding_category_iso_8_2,
 533     coding_category_iso_7_else,
 534     coding_category_iso_8_else,
 535     coding_category_utf_8_auto,
 536     coding_category_utf_8_nosig,
 537     coding_category_utf_8_sig,
 538     coding_category_utf_16_auto,
 539     coding_category_utf_16_be,
 540     coding_category_utf_16_le,
 541     coding_category_utf_16_be_nosig,
 542     coding_category_utf_16_le_nosig,
 543     coding_category_charset,
 544     coding_category_sjis,
 545     coding_category_big5,
 546     coding_category_ccl,
 547     coding_category_emacs_mule,
 548     /* All above are targets of code detection.  */
 549     coding_category_raw_text,
 550     coding_category_undecided,
 551     coding_category_max         /* Number of categories; used as an array size.  */
 552   };
 553
 554 /* Definitions of flag bits used in detect_coding_XXXX.  */
 555 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 556 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 557 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 558 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 559 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 560 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 561 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 562 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 563 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 564 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 565 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 566 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 567 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 568 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 569 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 570 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 571 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 572 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 573 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 574 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 575 /* Union of the masks of all detection targets (raw-text excluded).
 576    This value is returned if detect_coding_mask () finds nothing other
 577    than ASCII characters.  */
 578 #define CATEGORY_MASK_ANY               \
 579   (CATEGORY_MASK_ISO_7                  \
 580    | CATEGORY_MASK_ISO_7_TIGHT          \
 581    | CATEGORY_MASK_ISO_8_1              \
 582    | CATEGORY_MASK_ISO_8_2              \
 583    | CATEGORY_MASK_ISO_7_ELSE           \
 584    | CATEGORY_MASK_ISO_8_ELSE           \
 585    | CATEGORY_MASK_UTF_8_AUTO           \
 586    | CATEGORY_MASK_UTF_8_NOSIG          \
 587    | CATEGORY_MASK_UTF_8_SIG            \
 588    | CATEGORY_MASK_UTF_16_AUTO          \
 589    | CATEGORY_MASK_UTF_16_BE            \
 590    | CATEGORY_MASK_UTF_16_LE            \
 591    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 592    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 593    | CATEGORY_MASK_CHARSET              \
 594    | CATEGORY_MASK_SJIS                 \
 595    | CATEGORY_MASK_BIG5                 \
 596    | CATEGORY_MASK_CCL                  \
 597    | CATEGORY_MASK_EMACS_MULE)
 598
 599 /* Unions of related category masks.  */
 600 #define CATEGORY_MASK_ISO_7BIT \
 601   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 602
 603 #define CATEGORY_MASK_ISO_8BIT \
 604   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 605
 606 #define CATEGORY_MASK_ISO_ELSE \
 607   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 608
 609 #define CATEGORY_MASK_ISO_ESCAPE        \
 610   (CATEGORY_MASK_ISO_7                  \
 611    | CATEGORY_MASK_ISO_7_TIGHT          \
 612    | CATEGORY_MASK_ISO_7_ELSE           \
 613    | CATEGORY_MASK_ISO_8_ELSE)
 614
 615 #define CATEGORY_MASK_ISO       \
 616   (  CATEGORY_MASK_ISO_7BIT     \
 617      | CATEGORY_MASK_ISO_8BIT   \
 618      | CATEGORY_MASK_ISO_ELSE)
 619
 620 #define CATEGORY_MASK_UTF_16            \
 621   (CATEGORY_MASK_UTF_16_AUTO            \
 622    | CATEGORY_MASK_UTF_16_BE            \
 623    | CATEGORY_MASK_UTF_16_LE            \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 625    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 626
 627 #define CATEGORY_MASK_UTF_8     \
 628   (CATEGORY_MASK_UTF_8_AUTO     \
 629    | CATEGORY_MASK_UTF_8_NOSIG  \
 630    | CATEGORY_MASK_UTF_8_SIG)
 631
 632 /* Table of coding categories (Lisp symbols).  This variable is for
 633    internal use only.  */
 634 static Lisp_Object Vcoding_category_table;
 635
 636 /* Table of coding categories ordered by priority; it has
     coding_category_max elements.  */
 637 static enum coding_category coding_priorities[coding_category_max];
 638
 639 /* Nth element is a coding context for the coding system bound to the
 640    Nth coding category (indexed by enum coding_category).  */
 641 static struct coding_system coding_categories[coding_category_max];
 642
 643 /*** Commonly used macros and functions ***/
 644 /* Fallback min/max; the selected argument is evaluated twice.  */
 645 #ifndef min
 646 #define min(a, b) ((a) < (b) ? (a) : (b))
 647 #endif
 648 #ifndef max
 649 #define max(a, b) ((a) > (b) ? (a) : (b))
 650 #endif
 651 /* Fetch CODING's attributes into ATTRS and their charset list into CHARSET_LIST.  */
 652 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 653   do {                                                  \
 654     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 655     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 656   } while (0)
 657
 658
 659 /* Safely get one byte from the source text pointed by SRC which ends
 660    at SRC_END, and set C to that byte.  If the source is exhausted,
 661    jump to 'no_more_source', first recording INSUFFICIENT_SRC when
 662    SRC_BASE < SRC.  If MULTIBYTEP: a two-byte sequence led by 0xC0 or
 663    0xC1 is folded into one byte value (its second byte is read without
 664    an SRC_END check); any other multibyte character sets C to the
 665    negative character code and records INVALID_SRC.  Caller provides:
 666         src, src_end, src_base, multibytep, coding, consumed_chars */
 667 #define ONE_MORE_BYTE(c)                                \
 668   do {                                                  \
 669     if (src == src_end)                                 \
 670       {                                                 \
 671         if (src_base < src)                             \
 672           record_conversion_result                      \
 673             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 674         goto no_more_source;                            \
 675       }                                                 \
 676     c = *src++;                                         \
 677     if (multibytep && (c & 0x80))                       \
 678       {                                                 \
 679         if ((c & 0xFE) == 0xC0)                         \
 680           c = ((c & 1) << 6) | *src++;                  \
 681         else                                            \
 682           {                                             \
 683             src--;                                      \
 684             c = - string_char (src, &src, NULL);        \
 685             record_conversion_result                    \
 686               (coding, CODING_RESULT_INVALID_SRC);      \
 687           }                                             \
 688       }                                                 \
 689     consumed_chars++;                                   \
 690   } while (0)
 691
 692 /* Safely get two bytes from the source text pointed by SRC which ends
 693    at SRC_END, and set C1 and C2 to those bytes while skipping the
 694    heading multibyte characters.  If there are not enough bytes in the
 695    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 696    a multibyte character is found for C2, set C2 to -1 (SRC is not
 697    advanced past its trailing bytes).  Unlike ONE_MORE_BYTE, nothing is
 698    recorded in CODING and CONSUMED_CHARS is untouched.  The caller
 699    should declare and set these variables appropriately in advance:
 700         src, src_end, multibytep
 701    It is intended that this macro is used in detect_coding_utf_16.  */
 702 #define TWO_MORE_BYTES(c1, c2)                          \
 703   do {                                                  \
 704     do {                                                \
 705       if (src == src_end)                               \
 706         goto no_more_source;                            \
 707       c1 = *src++;                                      \
 708       if (multibytep && (c1 & 0x80))                    \
 709         {                                               \
 710           if ((c1 & 0xFE) == 0xC0)                      \
 711             c1 = ((c1 & 1) << 6) | *src++;              \
 712           else                                          \
 713             {                                           \
 714               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 715               c1 = -1;                                  \
 716             }                                           \
 717         }                                               \
 718     } while (c1 < 0);                                   \
 719     if (src == src_end)                                 \
 720       goto no_more_source;                              \
 721     c2 = *src++;                                        \
 722     if (multibytep && (c2 & 0x80))                      \
 723       {                                                 \
 724         if ((c2 & 0xFE) == 0xC0)                        \
 725           c2 = ((c2 & 1) << 6) | *src++;                \
 726         else                                            \
 727           c2 = -1;                                      \
 728       }                                                 \
 729   } while (0)
 730
 731
 732 /* Store a byte C in the place pointed by DST and increment DST to the
 733    next free point, and increment PRODUCED_CHARS.  The caller should
 734    assure that C is 0..127, and declare and set the variables `dst'
 735    and `produced_chars' appropriately in advance.  No multibyte
 736    conversion and no bounds check is done here.  */
 737
 738
 739 #define EMIT_ONE_ASCII_BYTE(c)  \
 740   do {                          \
 741     produced_chars++;           \
 742     *dst++ = (c);               \
 743   } while (0)
 744
 745
 746 /* Like EMIT_ONE_ASCII_BYTE but store two bytes, C1 then C2, and add 2
 747    to PRODUCED_CHARS.  */
 748 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 749   do {                                  \
 750     produced_chars += 2;                \
 751     *dst++ = (c1), *dst++ = (c2);       \
 752   } while (0)
 753
 754
 755 /* Store a byte C in the place pointed by DST and increment DST to the
 756    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 757    store in multibyte form via CHAR_STRING_ADVANCE, first mapping a
 758    value >= 0x80 through BYTE8_TO_CHAR.  The caller should declare and
 759    set the variables `dst', `multibytep' and `produced_chars'
 760    appropriately in advance.  */
 761 #define EMIT_ONE_BYTE(c)                \
 762   do {                                  \
 763     produced_chars++;                   \
 764     if (multibytep)                     \
 765       {                                 \
 766         unsigned ch = (c);              \
 767         if (ch >= 0x80)                 \
 768           ch = BYTE8_TO_CHAR (ch);      \
 769         CHAR_STRING_ADVANCE (ch, dst);  \
 770       }                                 \
 771     else                                \
 772       *dst++ = (c);                     \
 773   } while (0)
 774
 775
 776 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 777
 778 #define EMIT_TWO_BYTES(c1, c2)          \
 779   do {                                  \
 780     produced_chars += 2;                \
 781     if (multibytep)                     \
 782       {                                 \
 783         unsigned ch;                    \
 784                                         \
 785         ch = (c1);                      \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789         ch = (c2);                      \
 790         if (ch >= 0x80)                 \
 791           ch = BYTE8_TO_CHAR (ch);      \
 792         CHAR_STRING_ADVANCE (ch, dst);  \
 793       }                                 \
 794     else                                \
 795       {                                 \
 796         *dst++ = (c1);                  \
 797         *dst++ = (c2);                  \
 798       }                                 \
 799   } while (0)
 800
 801
 802 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 803   do {                                  \
 804     EMIT_ONE_BYTE (c1);                 \
 805     EMIT_TWO_BYTES (c2, c3);            \
 806   } while (0)
 807
 808
 809 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 810   do {                                          \
 811     EMIT_TWO_BYTES (c1, c2);                    \
 812     EMIT_TWO_BYTES (c3, c4);                    \
 813   } while (0)
 814
 815
 816 static void
 817 record_conversion_result (struct coding_system *coding,
 818                           enum coding_result_code result)
 819 {
 820   coding->result = result;
 821   switch (result)
 822     {
 823     case CODING_RESULT_INSUFFICIENT_SRC:
 824       Vlast_code_conversion_error = Qinsufficient_source;
 825       break;
 826     case CODING_RESULT_INVALID_SRC:
 827       Vlast_code_conversion_error = Qinvalid_source;
 828       break;
 829     case CODING_RESULT_INTERRUPT:
 830       Vlast_code_conversion_error = Qinterrupted;
 831       break;
 832     case CODING_RESULT_INSUFFICIENT_DST:
 833       /* Don't record this error in Vlast_code_conversion_error
 834          because it happens just temporarily and is resolved when the
 835          whole conversion is finished.  */
 836       break;
 837     case CODING_RESULT_SUCCESS:
 838       break;
 839     default:
 840       Vlast_code_conversion_error = intern ("Unknown error");
 841     }
 842 }
 843
 844 /* These wrapper macros are used to preserve validity of pointers into
 845    buffer text across calls to decode_char, encode_char, etc, which
 846    could cause relocation of buffers if it loads a charset map,
 847    because loading a charset map allocates large structures.  */
 848
 849 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 850   do {                                                                       \
 851     ptrdiff_t offset;                                                        \
 852                                                                              \
 853     charset_map_loaded = 0;                                                  \
 854     c = DECODE_CHAR (charset, code);                                         \
 855     if (charset_map_loaded                                                   \
 856         && (offset = coding_change_source (coding)))                         \
 857       {                                                                      \
 858         src += offset;                                                       \
 859         src_base += offset;                                                  \
 860         src_end += offset;                                                   \
 861       }                                                                      \
 862   } while (0)
 863
 864 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 865   do {                                                                  \
 866     ptrdiff_t offset;                                                   \
 867                                                                         \
 868     charset_map_loaded = 0;                                             \
 869     code = ENCODE_CHAR (charset, c);                                    \
 870     if (charset_map_loaded                                              \
 871         && (offset = coding_change_destination (coding)))               \
 872       {                                                                 \
 873         dst += offset;                                                  \
 874         dst_end += offset;                                              \
 875       }                                                                 \
 876   } while (0)
 877
 878 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 879   do {                                                                  \
 880     ptrdiff_t offset;                                                   \
 881                                                                         \
 882     charset_map_loaded = 0;                                             \
 883     charset = char_charset (c, charset_list, code_return);              \
 884     if (charset_map_loaded                                              \
 885         && (offset = coding_change_destination (coding)))               \
 886       {                                                                 \
 887         dst += offset;                                                  \
 888         dst_end += offset;                                              \
 889       }                                                                 \
 890   } while (0)
 891
 892 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 893   do {                                                                  \
 894     ptrdiff_t offset;                                                   \
 895                                                                         \
 896     charset_map_loaded = 0;                                             \
 897     result = CHAR_CHARSET_P (c, charset);                               \
 898     if (charset_map_loaded                                              \
 899         && (offset = coding_change_destination (coding)))               \
 900       {                                                                 \
 901         dst += offset;                                                  \
 902         dst_end += offset;                                              \
 903       }                                                                 \
 904   } while (0)
 905
 906
 907 /* If there are at least BYTES length of room at dst, allocate memory
 908    for coding->destination and update dst and dst_end.  We don't have
 909    to take care of coding->source which will be relocated.  It is
 910    handled by calling coding_set_source in encode_coding.  */
 911
 912 #define ASSURE_DESTINATION(bytes)                               \
 913   do {                                                          \
 914     if (dst + (bytes) >= dst_end)                               \
 915       {                                                         \
 916         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 917                                                                 \
 918         dst = alloc_destination (coding, more_bytes, dst);      \
 919         dst_end = coding->destination + coding->dst_bytes;      \
 920       }                                                         \
 921   } while (0)
 922
 923
 924 /* Store multibyte form of the character C in P, and advance P to the
 925    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 926    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 927    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 928
 929 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 930
 931 /* Return the character code of character whose multibyte form is at
 932    P, and advance P to the end of the multibyte form.  This used to be
 933    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 934    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 935
 936 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 937
 938 /* Set coding->source from coding->src_object.  */
 939
 940 static void
 941 coding_set_source (struct coding_system *coding)
 942 {
 943   if (BUFFERP (coding->src_object))
 944     {
 945       struct buffer *buf = XBUFFER (coding->src_object);
 946
 947       if (coding->src_pos < 0)
 948         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 949       else
 950         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 951     }
 952   else if (STRINGP (coding->src_object))
 953     {
 954       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 955     }
 956   else
 957     {
 958       /* Otherwise, the source is C string and is never relocated
 959          automatically.  Thus we don't have to update anything.  */
 960     }
 961 }
 962
 963
 964 /* Set coding->source from coding->src_object, and return how many
 965    bytes coding->source was changed.  */
 966
 967 static ptrdiff_t
 968 coding_change_source (struct coding_system *coding)
 969 {
 970   const unsigned char *orig = coding->source;
 971   coding_set_source (coding);
 972   return coding->source - orig;
 973 }
 974
 975
 976 /* Set coding->destination from coding->dst_object.  */
 977
 978 static void
 979 coding_set_destination (struct coding_system *coding)
 980 {
 981   if (BUFFERP (coding->dst_object))
 982     {
 983       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 984         {
 985           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 986           coding->dst_bytes = (GAP_END_ADDR
 987                                - (coding->src_bytes - coding->consumed)
 988                                - coding->destination);
 989         }
 990       else
 991         {
 992           /* We are sure that coding->dst_pos_byte is before the gap
 993              of the buffer. */
 994           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 995                                  + coding->dst_pos_byte - BEG_BYTE);
 996           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 997                                - coding->destination);
 998         }
 999     }
1000   else
1001     {
1002       /* Otherwise, the destination is C string and is never relocated
1003          automatically.  Thus we don't have to update anything.  */
1004     }
1005 }
1006
1007
1008 /* Set coding->destination from coding->dst_object, and return how
1009    many bytes coding->destination was changed.  */
1010
1011 static ptrdiff_t
1012 coding_change_destination (struct coding_system *coding)
1013 {
1014   const unsigned char *orig = coding->destination;
1015   coding_set_destination (coding);
1016   return coding->destination - orig;
1017 }
1018
1019
1020 static void
1021 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1022 {
1023   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1024     string_overflow ();
1025   coding->destination = xrealloc (coding->destination,
1026                                   coding->dst_bytes + bytes);
1027   coding->dst_bytes += bytes;
1028 }
1029
1030 static void
1031 coding_alloc_by_making_gap (struct coding_system *coding,
1032                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1033 {
1034   if (EQ (coding->src_object, coding->dst_object))
1035     {
1036       /* The gap may contain the produced data at the head and not-yet
1037          consumed data at the tail.  To preserve those data, we at
1038          first make the gap size to zero, then increase the gap
1039          size.  */
1040       ptrdiff_t add = GAP_SIZE;
1041
1042       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1043       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1044       make_gap (bytes);
1045       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1046       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1047     }
1048   else
1049     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1050 }
1051
1052
1053 static unsigned char *
1054 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1055                    unsigned char *dst)
1056 {
1057   ptrdiff_t offset = dst - coding->destination;
1058
1059   if (BUFFERP (coding->dst_object))
1060     {
1061       struct buffer *buf = XBUFFER (coding->dst_object);
1062
1063       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1064     }
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   coding_set_destination (coding);
1068   dst = coding->destination + offset;
1069   return dst;
1070 }
1071
1072 /** Macros for annotations.  */
1073
1074 /* An annotation data is stored in the array coding->charbuf in this
1075    format:
1076      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1077    LENGTH is the number of elements in the annotation.
1078    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1079    NCHARS is the number of characters in the text annotated.
1080
 1081    The format of the following elements depends on ANNOTATION_MASK.
1082
 1083    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
 1084    follow:
1085      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1086
1087    NBYTES is the number of bytes specified in the header part of
1088    old-style emacs-mule encoding, or 0 for the other kind of
1089    composition.
1090
1091    METHOD is one of enum composition_method.
1092
1093    Optional COMPOSITION-COMPONENTS are characters and composition
1094    rules.
1095
1096    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1097    follows.
1098
1099    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1100    recover from an invalid annotation, and should be skipped by
1101    produce_annotation.  */
1102
/* Maximum length of the header of annotation data.  The longest
   header written by the macros below is the one for a composition
   (ADD_COMPOSITION_DATA): -LENGTH, ANNOTATION_MASK, NCHARS, NBYTES
   and METHOD, i.e. five elements.  */
#define MAX_ANNOTATION_LENGTH 5
1105
/* Store the three common leading elements of an annotation at BUF,
   advancing BUF past them: the negated length LEN, the annotation
   kind MASK, and NCHARS, the number of characters annotated.  Also
   set coding->annotated, so a variable `coding' must be in scope.

   The definition deliberately does not end with a semicolon; the
   caller supplies it, so that the macro behaves as a single statement
   (for instance, directly before an `else').  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1113
/* Store the header of a composition annotation at BUF, advancing BUF
   past it: the common part written by ADD_ANNOTATION_DATA (with
   length 5 and CODING_ANNOTATE_COMPOSITION_MASK), followed by NBYTES
   and METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1120
1121
/* Store a charset annotation at BUF, advancing BUF past it: the
   common part written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK), followed by the charset ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1127
1128 \f
1129 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1130
1131
1132
1133 \f
1134 /*** 3. UTF-8 ***/
1135
1136 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1137    Return true if a text is encoded in UTF-8.  */
1138
/* Classify the octet C by its high bits.  */

/* C is below 0x80, i.e. bit pattern 0xxxxxxx.  Note that the plain
   comparison is also true for any negative C.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the bit pattern 10xxxxxx (a trailing octet).  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the bit pattern 110xxxxx.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the bit pattern 1110xxxx.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the bit pattern 11110xxx.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the bit pattern 111110xx.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets EF BB BF, which are the UTF-8 encoding of U+FEFF
   and are looked for as a byte order mark (signature) at the very
   beginning of the text.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1149
/* Check whether the source bytes of CODING look like UTF-8 and record
   the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
   function returns 0, with CATEGORY_MASK_UTF_8 added to
   DETECT_INFO->rejected, as soon as a multi-octet sequence is
   malformed, or when the last block ends in the middle of a sequence.
   Otherwise it returns 1 after updating DETECT_INFO->found and
   DETECT_INFO->rejected for the with-signature and without-signature
   categories.  */
static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  /* Not read in this function; presumably maintained by
     ONE_MORE_BYTE, whose definition is not visible here.  */
  ptrdiff_t consumed_chars = 0;
  /* Set when the text starts with the octets EF BB BF.  */
  bool bom_found = 0;
  /* Set when at least one complete multi-octet sequence was seen.  */
  bool found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Scan one sequence per iteration.  The loop is left either by
     `break' (malformed sequence) or by a jump to no_more_source,
     which presumably comes from ONE_MORE_BYTE when the source is
     exhausted.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that a truncated
         sequence can be recognized at no_more_source.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence at the very beginning of the source can
             be a signature.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* C is not a valid leading octet of any supported length.  */
      break;
    }
  /* Reached only through `break': the text is not UTF-8.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ended.  If that happened in the middle of a sequence
     and no more data will follow, the text is not UTF-8.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (octets EF BB BF) doesn't necessarily mean
         a BOM, so both the with-signature and the without-signature
         categories are reported as found.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1232
1233
1234 static void
1235 decode_coding_utf_8 (struct coding_system *coding)
1236 {
1237   const unsigned char *src = coding->source + coding->consumed;
1238   const unsigned char *src_end = coding->source + coding->src_bytes;
1239   const unsigned char *src_base;
1240   int *charbuf = coding->charbuf + coding->charbuf_used;
1241   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1242   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1243   bool multibytep = coding->src_multibyte;
1244   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1245   bool eol_dos
1246     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1247   int byte_after_cr = -1;
1248
1249   if (bom != utf_without_bom)
1250     {
1251       int c1, c2, c3;
1252
1253       src_base = src;
1254       ONE_MORE_BYTE (c1);
1255       if (! UTF_8_3_OCTET_LEADING_P (c1))
1256         src = src_base;
1257       else
1258         {
1259           ONE_MORE_BYTE (c2);
1260           if (! UTF_8_EXTRA_OCTET_P (c2))
1261             src = src_base;
1262           else
1263             {
1264               ONE_MORE_BYTE (c3);
1265               if (! UTF_8_EXTRA_OCTET_P (c3))
1266                 src = src_base;
1267               else
1268                 {
1269                   if ((c1 != UTF_8_BOM_1)
1270                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1271                     src = src_base;
1272                   else
1273                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1274                 }
1275             }
1276         }
1277     }
1278   CODING_UTF_8_BOM (coding) = utf_without_bom;
1279
1280   while (1)
1281     {
1282       int c, c1, c2, c3, c4, c5;
1283
1284       src_base = src;
1285       consumed_chars_base = consumed_chars;
1286
1287       if (charbuf >= charbuf_end)
1288         {
1289           if (byte_after_cr >= 0)
1290             src_base--;
1291           break;
1292         }
1293
1294       if (byte_after_cr >= 0)
1295         c1 = byte_after_cr, byte_after_cr = -1;
1296       else
1297         ONE_MORE_BYTE (c1);
1298       if (c1 < 0)
1299         {
1300           c = - c1;
1301         }
1302       else if (UTF_8_1_OCTET_P (c1))
1303         {
1304           if (eol_dos && c1 == '\r')
1305             ONE_MORE_BYTE (byte_after_cr);
1306           c = c1;
1307         }
1308       else
1309         {
1310           ONE_MORE_BYTE (c2);
1311           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1312             goto invalid_code;
1313           if (UTF_8_2_OCTET_LEADING_P (c1))
1314             {
1315               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1316               /* Reject overlong sequences here and below.  Encoders
1317                  producing them are incorrect, they can be misleading,
1318                  and they mess up read/write invariance.  */
1319               if (c < 128)
1320                 goto invalid_code;
1321             }
1322           else
1323             {
1324               ONE_MORE_BYTE (c3);
1325               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1326                 goto invalid_code;
1327               if (UTF_8_3_OCTET_LEADING_P (c1))
1328                 {
1329                   c = (((c1 & 0xF) << 12)
1330                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1331                   if (c < 0x800
1332                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1333                     goto invalid_code;
1334                 }
1335               else
1336                 {
1337                   ONE_MORE_BYTE (c4);
1338                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1339                     goto invalid_code;
1340                   if (UTF_8_4_OCTET_LEADING_P (c1))
1341                     {
1342                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1343                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1344                     if (c < 0x10000)
1345                       goto invalid_code;
1346                     }
1347                   else
1348                     {
1349                       ONE_MORE_BYTE (c5);
1350                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1351                         goto invalid_code;
1352                       if (UTF_8_5_OCTET_LEADING_P (c1))
1353                         {
1354                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1355                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1356                                | (c5 & 0x3F));
1357                           if ((c > MAX_CHAR) || (c < 0x200000))
1358                             goto invalid_code;
1359                         }
1360                       else
1361                         goto invalid_code;
1362                     }
1363                 }
1364             }
1365         }
1366
1367       *charbuf++ = c;
1368       continue;
1369
1370     invalid_code:
1371       src = src_base;
1372       consumed_chars = consumed_chars_base;
1373       ONE_MORE_BYTE (c);
1374       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1375       coding->errors++;
1376     }
1377
1378  no_more_source:
1379   coding->consumed_char += consumed_chars_base;
1380   coding->consumed = src_base - coding->source;
1381   coding->charbuf_used = charbuf - coding->charbuf;
1382 }
1383
1384
1385 static bool
1386 encode_coding_utf_8 (struct coding_system *coding)
1387 {
1388   bool multibytep = coding->dst_multibyte;
1389   int *charbuf = coding->charbuf;
1390   int *charbuf_end = charbuf + coding->charbuf_used;
1391   unsigned char *dst = coding->destination + coding->produced;
1392   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1393   ptrdiff_t produced_chars = 0;
1394   int c;
1395
1396   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1397     {
1398       ASSURE_DESTINATION (3);
1399       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1400       CODING_UTF_8_BOM (coding) = utf_without_bom;
1401     }
1402
1403   if (multibytep)
1404     {
1405       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1406
1407       while (charbuf < charbuf_end)
1408         {
1409           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1410
1411           ASSURE_DESTINATION (safe_room);
1412           c = *charbuf++;
1413           if (CHAR_BYTE8_P (c))
1414             {
1415               c = CHAR_TO_BYTE8 (c);
1416               EMIT_ONE_BYTE (c);
1417             }
1418           else
1419             {
1420               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1421               for (p = str; p < pend; p++)
1422                 EMIT_ONE_BYTE (*p);
1423             }
1424         }
1425     }
1426   else
1427     {
1428       int safe_room = MAX_MULTIBYTE_LENGTH;
1429
1430       while (charbuf < charbuf_end)
1431         {
1432           ASSURE_DESTINATION (safe_room);
1433           c = *charbuf++;
1434           if (CHAR_BYTE8_P (c))
1435             *dst++ = CHAR_TO_BYTE8 (c);
1436           else
1437             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1438           produced_chars++;
1439         }
1440     }
1441   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1442   coding->produced_char += produced_chars;
1443   coding->produced = dst - coding->destination;
1444   return 0;
1445 }
1446
1447
1448 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1449    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1450
/* True if the 16-bit code unit VAL is in the range 0xD800..0xDBFF,
   i.e. its top six bits are 110110.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the 16-bit code unit VAL is in the range 0xDC00..0xDFFF,
   i.e. its top six bits are 110111.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1456
1457
/* Check whether the source bytes of CODING may be UTF-16 and record
   the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.  If
   the text starts with FF FE or FE FF, the matching little or big
   endian category and the auto category are reported as found, the
   other categories are rejected, and 1 is returned.  Without such a
   signature the signature-based categories are rejected and a
   byte-dispersion heuristic decides about the two signature-less
   categories; 1 is returned only if the source runs out before the
   heuristic has rejected everything (TWO_MORE_BYTES jumps to
   no_more_source in that case), otherwise 0.  */
static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A complete text with an odd coding->src_chars cannot consist of
     two-byte units.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little endian signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big endian signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      /* TWO_MORE_BYTES sets its second argument to -1 when the second
         byte is not a plain byte of a multibyte source.  */
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of the bytes at even positions (E)
         and of the bytes at odd positions (O).  E[B] (resp. O[B]) is
         nonzero once byte value B has been seen at an even
         (resp. odd) position, and E_NUM (resp. O_NUM) counts the
         distinct values seen.  When 128 or more distinct values show
         up at the even positions the big endian category is rejected,
         and likewise the little endian one for the odd positions; if
         both are high, we assume binary data.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No signature was found, so the categories that require one
         are out.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Go on until every UTF-16 category is rejected or the source
         runs out.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1540
/* Decode UTF-16 text from CODING->source, beginning at offset
   CODING->consumed, and append the resulting characters to
   CODING->charbuf after its first CODING->charbuf_used elements.
   A BOM is checked first when the coding system is marked
   utf_with_bom.  A high surrogate that still waits for its partner is
   kept in CODING_UTF_16_SURROGATE (coding), so it survives until the
   next call.  Before returning, CODING->consumed_char,
   CODING->consumed and CODING->charbuf_used are updated; consumption
   is recorded only up to SRC_BASE, the start of the current loop
   iteration.  */
1541 static void
1542 decode_coding_utf_16 (struct coding_system *coding)
1543 {
1544   const unsigned char *src = coding->source + coding->consumed;
1545   const unsigned char *src_end = coding->source + coding->src_bytes;
1546   const unsigned char *src_base;
1547   int *charbuf = coding->charbuf + coding->charbuf_used;
1548   /* We may produce at most 3 chars in one loop.  */
1549   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1550   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1551   bool multibytep = coding->src_multibyte;
1552   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1553   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1554   int surrogate = CODING_UTF_16_SURROGATE (coding);
1555   bool eol_dos
1556     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1557   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1558
1559   if (bom == utf_with_bom)
1560     {
1561       int c, c1, c2;
1562
1563       src_base = src;
1564       ONE_MORE_BYTE (c1);
1565       ONE_MORE_BYTE (c2);
1566       c = (c1 << 8) | c2;
1567
1568       if (endian == utf_16_big_endian
1569           ? c != 0xFEFF : c != 0xFFFE)
1570         {
1571           /* The first two bytes are not BOM.  Treat them as bytes
1572              for a normal character.  */
1573           src = src_base;
1574           coding->errors++;
1575         }
1576       CODING_UTF_16_BOM (coding) = utf_without_bom;
1577     }
1578   else if (bom == utf_detect_bom)
1579     {
1580       /* We have already tried to detect BOM and failed in
1581          detect_coding.  */
1582       CODING_UTF_16_BOM (coding) = utf_without_bom;
1583     }
1584
1585   while (1)
1586     {
1587       int c, c1, c2;
1588
1589       src_base = src;
1590       consumed_chars_base = consumed_chars;
1591
1592       if (charbuf >= charbuf_end)
1593         {
          /* No room left.  If two bytes were read ahead after a CR,
             report them as not yet consumed.  */
1594           if (byte_after_cr1 >= 0)
1595             src_base -= 2;
1596           break;
1597         }
1598
1599       if (byte_after_cr1 >= 0)
1600         c1 = byte_after_cr1, byte_after_cr1 = -1;
1601       else
1602         ONE_MORE_BYTE (c1);
1603       if (c1 < 0)
1604         {
          /* A negative value is stored negated as one character of
             its own; it is not combined into a 16-bit code unit.  */
1605           *charbuf++ = -c1;
1606           continue;
1607         }
1608       if (byte_after_cr2 >= 0)
1609         c2 = byte_after_cr2, byte_after_cr2 = -1;
1610       else
1611         ONE_MORE_BYTE (c2);
1612       if (c2 < 0)
1613         {
1614           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1615           *charbuf++ = -c2;
1616           continue;
1617         }
1618       c = (endian == utf_16_big_endian
1619            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1620
1621       if (surrogate)
1622         {
1623           if (! UTF_16_LOW_SURROGATE_P (c))
1624             {
              /* The pending high surrogate has no partner.  Flush it
                 as its two source bytes and count an error.  */
1625               if (endian == utf_16_big_endian)
1626                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1627               else
1628                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1629               *charbuf++ = c1;
1630               *charbuf++ = c2;
1631               coding->errors++;
              /* Keep C pending only if it is itself a high surrogate.
                 Otherwise clear the state; leaving the flushed
                 surrogate in place would make every later code unit
                 look as if it followed that stale surrogate.  */
1632               CODING_UTF_16_SURROGATE (coding) = surrogate
1633                 = UTF_16_HIGH_SURROGATE_P (c) ? c : 0;
1634               if (! surrogate)
1635                 *charbuf++ = c;
1636             }
1637           else
1638             {
1639               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1640               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1641               *charbuf++ = 0x10000 + c;
1642             }
1643         }
1644       else
1645         {
1646           if (UTF_16_HIGH_SURROGATE_P (c))
1647             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1648           else
1649             {
1650               if (eol_dos && c == '\r')
1651                 {
                  /* Fetch the two bytes after CR now; the next
                     iteration takes them from BYTE_AFTER_CR1/2.  */
1652                   ONE_MORE_BYTE (byte_after_cr1);
1653                   ONE_MORE_BYTE (byte_after_cr2);
1654                 }
1655               *charbuf++ = c;
1656             }
1657         }
1658     }
1659
1660  no_more_source:
1661   coding->consumed_char += consumed_chars_base;
1662   coding->consumed = src_base - coding->source;
1663   coding->charbuf_used = charbuf - coding->charbuf;
1664 }
1665
1666 /* Encode the characters in CODING->charbuf as UTF-16, appending the
1667    bytes at CODING->destination.  A BOM is written first unless the
1668    coding system is marked utf_without_bom.  Return 0.  */
1669 static bool
1670 encode_coding_utf_16 (struct coding_system *coding)
1671 {
1672   /* Some locals are never mentioned again below; presumably the
1673      ASSURE_DESTINATION and EMIT_* macros pick them up by name.  */
1674   unsigned char *dst = coding->destination + coding->produced;
1675   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1676   int *charbuf = coding->charbuf;
1677   int *charbuf_end = charbuf + coding->charbuf_used;
1678   bool multibytep = coding->dst_multibyte;
1679   ptrdiff_t produced_chars = 0;
1680   bool little_endian = CODING_UTF_16_ENDIAN (coding) != utf_16_big_endian;
1681   int safe_room = 8;
1682
1683   if (CODING_UTF_16_BOM (coding) != utf_without_bom)
1684     {
1685       ASSURE_DESTINATION (safe_room);
1686       if (little_endian)
1687         EMIT_TWO_BYTES (0xFF, 0xFE);
1688       else
1689         EMIT_TWO_BYTES (0xFE, 0xFF);
1690       CODING_UTF_16_BOM (coding) = utf_without_bom;
1691     }
1692
1693   while (charbuf < charbuf_end)
1694     {
1695       int c, c1, c2;
1696
1697       ASSURE_DESTINATION (safe_room);
1698       c = *charbuf++;
1699       if (c > MAX_UNICODE_CHAR)
1700         c = coding->default_char;
1701
1702       if (c >= 0x10000)
1703         {
1704           /* Beyond the BMP: emit a high/low surrogate pair.  */
1705           c1 = 0xD800 + ((c - 0x10000) >> 10);
1706           c2 = 0xDC00 + ((c - 0x10000) & 0x3FF);
1707           if (little_endian)
1708             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1709           else
1710             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1711         }
1712       else if (little_endian)
1713         EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1714       else
1715         EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1716     }
1717   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1718   coding->produced = dst - coding->destination;
1719   coding->produced_char += produced_chars;
1720   return 0;
1721 }
1722
1723 \f
1724 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1725
1726 /* Emacs' internal format for representation of multiple character
1727    sets is a kind of multi-byte encoding, i.e. characters are
1728    represented by variable-length sequences of one-byte codes.
1729
1730    ASCII characters and control characters (e.g. `tab', `newline') are
1731    represented by one-byte sequences which are their ASCII codes, in
1732    the range 0x00 through 0x7F.
1733
1734    8-bit characters of the range 0x80..0x9F are represented by
1735    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1736    code + 0x20).
1737
1738    8-bit characters of the range 0xA0..0xFF are represented by
1739    one-byte sequences which are their 8-bit code.
1740
1741    The other characters are represented by a sequence of `base
1742    leading-code', optional `extended leading-code', and one or two
1743    `position-code's.  The length of the sequence is determined by the
1744    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1745    whereas extended leading-code and position-code take the range 0xA0
1746    through 0xFF.  See `charset.h' for more details about leading-code
1747    and position-code.
1748
1749    --- CODE RANGE of Emacs' internal format ---
1750    character set        range
1751    -------------        -----
1752    ascii                0x00..0x7F
1753    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1754    eight-bit-graphic    0xA0..0xFF
1755    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1756    ---------------------------------------------
1757
1758    As this is the internal character representation, the format is
1759    usually not used externally (i.e. in a file or in data sent to a
1760    process).  However, it is possible to have text in this format
1761    externally (e.g. by encoding it with the coding system `emacs-mule').
1762
1763    In that case, a sequence of one-byte codes has a slightly different
1764    form.
1765
1766    At first, all characters in eight-bit-control are represented by
1767    one-byte sequences which are their 8-bit code.
1768
1769    Next, character composition data are represented by the byte
1770    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1771    where,
1772         METHOD is 0xF2 plus one of composition method (enum
1773         composition_method),
1774
1775         BYTES is 0xA0 plus a byte length of this composition data,
1776
1777         CHARS is 0xA0 plus a number of characters composed by this
1778         data,
1779
1780         COMPONENTs are characters in multibyte form or composition
1781         rules, each rule encoded as two bytes of ASCII codes.
1782
1783    In addition, for backward compatibility, the following formats are
1784    also recognized as composition data on decoding.
1785
1786    0x80 MSEQ ...
1787    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1788
1789    Here,
1790         MSEQ is a multibyte form, but in this special format:
1791           ASCII: 0xA0 ASCII_CODE+0x80,
1792           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1793         RULE is a one byte code of the range 0xA0..0xF0 that
1794         represents a composition rule.
1795   */
1796
/* Table indexed by a byte value.  detect_coding_emacs_mule and
   emacs_mule_char read an entry as the total number of bytes (1 to 4)
   of the emacs-mule sequence that starts with that byte.  The table
   is filled in outside this part of the file.  */
1797 char emacs_mule_bytes[256];
1798
1799
1800 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1801    Return true if a text is encoded in 'emacs-mule'.
   CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
   Scanning starts after the first CODING->head_ascii bytes.  If the
   scan meets something that does not fit (ESC, SI or SO, a
   composition sequence that is too short, or a multibyte sequence
   with a wrong trailing byte), the category is added to
   DETECT_INFO->rejected and false is returned.  When the source runs
   out, the result is true and DETECT_INFO->found receives the
   category if at least one non-ASCII sequence was accepted; but if
   the source ended in the middle of a sequence and CODING is in its
   last block, the category is rejected as above.  */
1802
1803 static bool
1804 detect_coding_emacs_mule (struct coding_system *coding,
1805                           struct coding_detection_info *detect_info)
1806 {
1807   const unsigned char *src = coding->source, *src_base;
1808   const unsigned char *src_end = coding->source + coding->src_bytes;
1809   bool multibytep = coding->src_multibyte;
1810   ptrdiff_t consumed_chars = 0;
1811   int c;
1812   int found = 0;
1813
1814   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1815   /* A coding system of this category is always ASCII compatible.  */
1816   src += coding->head_ascii;
1817
1818   while (1)
1819     {
      /* Remember where this sequence starts, so that the code at
         no_more_source can tell whether the source ended inside it.  */
1820       src_base = src;
1821       ONE_MORE_BYTE (c);
1822       if (c < 0)
1823         continue;
1824       if (c == 0x80)
1825         {
1826           /* Perhaps the start of composite character.  We simply skip
1827              it because analyzing it is too heavy for detecting.  But,
1828              at least, we check that the composite character
1829              consists of more than 4 bytes.  */
1830           const unsigned char *src_start;
1831
1832         repeat:
1833           src_start = src;
1834           do
1835             {
1836               ONE_MORE_BYTE (c);
1837             }
1838           while (c >= 0xA0);
1839
          /* Too few bytes followed the 0x80: reject.  */
1840           if (src - src_start <= 4)
1841             break;
1842           found = CATEGORY_MASK_EMACS_MULE;
          /* Another composition follows immediately.  */
1843           if (c == 0x80)
1844             goto repeat;
1845         }
1846
1847       if (c < 0x80)
1848         {
          /* ESC, SI and SO cause the category to be rejected.  */
1849           if (c < 0x20
1850               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1851             break;
1852         }
1853       else
1854         {
          /* Every remaining byte of the sequence must be 0xA0 or
             above.  */
1855           int more_bytes = emacs_mule_bytes[c] - 1;
1856
1857           while (more_bytes > 0)
1858             {
1859               ONE_MORE_BYTE (c);
1860               if (c < 0xA0)
1861                 {
1862                   src--;        /* Unread the last byte.  */
1863                   break;
1864                 }
1865               more_bytes--;
1866             }
          /* The sequence was cut short by a byte below 0xA0: reject.  */
1867           if (more_bytes != 0)
1868             break;
1869           found = CATEGORY_MASK_EMACS_MULE;
1870         }
1871     }
  /* Reached only through one of the `break's above.  */
1872   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1873   return 0;
1874
1875  no_more_source:
  /* SRC_BASE < SRC means that bytes of an unfinished sequence were
     read; that is an error only when no more input will come.  */
1876   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1877     {
1878       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1879       return 0;
1880     }
1881   detect_info->found |= found;
1882   return 1;
1883 }
1884
1885
1886 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1887    character.  If CMP_STATUS indicates that we must expect MSEQ or
1888    RULE described above, decode it and return the negative value of
1889    the decoded character or rule.  If an invalid byte is found, return
1890    -1.  If SRC is too short, return -2.
   On a successful return, *NBYTES is the number of source bytes
   parsed and *NCHARS the matching consumed-character count.  *ID
   receives the charset id when ID is non-null, except on the early
   return for an old-form RULE byte, which does not set it.  */
1891
1892 static int
1893 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1894                  int *nbytes, int *nchars, int *id,
1895                  struct composition_status *cmp_status)
1896 {
1897   const unsigned char *src_end = coding->source + coding->src_bytes;
1898   const unsigned char *src_base = src;
1899   bool multibytep = coding->src_multibyte;
1900   int charset_ID;
1901   unsigned code;
1902   int c;
1903   int consumed_chars = 0;
1904   bool mseq_found = 0;
1905
1906   ONE_MORE_BYTE (c);
1907   if (c < 0)
1908     {
      /* ONE_MORE_BYTE gave a negated value; its magnitude is returned
         as the character itself.  */
1909       c = -c;
1910       charset_ID = emacs_mule_charset[0];
1911     }
1912   else
1913     {
1914       if (c >= 0xA0)
1915         {
          /* A first byte of 0xA0 or above is accepted only inside an
             old-form composition.  */
1916           if (cmp_status->state != COMPOSING_NO
1917               && cmp_status->old_form)
1918             {
1919               if (cmp_status->state == COMPOSING_CHAR)
1920                 {
1921                   if (c == 0xA0)
1922                     {
                      /* MSEQ for ASCII: 0xA0 ASCII_CODE+0x80.  */
1923                       ONE_MORE_BYTE (c);
1924                       c -= 0x80;
1925                       if (c < 0)
1926                         goto invalid_code;
1927                     }
1928                   else
                    /* MSEQ for others starts with LEADING_CODE+0x20.  */
1929                     c -= 0x20;
1930                   mseq_found = 1;
1931                 }
1932               else
1933                 {
                  /* Not expecting a character, so C is a RULE byte;
                     return it negated without decoding.  */
1934                   *nbytes = src - src_base;
1935                   *nchars = consumed_chars;
1936                   return -c;
1937                 }
1938             }
1939           else
1940             goto invalid_code;
1941         }
1942
      /* Dispatch on the total byte length of the sequence led by C.  */
1943       switch (emacs_mule_bytes[c])
1944         {
1945         case 2:
1946           if ((charset_ID = emacs_mule_charset[c]) < 0)
1947             goto invalid_code;
1948           ONE_MORE_BYTE (c);
1949           if (c < 0xA0)
1950             goto invalid_code;
1951           code = c & 0x7F;
1952           break;
1953
1954         case 3:
1955           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1956               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1957             {
              /* The second byte selects the charset; one code byte
                 follows.  */
1958               ONE_MORE_BYTE (c);
1959               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1960                 goto invalid_code;
1961               ONE_MORE_BYTE (c);
1962               if (c < 0xA0)
1963                 goto invalid_code;
1964               code = c & 0x7F;
1965             }
1966           else
1967             {
              /* The leading byte selects the charset; two code bytes
                 follow.  */
1968               if ((charset_ID = emacs_mule_charset[c]) < 0)
1969                 goto invalid_code;
1970               ONE_MORE_BYTE (c);
1971               if (c < 0xA0)
1972                 goto invalid_code;
1973               code = (c & 0x7F) << 8;
1974               ONE_MORE_BYTE (c);
1975               if (c < 0xA0)
1976                 goto invalid_code;
1977               code |= c & 0x7F;
1978             }
1979           break;
1980
1981         case 4:
          /* The second byte selects the charset; two code bytes
             follow.  */
1982           ONE_MORE_BYTE (c);
1983           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1984             goto invalid_code;
1985           ONE_MORE_BYTE (c);
1986           if (c < 0xA0)
1987             goto invalid_code;
1988           code = (c & 0x7F) << 8;
1989           ONE_MORE_BYTE (c);
1990           if (c < 0xA0)
1991             goto invalid_code;
1992           code |= c & 0x7F;
1993           break;
1994
1995         case 1:
1996           code = c;
1997           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
1998           break;
1999
2000         default:
2001           emacs_abort ();
2002         }
2003       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2004                           CHARSET_FROM_ID (charset_ID), code, c);
2005       if (c < 0)
2006         goto invalid_code;
2007     }
2008   *nbytes = src - src_base;
2009   *nchars = consumed_chars;
2010   if (id)
2011     *id = charset_ID;
  /* A character read in MSEQ form is reported negated.  */
2012   return (mseq_found ? -c : c);
2013
2014  no_more_source:
2015   return -2;
2016
2017  invalid_code:
2018   return -1;
2019 }
2020
2021
2022 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2023
2024 /* Handle these composition sequences ('|': the end of header elements,
2025    BYTES and CHARS >= 0xA0):
2026
2027    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2028    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2029    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2030
2031    and these old forms:
2032
2033    (4) relative composition: 0x80 | MSEQ ... MSEQ
2034    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2035
2036    When the starter 0x80 and the following header elements are found,
2037    this annotation header is produced.
2038
2039         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2040
2041    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2042    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2043
2044    Then, upon reading the following elements, these codes are produced
2045    until the composition end is found:
2046
2047    (1) CHAR ... CHAR
2048    (2) ALT ... ALT CHAR ... CHAR
2049    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2050    (4) CHAR ... CHAR
2051    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2052
2053    When the composition end is found, LENGTH and NCHARS in the
2054    annotation header are updated as below:
2055
2056    (1) LENGTH: unchanged, NCHARS: unchanged
2057    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2058    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2059    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2060    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2061
2062    If an error is found while composing, the annotation header is
2063    changed to the original composition header (plus filler -1s) as
2064    below:
2065
2066    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2067    (5)          [ 0x80 0xFF -1 -1 -1 ]
2068
2069    and the sequence [ -2 DECODED-RULE ] is changed to the original
2070    byte sequence as below:
2071         o the original byte sequence is B: [ B -1 ]
2072         o the original byte sequence is B1 B2: [ B1 B2 ]
2073
2074    Most of the routines are implemented as macros because they need
2075    access to many variables and labels of the caller
2076    decode_coding_emacs_mule, and because each is usually expanded just
2077    once (so it does not increase the size of the compiled object).  */
2078
2079 /* Decode a composition rule represented by C as a component of a
2080    composition sequence of Emacs 20 style.  Set RULE to the decoded
2081    rule.
   C is first reduced by 0xA0 in place and must then lie in 0..80;
   otherwise control goes to the label invalid_code of the expanding
   function.  The reduced value gives gref = C / 9 and nref = C % 9,
   where a value of 4 is replaced by 10; RULE becomes
   COMPOSITION_ENCODE_RULE (gref, nref).  */
2082
2083 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2084   do {                                                  \
2085     int gref, nref;                                     \
2086                                                         \
2087     c -= 0xA0;                                          \
2088     if (c < 0 || c >= 81)                               \
2089       goto invalid_code;                                \
2090     gref = c / 9, nref = c % 9;                         \
2091     if (gref == 4) gref = 10;                           \
2092     if (nref == 4) nref = 10;                           \
2093     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2094   } while (0)
2095
2096
2097 /* Decode a composition rule represented by C and the following byte
2098    at SRC as a component of composition sequence of Emacs 21 style.
2099    Set RULE to the decoded rule.
   C supplies gref as C - 0x20.  The next byte is then fetched into C
   with ONE_MORE_BYTE and supplies nref in the same way.  Each of the
   two values must lie in 0..80; otherwise control goes to the label
   invalid_code of the expanding function.  RULE becomes
   COMPOSITION_ENCODE_RULE (gref, nref).  */
2100
2101 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2102   do {                                                  \
2103     int gref, nref;                                     \
2104                                                         \
2105     gref = c - 0x20;                                    \
2106     if (gref < 0 || gref >= 81)                         \
2107       goto invalid_code;                                \
2108     ONE_MORE_BYTE (c);                                  \
2109     nref = c - 0x20;                                    \
2110     if (nref < 0 || nref >= 81)                         \
2111       goto invalid_code;                                \
2112     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2113   } while (0)
2114
2115
2116 /* Start of Emacs 21 style format.  C holds the already-read method
2117    byte (0xF2 + METHOD).  The next two bytes at SRC are (0xA0 + BYTES)
2118    and (0xA0 + CHARS), where BYTES is the byte length of this
2119    composition information and CHARS is the number of characters
   composed by this composition.  BYTES must be at least 3, and
   exactly 4 for a relative composition; CHARS must be positive and
   below MAX_COMPOSITION_COMPONENTS; otherwise control goes to the
   label invalid_code.  On success CMP_STATUS is set up (ncomps
   becomes BYTES - 4, and the state is COMPOSING_CHAR for a relative
   composition, COMPOSING_COMPONENT_CHAR otherwise) and the annotation
   header is added to CHARBUF with ADD_COMPOSITION_DATA.  */
2120
2121 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2122   do {                                                                  \
2123     enum composition_method method = c - 0xF2;                          \
2124     int nbytes, nchars;                                                 \
2125                                                                         \
2126     ONE_MORE_BYTE (c);                                                  \
2127     if (c < 0)                                                          \
2128       goto invalid_code;                                                \
2129     nbytes = c - 0xA0;                                                  \
2130     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2131       goto invalid_code;                                                \
2132     ONE_MORE_BYTE (c);                                                  \
2133     nchars = c - 0xA0;                                                  \
2134     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2135       goto invalid_code;                                                \
2136     cmp_status->old_form = 0;                                           \
2137     cmp_status->method = method;                                        \
2138     if (method == COMPOSITION_RELATIVE)                                 \
2139       cmp_status->state = COMPOSING_CHAR;                               \
2140     else                                                                \
2141       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2142     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2143     cmp_status->nchars = nchars;                                        \
2144     cmp_status->ncomps = nbytes - 4;                                    \
2145     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2146   } while (0)
2147
2148
2149 /* Start of Emacs 20 style format for relative composition.
   Mark CMP_STATUS as an old-form COMPOSITION_RELATIVE composition in
   state COMPOSING_CHAR with zero character and component counts, and
   add an annotation header with NCHARS and NBYTES of 0 to CHARBUF.  */
2150
2151 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2152   do {                                                          \
2153     cmp_status->old_form = 1;                                   \
2154     cmp_status->method = COMPOSITION_RELATIVE;                  \
2155     cmp_status->state = COMPOSING_CHAR;                         \
2156     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2157     cmp_status->nchars = cmp_status->ncomps = 0;                \
2158     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2159   } while (0)
2160
2161
2162 /* Start of Emacs 20 style format for rule-base composition.
   Mark CMP_STATUS as an old-form COMPOSITION_WITH_RULE composition in
   state COMPOSING_CHAR with zero character and component counts, and
   add an annotation header with NCHARS and NBYTES of 0 to CHARBUF.  */
2163
2164 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2165   do {                                                          \
2166     cmp_status->old_form = 1;                                   \
2167     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2168     cmp_status->state = COMPOSING_CHAR;                         \
2169     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2170     cmp_status->nchars = cmp_status->ncomps = 0;                \
2171     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2172   } while (0)
2173
2174
/* Expanded in decode_coding_emacs_mule after the composition starter
   0x80 has been read.  Fetch the next byte into C and dispatch on it:
   a byte from 0xF2 + COMPOSITION_RELATIVE through
   0xF2 + COMPOSITION_WITH_RULE_ALTCHARS starts the Emacs 21 format;
   a byte in 0xA0..0xBF starts an Emacs 20 relative composition, and
   SRC is put back so that the byte is read again as a component;
   0xFF starts an Emacs 20 rule-base composition.  A negative value or
   any other byte sends control to the label invalid_code.  */
2175 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2176   do {                                                  \
2177     const unsigned char *current_src = src;             \
2178                                                         \
2179     ONE_MORE_BYTE (c);                                  \
2180     if (c < 0)                                          \
2181       goto invalid_code;                                \
2182     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2183         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2184       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2185     else if (c < 0xA0)                                  \
2186       goto invalid_code;                                \
2187     else if (c < 0xC0)                                  \
2188       {                                                 \
2189         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2190         /* Re-read C as a composition component.  */    \
2191         src = current_src;                              \
2192       }                                                 \
2193     else if (c == 0xFF)                                 \
2194       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2195     else                                                \
2196       goto invalid_code;                                \
2197   } while (0)
2198
/* Finish a composition that ended normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For the old
   form, its element 2 (NCHARS) is set to CMP_STATUS->nchars.  For the
   new form with a method above COMPOSITION_RELATIVE, its element 0 is
   set to element 2 minus CMP_STATUS->length.  In all cases the state
   becomes COMPOSING_NO.  */
2199 #define EMACS_MULE_COMPOSITION_END()                            \
2200   do {                                                          \
2201     int idx = - cmp_status->length;                             \
2202                                                                 \
2203     if (cmp_status->old_form)                                   \
2204       charbuf[idx + 2] = cmp_status->nchars;                    \
2205     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2206       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2207     cmp_status->state = COMPOSING_NO;                           \
2208   } while (0)
2209
2210
2211 /* Close the composition tracked by CMP_STATUS; its annotation header
2212    starts CMP_STATUS->length elements before CHARBUF.  An old-form
2213    composition that has collected characters just gets its NCHARS
2214    slot filled in; otherwise the header is overwritten with the bytes
2215    that started it.  Return what the caller adds to its char offset.  */
2216 static int
2217 emacs_mule_finish_composition (int *charbuf,
2218                                struct composition_status *cmp_status)
2219 {
2220   int *header = charbuf - cmp_status->length;
2221   int new_chars = 0;
2222
2223   if (cmp_status->old_form && cmp_status->nchars > 0)
2224     {
2225       header[2] = cmp_status->nchars;
2226       if (cmp_status->state == COMPOSING_CHAR
2227           && cmp_status->method == COMPOSITION_WITH_RULE)
2228         {
2229           /* The last rule was invalid.  */
2230           int rule_byte = charbuf[-1] + 0xA0;
2231           charbuf[-2] = BYTE8_TO_CHAR (rule_byte);
2232           charbuf[-1] = -1;
2233           new_chars = 1;
2234         }
2235     }
2236   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2237     {
2238       header[0] = BYTE8_TO_CHAR (0x80);
2239       header[1] = BYTE8_TO_CHAR (0xFF);
2240       header[2] = -3;
2241       header[3] = 0;
2242       new_chars = 1;
2243     }
2244   else
2245     {
2246       /* Fetch both counts before their slots are overwritten.  */
2247       int chars_byte = header[2] + 0xA0;
2248       int bytes_byte = header[3] + 0xA0;
2249       header[0] = BYTE8_TO_CHAR (0x80);
2250       header[1] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2251       header[2] = BYTE8_TO_CHAR (bytes_byte);
2252       header[3] = BYTE8_TO_CHAR (chars_byte);
2253       header[4] = -1;
2254       new_chars = 4;
2255     }
2256   cmp_status->state = COMPOSING_NO;
2257   return new_chars;
2258 }
2259
/* If a composition is in progress (CMP_STATUS->state is not
   COMPOSING_NO), close it with emacs_mule_finish_composition and add
   the value that function returns to CHAR_OFFSET.  */
2260 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2261   do {                                                                    \
2262     if (cmp_status->state != COMPOSING_NO)                                \
2263       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2264   } while (0)
2265
2266
2267 static void
2268 decode_coding_emacs_mule (struct coding_system *coding)
2269 {
2270   const unsigned char *src = coding->source + coding->consumed;
2271   const unsigned char *src_end = coding->source + coding->src_bytes;
2272   const unsigned char *src_base;
2273   int *charbuf = coding->charbuf + coding->charbuf_used;
2274   /* We may produce two annotations (charset and composition) in one
2275      loop and one more charset annotation at the end.  */
2276   int *charbuf_end
2277     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2278       /* We can produce up to 2 characters in a loop.  */
2279       - 1;
2280   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2281   bool multibytep = coding->src_multibyte;
2282   ptrdiff_t char_offset = coding->produced_char;
2283   ptrdiff_t last_offset = char_offset;
2284   int last_id = charset_ascii;
2285   bool eol_dos
2286     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2287   int byte_after_cr = -1;
2288   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2289
2290   if (cmp_status->state != COMPOSING_NO)
2291     {
2292       int i;
2293
2294       if (charbuf_end - charbuf < cmp_status->length)
2295         emacs_abort ();
2296       for (i = 0; i < cmp_status->length; i++)
2297         *charbuf++ = cmp_status->carryover[i];
2298       coding->annotated = 1;
2299     }
2300
2301   while (1)
2302     {
2303       int c, id IF_LINT (= 0);
2304
2305       src_base = src;
2306       consumed_chars_base = consumed_chars;
2307
2308       if (charbuf >= charbuf_end)
2309         {
2310           if (byte_after_cr >= 0)
2311             src_base--;
2312           break;
2313         }
2314
2315       if (byte_after_cr >= 0)
2316         c = byte_after_cr, byte_after_cr = -1;
2317       else
2318         ONE_MORE_BYTE (c);
2319
2320       if (c < 0 || c == 0x80)
2321         {
2322           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2323           if (c < 0)
2324             {
2325               *charbuf++ = -c;
2326               char_offset++;
2327             }
2328           else
2329             DECODE_EMACS_MULE_COMPOSITION_START ();
2330           continue;
2331         }
2332
2333       if (c < 0x80)
2334         {
2335           if (eol_dos && c == '\r')
2336             ONE_MORE_BYTE (byte_after_cr);
2337           id = charset_ascii;
2338           if (cmp_status->state != COMPOSING_NO)
2339             {
2340               if (cmp_status->old_form)
2341                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2342               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2343                 cmp_status->ncomps--;
2344             }
2345         }
2346       else
2347         {
2348           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2349           /* emacs_mule_char can load a charset map from a file, which
2350              allocates a large structure and might cause buffer text
2351              to be relocated as result.  Thus, we need to remember the
2352              original pointer to buffer text, and fix up all related
2353              pointers after the call.  */
2354           const unsigned char *orig = coding->source;
2355           ptrdiff_t offset;
2356
2357           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2358                                cmp_status);
2359           offset = coding->source - orig;
2360           if (offset)
2361             {
2362               src += offset;
2363               src_base += offset;
2364               src_end += offset;
2365             }
2366           if (c < 0)
2367             {
2368               if (c == -1)
2369                 goto invalid_code;
2370               if (c == -2)
2371                 break;
2372             }
2373           src = src_base + nbytes;
2374           consumed_chars = consumed_chars_base + nchars;
2375           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2376             cmp_status->ncomps -= nchars;
2377         }
2378
2379       /* Now if C >= 0, we found a normally encoded character, if C <
2380          0, we found an old-style composition component character or
2381          rule.  */
2382
2383       if (cmp_status->state == COMPOSING_NO)
2384         {
2385           if (last_id != id)
2386             {
2387               if (last_id != charset_ascii)
2388                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2389                                   last_id);
2390               last_id = id;
2391               last_offset = char_offset;
2392             }
2393           *charbuf++ = c;
2394           char_offset++;
2395         }
2396       else if (cmp_status->state == COMPOSING_CHAR)
2397         {
2398           if (cmp_status->old_form)
2399             {
2400               if (c >= 0)
2401                 {
2402                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2403                   *charbuf++ = c;
2404                   char_offset++;
2405                 }
2406               else
2407                 {
2408                   *charbuf++ = -c;
2409                   cmp_status->nchars++;
2410                   cmp_status->length++;
2411                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2412                     EMACS_MULE_COMPOSITION_END ();
2413                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2414                     cmp_status->state = COMPOSING_RULE;
2415                 }
2416             }
2417           else
2418             {
2419               *charbuf++ = c;
2420               cmp_status->length++;
2421               cmp_status->nchars--;
2422               if (cmp_status->nchars == 0)
2423                 EMACS_MULE_COMPOSITION_END ();
2424             }
2425         }
2426       else if (cmp_status->state == COMPOSING_RULE)
2427         {
2428           int rule;
2429
2430           if (c >= 0)
2431             {
2432               EMACS_MULE_COMPOSITION_END ();
2433               *charbuf++ = c;
2434               char_offset++;
2435             }
2436           else
2437             {
2438               c = -c;
2439               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2440               if (rule < 0)
2441                 goto invalid_code;
2442               *charbuf++ = -2;
2443               *charbuf++ = rule;
2444               cmp_status->length += 2;
2445               cmp_status->state = COMPOSING_CHAR;
2446             }
2447         }
2448       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2449         {
2450           *charbuf++ = c;
2451           cmp_status->length++;
2452           if (cmp_status->ncomps == 0)
2453             cmp_status->state = COMPOSING_CHAR;
2454           else if (cmp_status->ncomps > 0)
2455             {
2456               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2457                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2458             }
2459           else
2460             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2461         }
2462       else                      /* COMPOSING_COMPONENT_RULE */
2463         {
2464           int rule;
2465
2466           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2467           if (rule < 0)
2468             goto invalid_code;
2469           *charbuf++ = -2;
2470           *charbuf++ = rule;
2471           cmp_status->length += 2;
2472           cmp_status->ncomps--;
2473           if (cmp_status->ncomps > 0)
2474             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2475           else
2476             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2477         }
2478       continue;
2479
2480     invalid_code:
2481       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2482       src = src_base;
2483       consumed_chars = consumed_chars_base;
2484       ONE_MORE_BYTE (c);
2485       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2486       char_offset++;
2487       coding->errors++;
2488     }
2489
2490  no_more_source:
2491   if (cmp_status->state != COMPOSING_NO)
2492     {
2493       if (coding->mode & CODING_MODE_LAST_BLOCK)
2494         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2495       else
2496         {
2497           int i;
2498
2499           charbuf -= cmp_status->length;
2500           for (i = 0; i < cmp_status->length; i++)
2501             cmp_status->carryover[i] = charbuf[i];
2502         }
2503     }
2504   if (last_id != charset_ascii)
2505     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2506   coding->consumed_char += consumed_chars_base;
2507   coding->consumed = src_base - coding->source;
2508   coding->charbuf_used = charbuf - coding->charbuf;
2509 }
2510
2511
/* Store in CODES (an array of at least two bytes) the emacs-mule
   leading code(s) for the charset whose emacs-mule id is ID.

   If ID < 0xA0, ID itself is the only leading code: CODES[0] is ID
   and CODES[1] is 0.  Otherwise CODES[1] is ID and CODES[0] is a
   prefix byte selected by the range ID falls in:

     0xA0..0xDF -> 0x9A
     0xE0..0xEF -> 0x9B
     0xF0..0xF4 -> 0x9C
     0xF5..     -> 0x9D

   ID and CODES are evaluated more than once.  The expansion is one
   statement with no trailing semicolon (the caller supplies it), so
   the macro may be used as the body of an `if' that has an `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)         \
  do {                                              \
    if ((id) < 0xA0)                                \
      (codes)[0] = (id), (codes)[1] = 0;            \
    else if ((id) < 0xE0)                           \
      (codes)[0] = 0x9A, (codes)[1] = (id);         \
    else if ((id) < 0xF0)                           \
      (codes)[0] = 0x9B, (codes)[1] = (id);         \
    else if ((id) < 0xF5)                           \
      (codes)[0] = 0x9C, (codes)[1] = (id);         \
    else                                            \
      (codes)[0] = 0x9D, (codes)[1] = (id);         \
  } while (0)
2525
2526
     /* Encode the contents of CODING->charbuf in the emacs-mule format.

        CODING->charbuf holds CODING->charbuf_used ints.  A negative
        element is the header of an annotation; every other element is
        a character code.  DST starts at CODING->destination +
        CODING->produced, and CODING->destination + CODING->dst_bytes
        is taken as the end of the destination.

        Each character is emitted as follows:
          - an ASCII character as one byte;
          - a character for which CHAR_BYTE8_P holds as the single byte
            that CHAR_TO_BYTE8 gives;
          - any other character as the leading code(s) of its charset
            (see EMACS_MULE_LEADING_CODES) followed by one code byte
            (charset of dimension 1) or two code bytes (otherwise),
            each with the 0x80 bit set.

        When the loop finishes, record CODING_RESULT_SUCCESS, add the
        number of produced characters to CODING->produced_char, set
        CODING->produced to DST - CODING->destination, and return 0.

        NOTE(review): ASSURE_DESTINATION, EMIT_ONE_BYTE and
        EMIT_ONE_ASCII_BYTE are defined elsewhere in this file and are
        not passed DST, DST_END, PRODUCED_CHARS or MULTIBYTEP, so they
        presumably use those locals by name -- confirm before renaming
        any of them.  */
2527 static bool
2528 encode_coding_emacs_mule (struct coding_system *coding)
2529 {
2530   bool multibytep = coding->dst_multibyte;
2531   int *charbuf = coding->charbuf;
2532   int *charbuf_end = charbuf + coding->charbuf_used;
2533   unsigned char *dst = coding->destination + coding->produced;
2534   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each element is handled;
          presumably the number of free destination bytes required for
          one iteration -- TODO confirm against ASSURE_DESTINATION.  */
2535   int safe_room = 8;
2536   ptrdiff_t produced_chars = 0;
2537   Lisp_Object attrs, charset_list;
2538   int c;
       /* Charset id taken from the most recent charset annotation, or
          -1 when there is none or it is not in CHARSET_LIST.  */
2539   int preferred_charset_id = -1;
2540
2541   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list; if
          the coding system's attributes hold a different list, replace
          it there as well.  */
2542   if (! EQ (charset_list, Vemacs_mule_charset_list))
2543     {
2544       charset_list = Vemacs_mule_charset_list;
2545       ASET (attrs, coding_attr_charset_list, charset_list);
2546     }
2547
2548   while (charbuf < charbuf_end)
2549     {
2550       ASSURE_DESTINATION (safe_room);
2551       c = *charbuf++;
2552
2553       if (c < 0)
2554         {
2555           /* Handle an annotation.  */
               /* C is the negative header element and CHARBUF now points
                  at the element after it, which holds the annotation
                  kind.  */
2556           switch (*charbuf)
2557             {
2558             case CODING_ANNOTATE_COMPOSITION_MASK:
2559               /* Not yet implemented.  */
2560               break;
2561             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF has already been advanced past
                      the header, and the annotation is skipped below by
                      adding -C - 1.  Confirm that index 3 is the slot in
                      which ADD_CHARSET_DATA stores the charset id.  */
2562               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not a member of
                      CHARSET_LIST.  */
2563               if (preferred_charset_id >= 0
2564                   && NILP (Fmemq (make_number (preferred_charset_id),
2565                                   charset_list)))
2566                 preferred_charset_id = -1;
2567               break;
2568             default:
2569               emacs_abort ();
2570             }
               /* Skip the remaining -C - 1 elements of the annotation
                  (the header itself was consumed above).  */
2571           charbuf += -c - 1;
2572           continue;
2573         }
2574
2575       if (ASCII_CHAR_P (c))
2576         EMIT_ONE_ASCII_BYTE (c);
2577       else if (CHAR_BYTE8_P (c))
2578         {
2579           c = CHAR_TO_BYTE8 (c);
2580           EMIT_ONE_BYTE (c);
2581         }
2582       else
2583         {
2584           struct charset *charset;
2585           unsigned code;
2586           int dimension;
2587           int emacs_mule_id;
2588           unsigned char leading_codes[2];
2589
               /* Try the preferred charset first; fall back to searching
                  CHARSET_LIST when C is not in that charset.  */
2590           if (preferred_charset_id >= 0)
2591             {
2592               bool result;
2593
2594               charset = CHARSET_FROM_ID (preferred_charset_id);
2595               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2596               if (result)
2597                 code = ENCODE_CHAR (charset, c);
2598               else
2599                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2600                                      &code, charset);
2601             }
2602           else
2603             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2604                                  &code, charset);
               /* No charset was found for C: encode CODING->default_char
                  in its place.  */
2605           if (! charset)
2606             {
2607               c = coding->default_char;
2608               if (ASCII_CHAR_P (c))
2609                 {
2610                   EMIT_ONE_ASCII_BYTE (c);
2611                   continue;
2612                 }
2613               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2614                                    &code, charset);
                   /* NOTE(review): CHARSET is used below without being
                      tested again after this second lookup.  */
2615             }
2616           dimension = CHARSET_DIMENSION (charset);
2617           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2618           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2619           EMIT_ONE_BYTE (leading_codes[0]);
               /* The second leading code is 0 when the charset needs only
                  one.  */
2620           if (leading_codes[1])
2621             EMIT_ONE_BYTE (leading_codes[1]);
2622           if (dimension == 1)
2623             EMIT_ONE_BYTE (code | 0x80);
2624           else
2625             {
                   /* Two code bytes, high byte first, each with the 0x80
                      bit set.  */
2626               code |= 0x8080;
2627               EMIT_ONE_BYTE (code >> 8);
2628               EMIT_ONE_BYTE (code & 0xFF);
2629             }
2630         }
2631     }
2632   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2633   coding->produced_char += produced_chars;
2634   coding->produced = dst - coding->destination;
2635   return 0;
2636 }
2637
2638 \f
2639 /*** 7. ISO2022 handlers ***/
2640
2641 /* The following note describes the coding system ISO2022 briefly.
2642    Since the intention of this note is to help understand the
2643    functions in this file, some parts are NOT ACCURATE or are OVERLY
2644    SIMPLIFIED.  For thorough understanding, please refer to the
2645    original document of ISO2022.  This is equivalent to the standard
2646    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2647
2648    ISO2022 provides many mechanisms to encode several character sets
2649    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2650    is encoded using bytes less than 128.  This may make the encoded
2651    text a little bit longer, but the text passes more easily through
2652    several types of gateway, some of which strip off the MSB (Most
2653    Significant Bit).
2654
2655    There are two kinds of character sets: control character sets and
2656    graphic character sets.  The former contain control characters such
2657    as `newline' and `escape' to provide control functions (control
2658    functions are also provided by escape sequences).  The latter
2659    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2660    two control character sets and many graphic character sets.
2661
2662    Graphic character sets are classified into one of the following
2663    four classes, according to the number of bytes (DIMENSION) and
2664    number of characters in one dimension (CHARS) of the set:
2665    - DIMENSION1_CHARS94
2666    - DIMENSION1_CHARS96
2667    - DIMENSION2_CHARS94
2668    - DIMENSION2_CHARS96
2669
2670    In addition, each character set is assigned an identification tag,
2671    unique for each set, called the "final character" (denoted as <F>
2672    hereafter).  The <F> of each character set is decided by ECMA(*)
2673    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2674    (0x30..0x3F are for private use only).
2675
2676    Note (*): ECMA = European Computer Manufacturers Association
2677
2678    Here are examples of graphic character sets [NAME(<F>)]:
2679         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2680         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2681         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2682         o DIMENSION2_CHARS96 -- none for the moment
2683
2684    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2685         C0 [0x00..0x1F] -- control character plane 0
2686         GL [0x20..0x7F] -- graphic character plane 0
2687         C1 [0x80..0x9F] -- control character plane 1
2688         GR [0xA0..0xFF] -- graphic character plane 1
2689
2690    A control character set is directly designated and invoked to C0 or
2691    C1 by an escape sequence.  The most common case is that:
2692    - ISO646's  control character set is designated/invoked to C0, and
2693    - ISO6429's control character set is designated/invoked to C1,
2694    and usually these designations/invocations are omitted in encoded
2695    text.  In a 7-bit environment, only C0 can be used, and a control
2696    character for C1 is encoded by an appropriate escape sequence to
2697    fit into the environment.  All control characters for C1 are
2698    defined to have corresponding escape sequences.
2699
2700    A graphic character set is at first designated to one of four
2701    graphic registers (G0 through G3), then these graphic registers are
2702    invoked to GL or GR.  These designations and invocations can be
2703    done independently.  The most common case is that G0 is invoked to
2704    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2705    these invocations and designations are omitted in encoded text.
2706    In a 7-bit environment, only GL can be used.
2707
2708    When a graphic character set of CHARS94 is invoked to GL, codes
2709    0x20 and 0x7F of the GL area work as control characters SPACE and
2710    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2711    be used.
2712
2713    There are two ways of invocation: locking-shift and single-shift.
2714    With locking-shift, the invocation lasts until the next different
2715    invocation, whereas with single-shift, the invocation affects the
2716    following character only and doesn't affect the locking-shift
2717    state.  Invocations are done by the following control characters or
2718    escape sequences:
2719
2720    ----------------------------------------------------------------------
2721    abbrev  function                  cntrl escape seq   description
2722    ----------------------------------------------------------------------
2723    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2724    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2725    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2726    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2727    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2728    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2729    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2730    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2731    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2732    ----------------------------------------------------------------------
2733    (*) These are not used by any known coding system.
2734
2735    Control characters for these functions are defined by macros
2736    ISO_CODE_XXX in `coding.h'.
2737
2738    Designations are done by the following escape sequences:
2739    ----------------------------------------------------------------------
2740    escape sequence      description
2741    ----------------------------------------------------------------------
2742    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2743    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2744    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2745    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2746    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2747    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2748    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2749    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2750    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2751    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2752    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2753    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2754    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2755    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2756    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2757    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2758    ----------------------------------------------------------------------
2759
2760    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2761    of dimension 1, chars 94, and final character <F>, etc...
2762
2763    Note (*): Although these designations are not allowed in ISO2022,
2764    Emacs accepts them on decoding, and produces them on encoding
2765    CHARS96 character sets in a coding system which is characterized as
2766    7-bit environment, non-locking-shift, and non-single-shift.
2767
2768    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2769    '(' must be omitted.  We refer to this as "short-form" hereafter.
2770
2771    Now you may notice that there are a lot of ways of encoding the
2772    same multilingual text in ISO2022.  Actually, there exist many
2773    coding systems such as Compound Text (used in X11's inter client
2774    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2775    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2776    localized platforms), and all of these are variants of ISO2022.
2777
2778    In addition to the above, Emacs handles two more kinds of escape
2779    sequences: ISO6429's direction specification and Emacs' private
2780    sequence for specifying character composition.
2781
2782    ISO6429's direction specification takes the following form:
2783         o CSI ']'      -- end of the current direction
2784         o CSI '0' ']'  -- end of the current direction
2785         o CSI '1' ']'  -- start of left-to-right text
2786         o CSI '2' ']'  -- start of right-to-left text
2787    The control character CSI (0x9B: control sequence introducer) is
2788    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2789
2790    Character composition specification takes the following form:
2791         o ESC '0' -- start relative composition
2792         o ESC '1' -- end composition
2793         o ESC '2' -- start rule-base composition (*)
2794         o ESC '3' -- start relative composition with alternate chars  (**)
2795         o ESC '4' -- start rule-base composition with alternate chars  (**)
2796   Since these are not standard escape sequences of any ISO standard,
2797   the use of them with these meanings is restricted to Emacs only.
2798
2799   (*) This form is used only in Emacs 20.7 and older versions,
2800   but newer versions can safely decode it.
2801   (**) This form is used only in Emacs 21.1 and newer versions,
2802   and older versions can't decode it.
2803
2804   Here's a list of example usages of these composition escape
2805   sequences (categorized by `enum composition_method').
2806
2807   COMPOSITION_RELATIVE:
2808         ESC 0 CHAR [ CHAR ] ESC 1
2809   COMPOSITION_WITH_RULE:
2810         ESC 2 CHAR [ RULE CHAR ] ESC 1
2811   COMPOSITION_WITH_ALTCHARS:
2812         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2813   COMPOSITION_WITH_RULE_ALTCHARS:
2814         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2815
     /* Table of 256 ISO code classes; presumably indexed by byte value
        (TODO confirm -- it is filled in and read elsewhere in this
        file).  */
2816 static enum iso_code_class_type iso_code_class[256];
2817
/* Return nonzero if the charset whose id is ID is usable with CODING:
   ID must be in the range 0..CODING->max_charset_id and the byte for
   ID in CODING->safe_charsets must not be 255 (the filler value
   setup_iso_safe_charsets leaves for charsets that have no graphic
   register).

   A negative ID is never safe and is rejected before it is used as
   an index; iso_charset_table lookups can produce negative ids, and
   not every caller tests for that first.  ID is evaluated more than
   once.  */
#define SAFE_CHARSET_P(coding, id)              \
  ((id) >= 0                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[(id)] != 255)
2821
     /* Compute the safe-charsets string for the ISO-2022 coding system
        whose attribute vector is ATTRS, and store it in the
        coding_attr_safe_charsets slot of ATTRS.

        The string has one byte per charset id, from 0 up to the largest
        id in the charset list.  Every byte starts as 255.  Then, for
        each charset in the list, its byte is set to:
          - the value associated with the charset's id in the
            coding_attr_iso_request slot, if there is one;
          - otherwise the cdr of the coding_attr_iso_usage slot for a
            charset with iso_chars_96 set, or its car for any other
            charset, but only when that value is less than 4.

        If CODING_ISO_FLAG_FULL_SUPPORT is set and the charset list is
        not Viso_2022_charset_list, the list in ATTRS is first replaced
        by Viso_2022_charset_list and any existing safe-charsets value
        is discarded.  If the slot already holds a string after that
        step, nothing is recomputed.  */
2822 static void
2823 setup_iso_safe_charsets (Lisp_Object attrs)
2824 {
2825   Lisp_Object charset_list, safe_charsets;
2826   Lisp_Object request;
2827   Lisp_Object reg_usage;
2828   Lisp_Object tail;
2829   EMACS_INT reg94, reg96;
2830   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2831   int max_charset_id;
2832
2833   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2834   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2835       && ! EQ (charset_list, Viso_2022_charset_list))
2836     {
2837       charset_list = Viso_2022_charset_list;
2838       ASET (attrs, coding_attr_charset_list, charset_list);
           /* Clear the slot so that the test below does not return
              early and the string is rebuilt for the new list.  */
2839       ASET (attrs, coding_attr_safe_charsets, Qnil);
2840     }
2841
       /* A string in the slot means the work was already done.  */
2842   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2843     return;
2844
       /* Find the largest charset id; it determines the string length.  */
2845   max_charset_id = 0;
2846   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2847     {
2848       int id = XINT (XCAR (tail));
2849       if (max_charset_id < id)
2850         max_charset_id = id;
2851     }
2852
       /* 255 is the value SAFE_CHARSET_P treats as "not safe".  */
2853   safe_charsets = make_uninit_string (max_charset_id + 1);
2854   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2855   request = AREF (attrs, coding_attr_iso_request);
2856   reg_usage = AREF (attrs, coding_attr_iso_usage);
2857   reg94 = XINT (XCAR (reg_usage));
2858   reg96 = XINT (XCDR (reg_usage));
2859
2860   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2861     {
2862       Lisp_Object id;
2863       Lisp_Object reg;
2864       struct charset *charset;
2865
2866       id = XCAR (tail);
2867       charset = CHARSET_FROM_ID (XINT (id));
           /* An explicit entry for this charset in REQUEST takes
              precedence over the REG94/REG96 defaults.  */
2868       reg = Fcdr (Fassq (id, request));
2869       if (! NILP (reg))
2870         SSET (safe_charsets, XINT (id), XINT (reg));
2871       else if (charset->iso_chars_96)
2872         {
2873           if (reg96 < 4)
2874             SSET (safe_charsets, XINT (id), reg96);
2875         }
2876       else
2877         {
2878           if (reg94 < 4)
2879             SSET (safe_charsets, XINT (id), reg94);
2880         }
2881     }
2882   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2883 }
2884
2885
2886 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2887    Return true if a text is encoded in one of ISO-2022 based coding
2888    systems.

        The scan starts CODING->head_ascii bytes into CODING->source and
        maintains two category masks: REJECTED (categories ruled out by
        what was seen) and FOUND (categories for which positive evidence
        was seen).

        If REJECTED comes to equal CATEGORY_MASK_ISO, the loop stops,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise the function finishes at the label
        `no_more_source', where REJECTED is merged into
        DETECT_INFO->rejected, FOUND without the rejected categories is
        merged into DETECT_INFO->found, and 1 is returned.  No statement
        in this function jumps to that label explicitly; presumably
        ONE_MORE_BYTE does so when the source is exhausted -- confirm
        against its definition.  */
2889
2890 static bool
2891 detect_coding_iso_2022 (struct coding_system *coding,
2892                         struct coding_detection_info *detect_info)
2893 {
2894   const unsigned char *src = coding->source, *src_base = src;
2895   const unsigned char *src_end = coding->source + coding->src_bytes;
2896   bool multibytep = coding->src_multibyte;
2897   bool single_shifting = 0;
2898   int id;
2899   int c, c1;
2900   ptrdiff_t consumed_chars = 0;
2901   int i;
2902   int rejected = 0;
2903   int found = 0;
       /* -1 while no composition is open; otherwise a running count of
          what has been seen since the composition start sequence.  */
2904   int composition_count = -1;
2905
2906   detect_info->checked |= CATEGORY_MASK_ISO;
2907
       /* Refresh the max_charset_id and safe_charsets members of every
          defined ISO category from its attributes, so that
          SAFE_CHARSET_P can be applied to them below.  */
2908   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2909     {
2910       struct coding_system *this = &(coding_categories[i]);
2911       Lisp_Object attrs, val;
2912
2913       if (this->id < 0)
2914         continue;
2915       attrs = CODING_ID_ATTRS (this->id);
2916       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2917           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2918         setup_iso_safe_charsets (attrs);
2919       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2920       this->max_charset_id = SCHARS (val) - 1;
2921       this->safe_charsets = SDATA (val);
2922     }
2923
2924   /* A coding system of this category is always ASCII compatible.  */
2925   src += coding->head_ascii;
2926
2927   while (rejected != CATEGORY_MASK_ISO)
2928     {
2929       src_base = src;
2930       ONE_MORE_BYTE (c);
2931       switch (c)
2932         {
2933         case ISO_CODE_ESC:
2934           if (inhibit_iso_escape_detection)
2935             break;
2936           single_shifting = 0;
2937           ONE_MORE_BYTE (c);
2938           if (c == 'N' || c == 'O')
2939             {
2940               /* ESC <Fe> for SS2 or SS3.  */
2941               single_shifting = 1;
2942               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2943             }
2944           else if (c == '1')
2945             {
2946               /* End of composition.  */
2947               if (composition_count < 0
2948                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2949                 /* Invalid */
2950                 break;
2951               composition_count = -1;
2952               found |= CATEGORY_MASK_ISO;
2953             }
2954           else if (c >= '0' && c <= '4')
2955             {
2956               /* ESC <Fp> for start/end composition.  */
                   /* '1' was handled by the previous branch, so C is '0',
                      '2', '3' or '4' here; (re)start the count.  */
2957               composition_count = 0;
2958             }
2959           else
2960             {
2961               if (c >= '(' && c <= '/')
2962                 {
2963                   /* Designation sequence for a charset of dimension 1.  */
2964                   ONE_MORE_BYTE (c1);
                       /* Intermediate bytes ',' .. '/' select the second
                          (c >= ',') half of the table.  */
2965                   if (c1 < ' ' || c1 >= 0x80
2966                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2967                     /* Invalid designation sequence.  Just ignore.  */
2968                     break;
2969                 }
2970               else if (c == '$')
2971                 {
2972                   /* Designation sequence for a charset of dimension 2.  */
2973                   ONE_MORE_BYTE (c);
2974                   if (c >= '@' && c <= 'B')
2975                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other designation
                            branches, ID is not tested for a negative value
                            here before it reaches SAFE_CHARSET_P below.  */
2976                     id = iso_charset_table[1][0][c];
2977                   else if (c >= '(' && c <= '/')
2978                     {
2979                       ONE_MORE_BYTE (c1);
2980                       if (c1 < ' ' || c1 >= 0x80
2981                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2982                         /* Invalid designation sequence.  Just ignore.  */
2983                         break;
2984                     }
2985                   else
2986                     /* Invalid designation sequence.  Just ignore it.  */
2987                     break;
2988                 }
2989               else
2990                 {
2991                   /* Invalid escape sequence.  Just ignore it.  */
2992                   break;
2993                 }
2994
2995               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found if the designated charset is safe for it, and
                      rejected otherwise.  */
2996               rejected |= CATEGORY_MASK_ISO_8BIT;
2997               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2998                                   id))
2999                 found |= CATEGORY_MASK_ISO_7;
3000               else
3001                 rejected |= CATEGORY_MASK_ISO_7;
3002               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3003                                   id))
3004                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3005               else
3006                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3007               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3008                                   id))
3009                 found |= CATEGORY_MASK_ISO_7_ELSE;
3010               else
3011                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3012               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3013                                   id))
3014                 found |= CATEGORY_MASK_ISO_8_ELSE;
3015               else
3016                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3017             }
3018           break;
3019
3020         case ISO_CODE_SO:
3021         case ISO_CODE_SI:
3022           /* Locking shift out/in.  */
3023           if (inhibit_iso_escape_detection)
3024             break;
3025           single_shifting = 0;
3026           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3027           break;
3028
3029         case ISO_CODE_CSI:
3030           /* Control sequence introducer.  */
3031           single_shifting = 0;
3032           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3033           found |= CATEGORY_MASK_ISO_8_ELSE;
3034           goto check_extra_latin;
3035
3036         case ISO_CODE_SS2:
3037         case ISO_CODE_SS3:
3038           /* Single shift.   */
3039           if (inhibit_iso_escape_detection)
3040             break;
3041           single_shifting = 0;
3042           rejected |= CATEGORY_MASK_ISO_7BIT;
               /* The byte counts as a single shift only if the iso-8-1 or
                  iso-8-2 category has the single-shift flag; otherwise it
                  is checked as a possible Latin extra code.  */
3043           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3044               & CODING_ISO_FLAG_SINGLE_SHIFT)
3045             {
3046               found |= CATEGORY_MASK_ISO_8_1;
3047               single_shifting = 1;
3048             }
3049           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3050               & CODING_ISO_FLAG_SINGLE_SHIFT)
3051             {
3052               found |= CATEGORY_MASK_ISO_8_2;
3053               single_shifting = 1;
3054             }
3055           if (single_shifting)
3056             break;
3057           goto check_extra_latin;
3058
3059         default:
3060           if (c < 0)
3061             continue;
3062           if (c < 0x80)
3063             {
3064               if (composition_count >= 0)
3065                 composition_count++;
3066               single_shifting = 0;
3067               break;
3068             }
3069           if (c >= 0xA0)
3070             {
3071               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3072               found |= CATEGORY_MASK_ISO_8_1;
3073               /* Check the length of succeeding codes of the range
3074                  0xA0..0xFF.  If the byte length is even, we include
3075                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3076                  only when we are not single shifting.  */
3077               if (! single_shifting
3078                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3079                 {
                       /* LEN counts the byte just read plus the following
                          bytes that are >= 0xA0.  */
3080                   int len = 1;
3081                   while (src < src_end)
3082                     {
3083                       src_base = src;
3084                       ONE_MORE_BYTE (c);
3085                       if (c < 0xA0)
3086                         {
                               /* Not part of the run: move SRC back so the
                                  outer loop reads this byte again.  */
3087                           src = src_base;
3088                           break;
3089                         }
3090                       len++;
3091                     }
3092
                       /* An odd-length run rules out iso-8-2 only when the
                          run ended before the end of the source.  */
3093                   if (len & 1 && src < src_end)
3094                     {
3095                       rejected |= CATEGORY_MASK_ISO_8_2;
3096                       if (composition_count >= 0)
3097                         composition_count += len;
3098                     }
3099                   else
3100                     {
3101                       found |= CATEGORY_MASK_ISO_8_2;
3102                       if (composition_count >= 0)
3103                         composition_count += len / 2;
3104                     }
3105                 }
3106               break;
3107             }
               /* Reached by falling through for the remaining values of C
                  (0x80 and above but below 0xA0), and by the gotos in the
                  CSI and SS2/SS3 cases.  */
3108         check_extra_latin:
               /* Without a table entry for C, no ISO category can match:
                  reject them all, which also ends the loop.  */
3109           if (! VECTORP (Vlatin_extra_code_table)
3110               || NILP (AREF (Vlatin_extra_code_table, c)))
3111             {
3112               rejected = CATEGORY_MASK_ISO;
3113               break;
3114             }
3115           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116               & CODING_ISO_FLAG_LATIN_EXTRA)
3117             found |= CATEGORY_MASK_ISO_8_1;
3118           else
3119             rejected |= CATEGORY_MASK_ISO_8_1;
3120           rejected |= CATEGORY_MASK_ISO_8_2;
3121           break;
3122         }
3123     }
       /* The loop ends only when every ISO category has been rejected.  */
3124   detect_info->rejected |= CATEGORY_MASK_ISO;
3125   return 0;
3126
3127  no_more_source:
3128   detect_info->rejected |= rejected;
3129   detect_info->found |= (found & ~rejected);
3130   return 1;
3131 }
3132
3133
3134 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3135    escape sequence should be kept.

        The charset id is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is outside '0'..127, the lookup yields a
        negative id, or the charset is not safe for CODING, the
        designation of REG is set to -2, CHARS_96 is set to -1, and the
        `break' leaves the macro's own do-while.

        Otherwise the id is stored as the designation of REG, after
        mapping charset_jisx0201_roman to charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set and charset_jisx0208_1978 to
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        The macro uses the variable `coding' from the scope where it is
        expanded, assigns to CHARS_96 (which must therefore be an
        lvalue), and declares its own locals `id' and `prev'.  */
3136 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3137   do {                                                                  \
3138     int id, prev;                                                       \
3139                                                                         \
3140     if (final < '0' || final >= 128                                     \
3141         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3142         || !SAFE_CHARSET_P (coding, id))                                \
3143       {                                                                 \
3144         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3145         chars_96 = -1;                                                  \
3146         break;                                                          \
3147       }                                                                 \
3148     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3149     if (id == charset_jisx0201_roman)                                   \
3150       {                                                                 \
3151         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3152           id = charset_ascii;                                           \
3153       }                                                                 \
3154     else if (id == charset_jisx0208_1978)                               \
3155       {                                                                 \
3156         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3157           id = charset_jisx0208;                                        \
3158       }                                                                 \
3159     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3160     /* If there was an invalid designation to REG previously, and this  \
3161        designation is ASCII to REG, we should keep this designation     \
3162        sequence.  */                                                    \
3163     if (prev == -2 && id == charset_ascii)                              \
3164       chars_96 = -1;                                                    \
3165   } while (0)
3166
3167
3168 /* Handle these composition sequences (ALT: alternate char):
3169
3170    (1) relative composition: ESC 0 CHAR ... ESC 1
3171    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3172    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3173    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3174
3175    When the start sequence (ESC 0/2/3/4) is found, this annotation
3176    header is produced.
3177
3178         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3179
3180    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3181    produced until the end sequence (ESC 1) is found:
3182
3183    (1) CHAR ... CHAR
3184    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3185    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3186    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3187
3188    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3189    annotation header are updated as below:
3190
3191    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3192    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3193    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3194    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3195
3196    If an error is found while composing, the annotation header is
3197    changed to:
3198
3199         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3200
3201    and the sequence [ -2 DECODED-RULE ] is changed to the original
3202    byte sequence as below:
3203         o the original byte sequence is B: [ B -1 ]
3204         o the original byte sequence is B1 B2: [ B1 B2 ]
3205    and the sequence [ -1 -1 ] is changed to the original byte
3206    sequence:
3207         [ ESC '0' ]
3208 */
3209
3210 /* Decode a composition rule C1 and maybe one more byte from the
3211    source, and set RULE to the encoded composition rule.  If the rule
3212    is invalid, goto invalid_code.

        C1 and the label invalid_code come from the scope where the
        macro is expanded.  C1 - 32 selects the format:

          o less than 81 (old, one-byte format): the value is split
            into GREF = value / 9 and NREF = value % 9, a 4 in either
            one is replaced by 10, and RULE becomes
            COMPOSITION_ENCODE_RULE (GREF, NREF).
          o otherwise (new, two-byte format): one more byte B is read
            with ONE_MORE_BYTE, and RULE becomes
            COMPOSITION_ENCODE_RULE (C1 - 32 - 81, B - 32) plus 0x100,
            provided that pair passes COMPOSITION_ENCODE_RULE_VALID.

        ENCODE_COMPOSITION_RULE tests RULE against 0x100 to tell the
        two formats apart.  */
3213
3214 #define DECODE_COMPOSITION_RULE(rule)                                   \
3215   do {                                                                  \
3216     rule = c1 - 32;                                                     \
3217     if (rule < 0)                                                       \
3218       goto invalid_code;                                                \
3219     if (rule < 81)              /* old format (before ver.21) */        \
3220       {                                                                 \
3221         int gref = (rule) / 9;                                          \
3222         int nref = (rule) % 9;                                          \
3223         if (gref == 4) gref = 10;                                       \
3224         if (nref == 4) nref = 10;                                       \
3225         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3226       }                                                                 \
3227     else                        /* new format (after ver.21) */         \
3228       {                                                                 \
3229         int b;                                                          \
3230                                                                         \
3231         ONE_MORE_BYTE (b);                                              \
3232         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3233           goto invalid_code;                                            \
3234         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3235         rule += 0x100;   /* Distinguish it from the old format.  */     \
3236       }                                                                 \
3237   } while (0)
3238
3239 #define ENCODE_COMPOSITION_RULE(rule)                           \
       /* Inverse of DECODE_COMPOSITION_RULE: store at CHARBUF[IDX]  \
          and CHARBUF[IDX + 1] the byte(s) from which the decoded    \
          rule RULE was read, counting each byte in NEW_CHARS.  A    \
          one-byte (old format) rule is followed by a -1 filler.     \
          CHARBUF, IDX and NEW_CHARS are variables of the scope      \
          where the macro is expanded.  */                           \
3240   do {                                                          \
         /* NOTE(review): the divisor 12 presumably matches the      \
            packing of COMPOSITION_ENCODE_RULE -- confirm.  */       \
3241     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3242                                                                 \
3243     if (rule < 0x100)           /* old format */                \
3244       {                                                         \
3245         if (gref == 10) gref = 4;                               \
3246         if (nref == 10) nref = 4;                               \
3247         charbuf[idx] = 32 + gref * 9 + nref;                    \
3248         charbuf[idx + 1] = -1;                                  \
3249         new_chars++;                                            \
3250       }                                                         \
3251     else                                /* new format */        \
3252       {                                                         \
3253         charbuf[idx] = 32 + 81 + gref;                          \
3254         charbuf[idx + 1] = 32 + nref;                           \
3255         new_chars += 2;                                         \
3256       }                                                         \
3257   } while (0)
3258
3259 /* Finish the current composition as invalid.

        The last CMP_STATUS->length elements before CHARBUF are the data
        recorded for the composition, so CHARBUF[-LENGTH] is the first
        element of its annotation header.  The five header elements are
        overwritten with ESC, the digit of the start sequence, -2, 0 and
        -1.  For the methods that are >= COMPOSITION_WITH_RULE, the rest
        is scanned as well: each [ -2 DECODED-RULE ] pair is turned back
        into the original rule byte(s) by ENCODE_COMPOSITION_RULE, and
        each [ -1 -1 ] pair into [ ESC '0' ].

        Set CMP_STATUS->state to COMPOSING_NO and return
        CMP_STATUS->nchars plus one for each byte restored by the scan;
        MAYBE_FINISH_COMPOSITION adds that value to its char_offset.  */
3260
3261 static int
3262 finish_composition (int *charbuf, struct composition_status *cmp_status)
3263 {
       /* Index, relative to CHARBUF, of the first header element.  */
3264   int idx = - cmp_status->length;
3265   int new_chars;
3266
3267   /* Recover the original ESC sequence */
3268   charbuf[idx++] = ISO_CODE_ESC;
3269   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3270                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3271                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3272                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3273                     : '4');
       /* Fill the other three header elements.  NOTE(review): -2 0 and
          -1 are presumably read later as annotations to be skipped --
          confirm.  */
3274   charbuf[idx++] = -2;
3275   charbuf[idx++] = 0;
3276   charbuf[idx++] = -1;
3277   new_chars = cmp_status->nchars;
       /* Only these methods can have stored rule or [ -1 -1 ] markers.  */
3278   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3279     for (; idx < 0; idx++)
3280       {
3281         int elt = charbuf[idx];
3282
3283         if (elt == -2)
3284           {
3285             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
                 /* Step over the second element just written (a rule
                    byte or the -1 filler) so that the filler is not
                    taken for a [ -1 -1 ] marker.  */
3286             idx++;
3287           }
3288         else if (elt == -1)
3289           {
                 /* [ -1 -1 ] stands for the ESC 0 that ended the ALTs.  */
3290             charbuf[idx++] = ISO_CODE_ESC;
3291             charbuf[idx] = '0';
3292             new_chars += 2;
3293           }
3294       }
3295   cmp_status->state = COMPOSING_NO;
3296   return new_chars;
3297 }
3298
3299 /* If characters are under composition, finish the composition.
        That is, when CMP_STATUS->state is not COMPOSING_NO, call
        finish_composition and add its return value to CHAR_OFFSET.
        CMP_STATUS, CHARBUF and CHAR_OFFSET are variables of the scope
        where the macro is expanded.  */
3300 #define MAYBE_FINISH_COMPOSITION()                              \
3301   do {                                                          \
3302     if (cmp_status->state != COMPOSING_NO)                      \
3303       char_offset += finish_composition (charbuf, cmp_status);  \
3304   } while (0)
3305
3306 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3307
3308    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3309    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3310    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3311    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3312
3313    Produce this annotation sequence now:
3314
3315    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        (NOTE(review): the header is written by ADD_COMPOSITION_DATA
        with the arguments 0, 0 and METHOD, and finish_composition
        rewrites five header elements; LENGTH == 5 assumes that
        MAX_ANNOTATION_LENGTH is 5 -- confirm.)

        C1 is the byte following ESC.  When C1 is '0' and the ALT part
        of an altchar or alt&rule composition is being read, ESC 0 does
        not start a new composition: [ -1 -1 ] is stored, the state
        becomes COMPOSING_CHAR and LENGTH grows by 2.  In any other
        case a composition still in progress is finished as invalid,
        METHOD is chosen from C1, the state becomes COMPOSING_CHAR (for
        '0' and '2') or COMPOSING_COMPONENT_CHAR (for '3' and '4'), the
        header above is stored, the counters of CMP_STATUS are reset,
        and coding->annotated is set.

        CMP_STATUS, CHARBUF, CHAR_OFFSET and the variable coding come
        from the scope where the macro is expanded.
3316 */
3317
3318 #define DECODE_COMPOSITION_START(c1)                                       \
3319   do {                                                                     \
3320     if (c1 == '0'                                                          \
3321         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3322              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3323             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3324                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3325       {                                                                    \
             /* ESC 0 inside an altchar composition ends the ALT list.  */      \
3326         *charbuf++ = -1;                                                   \
3327         *charbuf++= -1;                                                    \
3328         cmp_status->state = COMPOSING_CHAR;                                \
3329         cmp_status->length += 2;                                           \
3330       }                                                                    \
3331     else                                                                   \
3332       {                                                                    \
             /* Otherwise a new composition starts here.  */                    \
3333         MAYBE_FINISH_COMPOSITION ();                                       \
3334         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3335                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3336                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3337                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3338         cmp_status->state                                                  \
3339           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3340         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3341         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3342         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3343         coding->annotated = 1;                                             \
3344       }                                                                    \
3345   } while (0)
3346
3347
3348 /* Handle composition end sequence ESC 1.

        The sequence is rejected (the composition is finished as invalid
        and control goes to invalid_code) when no CHAR has been stored
        yet, or when the state does not fit the method: for
        COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR, for
        the other methods it must be COMPOSING_CHAR.

        Otherwise the annotation header at CHARBUF[-LENGTH] is completed:
        NCOMPS + 2 (COMPOSITION_WITH_ALTCHARS) or NCOMPS * 3
        (COMPOSITION_WITH_RULE_ALTCHARS) is subtracted from its first
        element, NCHARS is stored in its third element, CHAR_OFFSET is
        advanced by NCHARS, and the state becomes COMPOSING_NO.

        CMP_STATUS, CHARBUF, CHAR_OFFSET and the label invalid_code come
        from the scope where the macro is expanded.  */
3349
3350 #define DECODE_COMPOSITION_END()                                        \
3351   do {                                                                  \
3352     if (cmp_status->nchars == 0                                         \
3353         || ((cmp_status->state == COMPOSING_CHAR)                       \
3354             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3355       {                                                                 \
3356         MAYBE_FINISH_COMPOSITION ();                                    \
3357         goto invalid_code;                                              \
3358       }                                                                 \
3359     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3360       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3361     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3362       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3363     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3364     char_offset += cmp_status->nchars;                                  \
3365     cmp_status->state = COMPOSING_NO;                                   \
3366   } while (0)
3367
3368 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The rule is stored as the pair [ -2 RULE ], the recorded length
        grows by 2, and the state is decremented, which undoes the
        increment done by STORE_COMPOSITION_CHAR after the preceding
        character.  CHARBUF and CMP_STATUS are variables of the scope
        where the macro is expanded.  */
3369
3370 #define STORE_COMPOSITION_RULE(rule)    \
3371   do {                                  \
3372     *charbuf++ = -2;                    \
3373     *charbuf++ = rule;                  \
3374     cmp_status->length += 2;            \
3375     cmp_status->state--;                \
3376   } while (0)
3377
3378 /* Store a composed char or a component char C in charbuf, and update
3379    cmp_status.

        The recorded length grows by 1.  C is counted in NCHARS when the
        state is COMPOSING_CHAR and in NCOMPS otherwise.  For
        COMPOSITION_WITH_RULE, and for COMPOSITION_WITH_RULE_ALTCHARS
        while the state is COMPOSING_COMPONENT_CHAR, the state is then
        incremented (NOTE(review): presumably to the matching
        rule-expecting state -- confirm against the enum);
        STORE_COMPOSITION_RULE decrements it again.  CHARBUF and
        CMP_STATUS are variables of the scope where the macro is
        expanded.  */
3380
3381 #define STORE_COMPOSITION_CHAR(c)                                       \
3382   do {                                                                  \
3383     *charbuf++ = (c);                                                   \
3384     cmp_status->length++;                                               \
3385     if (cmp_status->state == COMPOSING_CHAR)                            \
3386       cmp_status->nchars++;                                             \
3387     else                                                                \
3388       cmp_status->ncomps++;                                             \
3389     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3390         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3391             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3392       cmp_status->state++;                                              \
3393   } while (0)
3394
3395
3396 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3397
3398 static void
3399 decode_coding_iso_2022 (struct coding_system *coding)
3400 {
3401   const unsigned char *src = coding->source + coding->consumed;
3402   const unsigned char *src_end = coding->source + coding->src_bytes;
3403   const unsigned char *src_base;
3404   int *charbuf = coding->charbuf + coding->charbuf_used;
3405   /* We may produce two annotations (charset and composition) in one
3406      loop and one more charset annotation at the end.  */
3407   int *charbuf_end
3408     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3409   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3410   bool multibytep = coding->src_multibyte;
3411   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3412   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3413   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3414   int charset_id_2, charset_id_3;
3415   struct charset *charset;
3416   int c;
3417   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3418   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3419   ptrdiff_t char_offset = coding->produced_char;
3420   ptrdiff_t last_offset = char_offset;
3421   int last_id = charset_ascii;
3422   bool eol_dos
3423     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3424   int byte_after_cr = -1;
3425   int i;
3426
3427   setup_iso_safe_charsets (attrs);
3428   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3429
3430   if (cmp_status->state != COMPOSING_NO)
3431     {
3432       if (charbuf_end - charbuf < cmp_status->length)
3433         emacs_abort ();
3434       for (i = 0; i < cmp_status->length; i++)
3435         *charbuf++ = cmp_status->carryover[i];
3436       coding->annotated = 1;
3437     }
3438
3439   while (1)
3440     {
3441       int c1, c2, c3;
3442
3443       src_base = src;
3444       consumed_chars_base = consumed_chars;
3445
3446       if (charbuf >= charbuf_end)
3447         {
3448           if (byte_after_cr >= 0)
3449             src_base--;
3450           break;
3451         }
3452
3453       if (byte_after_cr >= 0)
3454         c1 = byte_after_cr, byte_after_cr = -1;
3455       else
3456         ONE_MORE_BYTE (c1);
3457       if (c1 < 0)
3458         goto invalid_code;
3459
3460       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3461         {
3462           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3463           char_offset++;
3464           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3465           continue;
3466         }
3467
3468       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3469         {
3470           if (c1 == ISO_CODE_ESC)
3471             {
3472               if (src + 1 >= src_end)
3473                 goto no_more_source;
3474               *charbuf++ = ISO_CODE_ESC;
3475               char_offset++;
3476               if (src[0] == '%' && src[1] == '@')
3477                 {
3478                   src += 2;
3479                   consumed_chars += 2;
3480                   char_offset += 2;
3481                   /* We are sure charbuf can contain two more chars. */
3482                   *charbuf++ = '%';
3483                   *charbuf++ = '@';
3484                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3485                 }
3486             }
3487           else
3488             {
3489               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3490               char_offset++;
3491             }
3492           continue;
3493         }
3494
3495       if ((cmp_status->state == COMPOSING_RULE
3496            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3497           && c1 != ISO_CODE_ESC)
3498         {
3499           int rule;
3500
3501           DECODE_COMPOSITION_RULE (rule);
3502           STORE_COMPOSITION_RULE (rule);
3503           continue;
3504         }
3505
3506       /* We produce at most one character.  */
3507       switch (iso_code_class [c1])
3508         {
3509         case ISO_0x20_or_0x7F:
3510           if (charset_id_0 < 0
3511               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3512             /* This is SPACE or DEL.  */
3513             charset = CHARSET_FROM_ID (charset_ascii);
3514           else
3515             charset = CHARSET_FROM_ID (charset_id_0);
3516           break;
3517
3518         case ISO_graphic_plane_0:
3519           if (charset_id_0 < 0)
3520             charset = CHARSET_FROM_ID (charset_ascii);
3521           else
3522             charset = CHARSET_FROM_ID (charset_id_0);
3523           break;
3524
3525         case ISO_0xA0_or_0xFF:
3526           if (charset_id_1 < 0
3527               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3528               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3529             goto invalid_code;
3530           /* This is a graphic character, we fall down ... */
3531
3532         case ISO_graphic_plane_1:
3533           if (charset_id_1 < 0)
3534             goto invalid_code;
3535           charset = CHARSET_FROM_ID (charset_id_1);
3536           break;
3537
3538         case ISO_control_0:
3539           if (eol_dos && c1 == '\r')
3540             ONE_MORE_BYTE (byte_after_cr);
3541           MAYBE_FINISH_COMPOSITION ();
3542           charset = CHARSET_FROM_ID (charset_ascii);
3543           break;
3544
3545         case ISO_control_1:
3546           goto invalid_code;
3547
3548         case ISO_shift_out:
3549           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3550               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3551             goto invalid_code;
3552           CODING_ISO_INVOCATION (coding, 0) = 1;
3553           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3554           continue;
3555
3556         case ISO_shift_in:
3557           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3558             goto invalid_code;
3559           CODING_ISO_INVOCATION (coding, 0) = 0;
3560           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3561           continue;
3562
3563         case ISO_single_shift_2_7:
3564           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3565             goto invalid_code;
3566         case ISO_single_shift_2:
3567           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3568             goto invalid_code;
3569           /* SS2 is handled as an escape sequence of ESC 'N' */
3570           c1 = 'N';
3571           goto label_escape_sequence;
3572
3573         case ISO_single_shift_3:
3574           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3575             goto invalid_code;
3576           /* SS3 is handled as an escape sequence of ESC 'O' */
3577           c1 = 'O';
3578           goto label_escape_sequence;
3579
3580         case ISO_control_sequence_introducer:
3581           /* CSI is handled as an escape sequence of ESC '[' ...  */
3582           c1 = '[';
3583           goto label_escape_sequence;
3584
3585         case ISO_escape:
3586           ONE_MORE_BYTE (c1);
3587         label_escape_sequence:
3588           /* Escape sequences handled here are invocation,
3589              designation, direction specification, and character
3590              composition specification.  */
3591           switch (c1)
3592             {
3593             case '&':           /* revision of following character set */
3594               ONE_MORE_BYTE (c1);
3595               if (!(c1 >= '@' && c1 <= '~'))
3596                 goto invalid_code;
3597               ONE_MORE_BYTE (c1);
3598               if (c1 != ISO_CODE_ESC)
3599                 goto invalid_code;
3600               ONE_MORE_BYTE (c1);
3601               goto label_escape_sequence;
3602
3603             case '$':           /* designation of 2-byte character set */
3604               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3605                 goto invalid_code;
3606               {
3607                 int reg, chars96;
3608
3609                 ONE_MORE_BYTE (c1);
3610                 if (c1 >= '@' && c1 <= 'B')
3611                   {     /* designation of JISX0208.1978, GB2312.1980,
3612                            or JISX0208.1980 */
3613                     reg = 0, chars96 = 0;
3614                   }
3615                 else if (c1 >= 0x28 && c1 <= 0x2B)
3616                   { /* designation of DIMENSION2_CHARS94 character set */
3617                     reg = c1 - 0x28, chars96 = 0;
3618                     ONE_MORE_BYTE (c1);
3619                   }
3620                 else if (c1 >= 0x2C && c1 <= 0x2F)
3621                   { /* designation of DIMENSION2_CHARS96 character set */
3622                     reg = c1 - 0x2C, chars96 = 1;
3623                     ONE_MORE_BYTE (c1);
3624                   }
3625                 else
3626                   goto invalid_code;
3627                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3628                 /* We must update these variables now.  */
3629                 if (reg == 0)
3630                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3631                 else if (reg == 1)
3632                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3633                 if (chars96 < 0)
3634                   goto invalid_code;
3635               }
3636               continue;
3637
3638             case 'n':           /* invocation of locking-shift-2 */
3639               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3640                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3641                 goto invalid_code;
3642               CODING_ISO_INVOCATION (coding, 0) = 2;
3643               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644               continue;
3645
3646             case 'o':           /* invocation of locking-shift-3 */
3647               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3648                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3649                 goto invalid_code;
3650               CODING_ISO_INVOCATION (coding, 0) = 3;
3651               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3652               continue;
3653
3654             case 'N':           /* invocation of single-shift-2 */
3655               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3656                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3657                 goto invalid_code;
3658               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3659               if (charset_id_2 < 0)
3660                 charset = CHARSET_FROM_ID (charset_ascii);
3661               else
3662                 charset = CHARSET_FROM_ID (charset_id_2);
3663               ONE_MORE_BYTE (c1);
3664               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3665                 goto invalid_code;
3666               break;
3667
3668             case 'O':           /* invocation of single-shift-3 */
3669               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3670                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3671                 goto invalid_code;
3672               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3673               if (charset_id_3 < 0)
3674                 charset = CHARSET_FROM_ID (charset_ascii);
3675               else
3676                 charset = CHARSET_FROM_ID (charset_id_3);
3677               ONE_MORE_BYTE (c1);
3678               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3679                 goto invalid_code;
3680               break;
3681
3682             case '0': case '2': case '3': case '4': /* start composition */
3683               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3684                 goto invalid_code;
3685               if (last_id != charset_ascii)
3686                 {
3687                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3688                   last_id = charset_ascii;
3689                   last_offset = char_offset;
3690                 }
3691               DECODE_COMPOSITION_START (c1);
3692               continue;
3693
3694             case '1':           /* end composition */
3695               if (cmp_status->state == COMPOSING_NO)
3696                 goto invalid_code;
3697               DECODE_COMPOSITION_END ();
3698               continue;
3699
3700             case '[':           /* specification of direction */
3701               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3702                 goto invalid_code;
3703               /* For the moment, nested direction is not supported.
3704                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3705                  left-to-right, and nonzero means right-to-left.  */
3706               ONE_MORE_BYTE (c1);
3707               switch (c1)
3708                 {
3709                 case ']':       /* end of the current direction */
3710                   coding->mode &= ~CODING_MODE_DIRECTION;
3711
3712                 case '0':       /* end of the current direction */
3713                 case '1':       /* start of left-to-right direction */
3714                   ONE_MORE_BYTE (c1);
3715                   if (c1 == ']')
3716                     coding->mode &= ~CODING_MODE_DIRECTION;
3717                   else
3718                     goto invalid_code;
3719                   break;
3720
3721                 case '2':       /* start of right-to-left direction */
3722                   ONE_MORE_BYTE (c1);
3723                   if (c1 == ']')
3724                     coding->mode |= CODING_MODE_DIRECTION;
3725                   else
3726                     goto invalid_code;
3727                   break;
3728
3729                 default:
3730                   goto invalid_code;
3731                 }
3732               continue;
3733
3734             case '%':
3735               ONE_MORE_BYTE (c1);
3736               if (c1 == '/')
3737                 {
3738                   /* CTEXT extended segment:
3739                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3740                      We keep these bytes as is for the moment.
3741                      They may be decoded by post-read-conversion.  */
3742                   int dim, M, L;
3743                   int size;
3744
3745                   ONE_MORE_BYTE (dim);
3746                   if (dim < '0' || dim > '4')
3747                     goto invalid_code;
3748                   ONE_MORE_BYTE (M);
3749                   if (M < 128)
3750                     goto invalid_code;
3751                   ONE_MORE_BYTE (L);
3752                   if (L < 128)
3753                     goto invalid_code;
3754                   size = ((M - 128) * 128) + (L - 128);
3755                   if (charbuf + 6 > charbuf_end)
3756                     goto break_loop;
3757                   *charbuf++ = ISO_CODE_ESC;
3758                   *charbuf++ = '%';
3759                   *charbuf++ = '/';
3760                   *charbuf++ = dim;
3761                   *charbuf++ = BYTE8_TO_CHAR (M);
3762                   *charbuf++ = BYTE8_TO_CHAR (L);
3763                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3764                 }
3765               else if (c1 == 'G')
3766                 {
3767                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3768                      ESC % G --UTF-8-BYTES-- ESC % @
3769                      We keep these bytes as is for the moment.
3770                      They may be decoded by post-read-conversion.  */
3771                   if (charbuf + 3 > charbuf_end)
3772                     goto break_loop;
3773                   *charbuf++ = ISO_CODE_ESC;
3774                   *charbuf++ = '%';
3775                   *charbuf++ = 'G';
3776                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3777                 }
3778               else
3779                 goto invalid_code;
3780               continue;
3781               break;
3782
3783             default:
3784               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3785                 goto invalid_code;
3786               {
3787                 int reg, chars96;
3788
3789                 if (c1 >= 0x28 && c1 <= 0x2B)
3790                   { /* designation of DIMENSION1_CHARS94 character set */
3791                     reg = c1 - 0x28, chars96 = 0;
3792                     ONE_MORE_BYTE (c1);
3793                   }
3794                 else if (c1 >= 0x2C && c1 <= 0x2F)
3795                   { /* designation of DIMENSION1_CHARS96 character set */
3796                     reg = c1 - 0x2C, chars96 = 1;
3797                     ONE_MORE_BYTE (c1);
3798                   }
3799                 else
3800                   goto invalid_code;
3801                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3802                 /* We must update these variables now.  */
3803                 if (reg == 0)
3804                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3805                 else if (reg == 1)
3806                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3807                 if (chars96 < 0)
3808                   goto invalid_code;
3809               }
3810               continue;
3811             }
3812           break;
3813
3814         default:
3815           emacs_abort ();
3816         }
3817
3818       if (cmp_status->state == COMPOSING_NO
3819           && charset->id != charset_ascii
3820           && last_id != charset->id)
3821         {
3822           if (last_id != charset_ascii)
3823             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3824           last_id = charset->id;
3825           last_offset = char_offset;
3826         }
3827
3828       /* Now we know CHARSET and 1st position code C1 of a character.
3829          Produce a decoded character while getting 2nd and 3rd
3830          position codes C2, C3 if necessary.  */
3831       if (CHARSET_DIMENSION (charset) > 1)
3832         {
3833           ONE_MORE_BYTE (c2);
3834           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3835               || ((c1 & 0x80) != (c2 & 0x80)))
3836             /* C2 is not in a valid range.  */
3837             goto invalid_code;
3838           if (CHARSET_DIMENSION (charset) == 2)
3839             c1 = (c1 << 8) | c2;
3840           else
3841             {
3842               ONE_MORE_BYTE (c3);
3843               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3844                   || ((c1 & 0x80) != (c3 & 0x80)))
3845                 /* C3 is not in a valid range.  */
3846                 goto invalid_code;
3847               c1 = (c1 << 16) | (c2 << 8) | c2;
3848             }
3849         }
3850       c1 &= 0x7F7F7F;
3851       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3852       if (c < 0)
3853         {
3854           MAYBE_FINISH_COMPOSITION ();
3855           for (; src_base < src; src_base++, char_offset++)
3856             {
3857               if (ASCII_BYTE_P (*src_base))
3858                 *charbuf++ = *src_base;
3859               else
3860                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3861             }
3862         }
3863       else if (cmp_status->state == COMPOSING_NO)
3864         {
3865           *charbuf++ = c;
3866           char_offset++;
3867         }
3868       else if ((cmp_status->state == COMPOSING_CHAR
3869                 ? cmp_status->nchars
3870                 : cmp_status->ncomps)
3871                >= MAX_COMPOSITION_COMPONENTS)
3872         {
3873           /* Too long composition.  */
3874           MAYBE_FINISH_COMPOSITION ();
3875           *charbuf++ = c;
3876           char_offset++;
3877         }
3878       else
3879         STORE_COMPOSITION_CHAR (c);
3880       continue;
3881
3882     invalid_code:
3883       MAYBE_FINISH_COMPOSITION ();
3884       src = src_base;
3885       consumed_chars = consumed_chars_base;
3886       ONE_MORE_BYTE (c);
3887       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3888       char_offset++;
3889       coding->errors++;
3890       /* Reset the invocation and designation status to the safest
3891          one; i.e. designate ASCII to the graphic register 0, and
3892          invoke that register to the graphic plane 0.  This typically
3893          helps the case that a designation sequence for ASCII "ESC (
3894          B" is somehow broken (e.g. broken by a newline).  */
3895       CODING_ISO_INVOCATION (coding, 0) = 0;
3896       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3897       charset_id_0 = charset_ascii;
3898       continue;
3899
3900     break_loop:
3901       break;
3902     }
3903
3904  no_more_source:
3905   if (cmp_status->state != COMPOSING_NO)
3906     {
3907       if (coding->mode & CODING_MODE_LAST_BLOCK)
3908         MAYBE_FINISH_COMPOSITION ();
3909       else
3910         {
3911           charbuf -= cmp_status->length;
3912           for (i = 0; i < cmp_status->length; i++)
3913             cmp_status->carryover[i] = charbuf[i];
3914         }
3915     }
3916   else if (last_id != charset_ascii)
3917     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3918   coding->consumed_char += consumed_chars_base;
3919   coding->consumed = src_base - coding->source;
3920   coding->charbuf_used = charbuf - coding->charbuf;
3921 }
3922
3923
3924 /* ISO2022 encoding stuff.  */
3925
3926 /*
3927    It is not enough to say just "ISO2022" on encoding, we have to
3928    specify more details.  In Emacs, each coding system of ISO2022
3929    variant has the following specifications:
3930         1. Initial designation to G0 thru G3.
3931         2. Allows short-form designation?
3932         3. ASCII should be designated to G0 before control characters?
3933         4. ASCII should be designated to G0 at end of line?
3934         5. 7-bit environment or 8-bit environment?
3935         6. Use locking-shift?
3936         7. Use Single-shift?
3937    And the following two are only for Japanese:
3938         8. Use ASCII in place of JIS0201-1976-Roman?
3939         9. Use JISX0208-1983 in place of JISX0208-1978?
3940    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3941    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3942    details.
3943 */
3944
3945 /* Produce codes (escape sequence) for designating CHARSET to graphic
3946    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3947    '@', 'A', or 'B' and the coding system CODING allows, produce
3948    designation sequence of short-form.

        The bytes are produced in this order:
          1. Only when CODING_ISO_FLAG_REVISION is set and the revision of
             CHARSET is not negative: ESC '&' and the byte '@' + revision.
          2. ESC.
          3. For a charset of dimension 1, one intermediate byte taken from
             "()*+" (94-char set) or ",-./" (96-char set) at index REG.
             For any other charset, '$' followed by the same kind of
             intermediate byte.  The short-form, which omits that byte, is
             used only for a 94-char set designated to REG 0 whose
             <final-char> is in '@'..'B' while CODING_ISO_FLAG_LONG_FORM is
             off.
          4. <final-char> of CHARSET.
        At last, the id of CHARSET is recorded as the designation of REG in
        CODING.  REG indexes the 4-character strings above, so it must be
        in the range 0..3.  */
3949
3950 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3951   do {                                                                  \
3952     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3953     const char *intermediate_char_94 = "()*+";                          \
3954     const char *intermediate_char_96 = ",-./";                          \
3955     int revision = -1;                                                  \
3956                                                                         \
3957     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3958       revision = CHARSET_ISO_REVISION (charset);                        \
3959                                                                         \
3960     if (revision >= 0)                                                  \
3961       {                                                                 \
3962         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3963         EMIT_ONE_BYTE ('@' + revision);                                 \
3964       }                                                                 \
3965     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3966     if (CHARSET_DIMENSION (charset) == 1)                               \
3967       {                                                                 \
3968         int b;                                                          \
3969         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3970           b = intermediate_char_94[reg];                                \
3971         else                                                            \
3972           b = intermediate_char_96[reg];                                \
3973         EMIT_ONE_ASCII_BYTE (b);                                        \
3974       }                                                                 \
3975     else                                                                \
3976       {                                                                 \
3977         EMIT_ONE_ASCII_BYTE ('$');                                      \
3978         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3979           {                                                             \
3980             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3981                 || reg != 0                                             \
3982                 || final_char < '@' || final_char > 'B')                \
3983               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3984           }                                                             \
3985         else                                                            \
3986           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3987       }                                                                 \
3988     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3989                                                                         \
3990     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3991   } while (0)
3992
3993
3994 /* The following two macros produce codes (control character or escape
3995    sequence) for ISO2022 single-shift functions (single-shift-2 and
3996    single-shift-3).

        If CODING_ISO_FLAG_SEVEN_BITS is set, the two-byte escape sequence
        is produced (ESC 'N' for single-shift-2, ESC 'O' for
        single-shift-3); otherwise the single byte ISO_CODE_SS2 or
        ISO_CODE_SS3 is produced.  Both macros then set
        CODING_ISO_SINGLE_SHIFTING (coding) to 1; the macros
        ENCODE_ISO_CHARACTER_DIMENSION1 and ENCODE_ISO_CHARACTER_DIMENSION2
        test that flag and clear it after producing one character.  Both
        macros use the variable `coding' of the place where they are
        expanded.  */
3997
3998 #define ENCODE_SINGLE_SHIFT_2                                           \
3999   do {                                                                  \
4000     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4001       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4002     else                                                                \
4003       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4004     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4005   } while (0)
4006
4007 /* Produce the single-shift-3 function: ESC 'O' if
        CODING_ISO_FLAG_SEVEN_BITS is set, otherwise the single byte
        ISO_CODE_SS3.  Then set CODING_ISO_SINGLE_SHIFTING (coding) to 1 so
        that the next character is produced in single-shifted form.  Uses
        the variable `coding' of the place where it is expanded.  */
4008 #define ENCODE_SINGLE_SHIFT_3                                           \
4009   do {                                                                  \
4010     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4011       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4012     else                                                                \
4013       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4014     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4015   } while (0)
4016
4017
4018 /* The following four macros produce codes (control character or
4019    escape sequence) for ISO2022 locking-shift functions (shift-in,
4020    shift-out, locking-shift-2, and locking-shift-3).

        Each macro also stores in CODING_ISO_INVOCATION (coding, 0) the
        number of the graphic register it invokes (0 for shift-in, 1 for
        shift-out, 2 and 3 for the two locking-shifts).  They all use the
        variable `coding' of the place where they are expanded.

        ENCODE_SHIFT_IN produces the single byte ISO_CODE_SI.  */
4021
4022 #define ENCODE_SHIFT_IN                                 \
4023   do {                                                  \
4024     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4025     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4026   } while (0)
4027
4028 /* Shift-out: produce the single byte ISO_CODE_SO and record in
        CODING_ISO_INVOCATION (coding, 0) that graphic register 1 is now
        invoked.  Uses the variable `coding' of the place where it is
        expanded.  */
4029 #define ENCODE_SHIFT_OUT                                \
4030   do {                                                  \
4031     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4032     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4033   } while (0)
4034
4035 /* Locking-shift-2: produce the escape sequence ESC 'n' and record in
        CODING_ISO_INVOCATION (coding, 0) that graphic register 2 is now
        invoked.  Uses the variable `coding' of the place where it is
        expanded.  */
4036 #define ENCODE_LOCKING_SHIFT_2                          \
4037   do {                                                  \
4038     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4039     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4040   } while (0)
4041
4042 /* Locking-shift-3: produce the escape sequence ESC 'o' and record in
        CODING_ISO_INVOCATION (coding, 0) that graphic register 3 is now
        invoked.  The final byte must be 'o'; ESC 'n' is locking-shift-2
        and would make a decoder invoke graphic register 2 instead.  Uses
        the variable `coding' of the place where it is expanded.  */
4043 #define ENCODE_LOCKING_SHIFT_3                          \
4044   do {                                                  \
4045     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4046     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4047   } while (0)
4048
4049
4050 /* Produce codes for a DIMENSION1 character whose character set is
4051    CHARSET and whose position-code is C1.  Designation and invocation
4052    sequences are also produced in advance if necessary.

        If CODING_ISO_FLAG_USE_ROMAN is set and CHARSET is ASCII, CHARSET
        is replaced by the charset of id charset_jisx0201_roman; therefore
        CHARSET must be an lvalue.

        The body is a loop that is left as soon as the byte is produced:
          - while a single-shift is pending, C1 is produced with the 8th
            bit cleared (CODING_ISO_FLAG_SEVEN_BITS) or set, and the
            pending flag is cleared;
          - if CHARSET is invoked to graphic plane 0, C1 is produced with
            the 8th bit cleared;
          - if CHARSET is invoked to graphic plane 1, C1 is produced with
            the 8th bit set;
          - otherwise encode_invocation_designation is called and the
            checks are repeated.

        C1 is parenthesized at each use so that an argument containing
        operators of lower precedence than `&' or `|' is not regrouped.
        The macro uses the variables `coding', `dst' and `produced_chars'
        of the place where it is expanded.  */
4053
4054 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4055   do {                                                                  \
4056     int id = CHARSET_ID (charset);                                      \
4057                                                                         \
4058     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4059         && id == charset_ascii)                                         \
4060       {                                                                 \
4061         id = charset_jisx0201_roman;                                    \
4062         charset = CHARSET_FROM_ID (id);                                 \
4063       }                                                                 \
4064                                                                         \
4065     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4066       {                                                                 \
4067         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4068           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4069         else                                                            \
4070           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4071         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4072         break;                                                          \
4073       }                                                                 \
4074     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4075       {                                                                 \
4076         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4077         break;                                                          \
4078       }                                                                 \
4079     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4080       {                                                                 \
4081         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4082         break;                                                          \
4083       }                                                                 \
4084     else                                                                \
4085       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4086          must invoke it, or, at first, designate it to some graphic     \
4087          register.  Then repeat the loop to actually produce the        \
4088          character.  */                                                 \
4089       dst = encode_invocation_designation (charset, coding, dst,        \
4090                                            &produced_chars);            \
4091   } while (1)
4092
4093
4094 /* Produce codes for a DIMENSION2 character whose character set is
4095    CHARSET and whose position-codes are C1 and C2.  Designation and
4096    invocation codes are also produced in advance if necessary.

        If CODING_ISO_FLAG_USE_OLDJIS is set and CHARSET is the charset of
        id charset_jisx0208, CHARSET is replaced by the charset of id
        charset_jisx0208_1978; therefore CHARSET must be an lvalue.

        The body is a loop that is left as soon as the two bytes are
        produced:
          - while a single-shift is pending, C1 and C2 are produced with
            the 8th bit cleared (CODING_ISO_FLAG_SEVEN_BITS) or set, and
            the pending flag is cleared;
          - if CHARSET is invoked to graphic plane 0, both bytes are
            produced with the 8th bit cleared;
          - if CHARSET is invoked to graphic plane 1, both bytes are
            produced with the 8th bit set;
          - otherwise encode_invocation_designation is called and the
            checks are repeated.

        The macro uses the variables `coding', `dst' and `produced_chars'
        of the place where it is expanded.  */
4097
4098 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4099   do {                                                                  \
4100     int id = CHARSET_ID (charset);                                      \
4101                                                                         \
4102     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4103         && id == charset_jisx0208)                                      \
4104       {                                                                 \
4105         id = charset_jisx0208_1978;                                     \
4106         charset = CHARSET_FROM_ID (id);                                 \
4107       }                                                                 \
4108                                                                         \
4109     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4110       {                                                                 \
4111         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4112           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4113         else                                                            \
4114           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4115         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4116         break;                                                          \
4117       }                                                                 \
4118     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4119       {                                                                 \
4120         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4121         break;                                                          \
4122       }                                                                 \
4123     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4124       {                                                                 \
4125         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4126         break;                                                          \
4127       }                                                                 \
4128     else                                                                \
4129       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4130          must invoke it, or, at first, designate it to some graphic     \
4131          register.  Then repeat the loop to actually produce the        \
4132          character.  */                                                 \
4133       dst = encode_invocation_designation (charset, coding, dst,        \
4134                                            &produced_chars);            \
4135   } while (1)
4136
4137 /* Produce codes for character C of character set CHARSET.  The code
        of C in CHARSET is obtained by CODING_ENCODE_CHAR.  If the
        dimension of CHARSET is 1, that code is produced by
        ENCODE_ISO_CHARACTER_DIMENSION1; for any other dimension it is
        split into `code >> 8' and `code & 0xFF' and produced by
        ENCODE_ISO_CHARACTER_DIMENSION2.  CHARSET must be an lvalue because
        those macros may assign to it.  The macro uses the variables
        `coding', `dst' and `dst_end' of the place where it is
        expanded.  */
4138 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4139   do {                                                                     \
4140     unsigned code;                                                         \
4141     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4142                                                                            \
4143     if (CHARSET_DIMENSION (charset) == 1)                                  \
4144       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4145     else                                                                   \
4146       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4147   } while (0)
4148
4149
4150 /* Produce designation and invocation codes at a place pointed by DST
4151    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4152    Return new DST.

        *P_NCHARS is read on entry as the running count of produced
        characters, and the updated count is stored back before
        returning.

        If CHARSET is not designated to any graphic register, it is
        designated to the register it requests (CODING_ISO_REQUEST), or to
        register 0 if it requests none.  Then, unless that register is
        already invoked to graphic plane 0 or 1, an invocation is produced:
        shift-in or shift-out for registers 0 and 1, and for registers 2
        and 3 a single-shift if CODING_ISO_FLAG_SINGLE_SHIFT is set, or
        else a locking-shift.  */
4153
4154 static unsigned char *
4155 encode_invocation_designation (struct charset *charset,
4156                                struct coding_system *coding,
4157                                unsigned char *dst, ptrdiff_t *p_nchars)
4158 {
       /* `multibytep' and `produced_chars' are not referenced directly
          below.  NOTE(review): presumably the EMIT_* macros, which are
          defined elsewhere, use them by name -- confirm before renaming.  */
4159   bool multibytep = coding->dst_multibyte;
4160   ptrdiff_t produced_chars = *p_nchars;
4161   int reg;                      /* graphic register number */
4162   int id = CHARSET_ID (charset);
4163
4164   /* At first, check designations.  */
4165   for (reg = 0; reg < 4; reg++)
4166     if (id == CODING_ISO_DESIGNATION (coding, reg))
4167       break;
4168
       /* Here REG is the register CHARSET is designated to, or 4 if
          there's no such register.  */
4169   if (reg >= 4)
4170     {
4171       /* CHARSET is not yet designated to any graphic registers.  */
4172       /* At first check the requested designation.  */
4173       reg = CODING_ISO_REQUEST (coding, id);
4174       if (reg < 0)
4175         /* Since CHARSET requests no special designation, designate it
4176            to graphic register 0.  */
4177         reg = 0;
4178
4179       ENCODE_DESIGNATION (charset, reg, coding);
4180     }
4181
4182   if (CODING_ISO_INVOCATION (coding, 0) != reg
4183       && CODING_ISO_INVOCATION (coding, 1) != reg)
4184     {
4185       /* Since the graphic register REG is not invoked to any graphic
4186          planes, invoke it to graphic plane 0.  */
4187       switch (reg)
4188         {
4189         case 0:                 /* graphic register 0 */
4190           ENCODE_SHIFT_IN;
4191           break;
4192
4193         case 1:                 /* graphic register 1 */
4194           ENCODE_SHIFT_OUT;
4195           break;
4196
4197         case 2:                 /* graphic register 2 */
                 /* A single-shift sets only the single-shifting flag; the
                    invocation of graphic plane 0 is left as is.  */
4198           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4199             ENCODE_SINGLE_SHIFT_2;
4200           else
4201             ENCODE_LOCKING_SHIFT_2;
4202           break;
4203
4204         case 3:                 /* graphic register 3 */
4205           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4206             ENCODE_SINGLE_SHIFT_3;
4207           else
4208             ENCODE_LOCKING_SHIFT_3;
4209           break;
4210         }
4211     }
4212
4213   *p_nchars = produced_chars;
4214   return dst;
4215 }
4216
4217
4218 /* Produce codes for designation and invocation to reset the graphic
4219    planes and registers to initial state.

        Shift-in is produced if graphic plane 0 does not invoke graphic
        register 0.  Then each graphic register whose initial charset is
        specified (CODING_ISO_INITIAL is not negative) and differs from the
        current designation is designated again to that initial charset.
        The invocation of graphic plane 1 is not changed.  The macro uses
        the variable `coding' of the place where it is expanded.  */
4220 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4221   do {                                                                  \
4222     int reg;                                                            \
4223     struct charset *charset;                                            \
4224                                                                         \
4225     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4226       ENCODE_SHIFT_IN;                                                  \
4227     for (reg = 0; reg < 4; reg++)                                       \
4228       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4229           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4230               != CODING_ISO_INITIAL (coding, reg)))                     \
4231         {                                                               \
4232           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4233           ENCODE_DESIGNATION (charset, reg, coding);                    \
4234         }                                                               \
4235   } while (0)
4236
4237
4238 /* Produce designation sequences of charsets in the line started from
4239    CHARBUF to a place pointed by DST, and return the number of
4240    produced bytes.  DST should not directly point a buffer text area
4241    which may be relocated by char_charset call.
4242
4243    If the current block ends before any end-of-line, we may fail to
4244    find all the necessary designations.

        The elements from CHARBUF up to (not including) CHARBUF_END are
        scanned until a newline is seen or all four graphic registers have
        got a charset.  Only a charset that requests a graphic register
        (CODING_ISO_REQUEST is not negative) is considered, and for each
        register the first such charset found is taken.  A designation
        sequence is then produced for each register whose current
        designation differs from the charset found for it.

        NOTE(review): every element that is not a newline is given to
        char_charset, including negative ones, which
        encode_coding_iso_2022 treats as annotations -- confirm that this
        is harmless.  */
4245
4246 static ptrdiff_t
4247 encode_designation_at_bol (struct coding_system *coding,
4248                            int *charbuf, int *charbuf_end,
4249                            unsigned char *dst)
4250 {
4251   unsigned char *orig = dst;
4252   struct charset *charset;
4253   /* Table of charsets to be designated to each graphic register.  */
4254   int r[4];
4255   int c, found = 0, reg;
       /* `produced_chars' and `multibytep' are not referenced directly
          below.  NOTE(review): presumably the EMIT_* macros used by
          ENCODE_DESIGNATION refer to them by name -- confirm.  The count
          in `produced_chars' is not returned to the caller.  */
4256   ptrdiff_t produced_chars = 0;
4257   bool multibytep = coding->dst_multibyte;
4258   Lisp_Object attrs;
4259   Lisp_Object charset_list;
4260
4261   attrs = CODING_ID_ATTRS (coding->id);
4262   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4263   if (EQ (charset_list, Qiso_2022))
4264     charset_list = Viso_2022_charset_list;
4265
       /* -1 means that no charset is found yet for the register.  */
4266   for (reg = 0; reg < 4; reg++)
4267     r[reg] = -1;
4268
4269   while (charbuf < charbuf_end && found < 4)
4270     {
4271       int id;
4272
4273       c = *charbuf++;
4274       if (c == '\n')
4275         break;
4276       charset = char_charset (c, charset_list, NULL);
4277       id = CHARSET_ID (charset);
4278       reg = CODING_ISO_REQUEST (coding, id);
           /* Take only the first charset found for each register.  */
4279       if (reg >= 0 && r[reg] < 0)
4280         {
4281           found++;
4282           r[reg] = id;
4283         }
4284     }
4285
4286   if (found)
4287     {
           /* Skip a register that is already designated to the charset
              found for it.  */
4288       for (reg = 0; reg < 4; reg++)
4289         if (r[reg] >= 0
4290             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4291           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4292     }
4293
4294   return dst - orig;
4295 }
4296
4297 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the CODING->charbuf_used elements of CODING->charbuf by an
        ISO2022 variant, producing bytes from CODING->destination +
        CODING->produced.  At the end of the function body,
        CODING_RESULT_SUCCESS is recorded, the beginning-of-line state is
        saved in CODING_ISO_BOL (coding), the number of produced characters
        is added to CODING->produced_char, CODING->produced is set, and 0
        is returned.

        ASSURE_DESTINATION and the EMIT_* and CODING_* macros are defined
        elsewhere in this file.  NOTE(review): presumably they use the
        locals `dst', `dst_end', `produced_chars' and `multibytep' by name,
        and ASSURE_DESTINATION may leave the function when the destination
        is short -- confirm before renaming locals or reordering
        statements.  */
4298
4299 static bool
4300 encode_coding_iso_2022 (struct coding_system *coding)
4301 {
4302   bool multibytep = coding->dst_multibyte;
4303   int *charbuf = coding->charbuf;
4304   int *charbuf_end = charbuf + coding->charbuf_used;
4305   unsigned char *dst = coding->destination + coding->produced;
4306   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4307   int safe_room = 16;
       /* True if designation sequences must be produced before the next
          character; it is set again after each end-of-line below.  */
4308   bool bol_designation
4309     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4310        && CODING_ISO_BOL (coding));
4311   ptrdiff_t produced_chars = 0;
4312   Lisp_Object attrs, eol_type, charset_list;
4313   bool ascii_compatible;
4314   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
4315   int preferred_charset_id = -1;
4316
4317   CODING_GET_INFO (coding, attrs, charset_list);
4318   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4319   if (VECTORP (eol_type))
4320     eol_type = Qunix;
4321
4322   setup_iso_safe_charsets (attrs);
4323   /* Charset list may have been changed.  */
4324   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4325   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4326
       /* An ASCII character can be produced as is only if the coding
          system is ASCII compatible and uses neither designation nor
          locking-shift.  */
4327   ascii_compatible
4328     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4329        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4330                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4331
4332   while (charbuf < charbuf_end)
4333     {
4334       ASSURE_DESTINATION (safe_room);
4335
4336       if (bol_designation)
4337         {
4338           /* We have to produce designation sequences if any now.  */
               /* They are produced into DESIG_BUF and copied afterwards,
                  as encode_designation_at_bol must not be given a pointer
                  into a buffer text area that may be relocated.
                  NOTE(review): ENCODE_DESIGNATION can produce up to 4
                  bytes per register, plus 3 bytes more when
                  CODING_ISO_FLAG_REVISION is set, and up to 4 registers
                  may be designated here -- confirm that 16 bytes and
                  SAFE_ROOM always suffice.  */
4339           unsigned char desig_buf[16];
4340           int nbytes;
4341           ptrdiff_t offset;
4342
4343           charset_map_loaded = 0;
4344           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4345                                               desig_buf);
               /* If a charset map was loaded and
                  coding_change_destination returns a nonzero offset,
                  adjust DST and DST_END by that offset.  */
4346           if (charset_map_loaded
4347               && (offset = coding_change_destination (coding)))
4348             {
4349               dst += offset;
4350               dst_end += offset;
4351             }
4352           memcpy (dst, desig_buf, nbytes);
4353           dst += nbytes;
4354           /* We are sure that designation sequences are all ASCII bytes.  */
4355           produced_chars += nbytes;
4356           bol_designation = 0;
4357           ASSURE_DESTINATION (safe_room);
4358         }
4359
4360       c = *charbuf++;
4361
4362       if (c < 0)
4363         {
4364           /* Handle an annotation.  */
               /* -C is the number of elements of the annotation counting
                  the element C itself; the next element tells the kind of
                  the annotation.  */
4365           switch (*charbuf)
4366             {
4367             case CODING_ANNOTATE_COMPOSITION_MASK:
4368               /* Not yet implemented.  */
4369               break;
4370             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Use the annotated charset id as the preferred one
                      only if it is a member of CHARSET_LIST.  */
4371               preferred_charset_id = charbuf[2];
4372               if (preferred_charset_id >= 0
4373                   && NILP (Fmemq (make_number (preferred_charset_id),
4374                                   charset_list)))
4375                 preferred_charset_id = -1;
4376               break;
4377             default:
4378               emacs_abort ();
4379             }
               /* Skip the rest of the annotation.  */
4380           charbuf += -c - 1;
4381           continue;
4382         }
4383
4384       /* Now encode the character C.  */
4385       if (c < 0x20 || c == 0x7F)
4386         {
               /* C is a control character.  */
4387           if (c == '\n'
4388               || (c == '\r' && EQ (eol_type, Qmac)))
4389             {
4390               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4391                 ENCODE_RESET_PLANE_AND_REGISTER ();
4392               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4393                 {
4394                   int i;
4395
                       /* Set the recorded designations back to the
                          initial ones without producing any bytes.  */
4396                   for (i = 0; i < 4; i++)
4397                     CODING_ISO_DESIGNATION (coding, i)
4398                       = CODING_ISO_INITIAL (coding, i);
4399                 }
4400               bol_designation = ((CODING_ISO_FLAGS (coding)
4401                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4402                                  != 0);
4403             }
4404           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4405             ENCODE_RESET_PLANE_AND_REGISTER ();
4406           EMIT_ONE_ASCII_BYTE (c);
4407         }
4408       else if (ASCII_CHAR_P (c))
4409         {
4410           if (ascii_compatible)
4411             EMIT_ONE_ASCII_BYTE (c);
4412           else
4413             {
4414               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4415               ENCODE_ISO_CHARACTER (charset, c);
4416             }
4417         }
4418       else if (CHAR_BYTE8_P (c))
4419         {
               /* Produce the byte obtained by CHAR_TO_BYTE8 as is.  */
4420           c = CHAR_TO_BYTE8 (c);
4421           EMIT_ONE_BYTE (c);
4422         }
4423       else
4424         {
4425           struct charset *charset;
4426
               /* Try the preferred charset first, and fall back to a
                  charset found in CHARSET_LIST.  */
4427           if (preferred_charset_id >= 0)
4428             {
4429               bool result;
4430
4431               charset = CHARSET_FROM_ID (preferred_charset_id);
4432               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4433               if (! result)
4434                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4435                                      NULL, charset);
4436             }
4437           else
4438             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4439                                  NULL, charset);
4440           if (!charset)
4441             {
                   /* No charset was found for C.  Produce instead
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION as an ASCII
                      character in the safe-encoding mode, or else the
                      default character of CODING.  */
4442               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4443                 {
4444                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4445                   charset = CHARSET_FROM_ID (charset_ascii);
4446                 }
4447               else
4448                 {
4449                   c = coding->default_char;
4450                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4451                                        charset_list, NULL, charset);
4452                 }
4453             }
4454           ENCODE_ISO_CHARACTER (charset, c);
4455         }
4456     }
4457
       /* At the end of the last block, reset the graphic planes and
          registers if the coding system resets them at end of line.  */
4458   if (coding->mode & CODING_MODE_LAST_BLOCK
4459       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4460     {
4461       ASSURE_DESTINATION (safe_room);
4462       ENCODE_RESET_PLANE_AND_REGISTER ();
4463     }
4464   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4465   CODING_ISO_BOL (coding) = bol_designation;
4466   coding->produced_char += produced_chars;
4467   coding->produced = dst - coding->destination;
4468   return 0;
4469 }
4470
4471 \f
4472 /*** 8. SJIS and BIG5 handlers ***/
4473
4474 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4475    quite widely.  So, for the moment, Emacs supports them in the bare
4476    C code.  But, in the future, they may be supported only by CCL.  */
4477
4478 /* SJIS is a coding system encoding three character sets: ASCII, right
4479    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4480    as is.  A character of charset katakana-jisx0201 is encoded by
4481    "position-code + 0x80".  A character of charset japanese-jisx0208
4482    is encoded in two bytes, but its two position-codes are divided and
4483    shifted so that they fit in the ranges below.
4484
4485    --- CODE RANGE of SJIS ---
4486    (character set)      (range)
4487    ASCII                0x00 .. 0x7F
4488    KATAKANA-JISX0201    0xA0 .. 0xDF
4489    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4490             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4491    -------------------------------
4492
4493 */
4494
4495 /* BIG5 is a coding system encoding two character sets: ASCII and
4496    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4497    character set and is encoded in two bytes.
4498
4499    --- CODE RANGE of BIG5 ---
4500    (character set)      (range)
4501    ASCII                0x00 .. 0x7F
4502    Big5 (1st byte)      0xA1 .. 0xFE
4503         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4504    --------------------------
4505
4506   */
4507
4508 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4509    Return true if a text is encoded in SJIS.

        CATEGORY_MASK_SJIS is always added to DETECT_INFO->checked.  The
        bytes after the first CODING->head_ascii bytes of the source are
        examined as follows:
          - a byte less than 0x80 is skipped;
          - a byte in 0x81..0x9F, or in 0xE0..MAX where MAX is 0xFC if the
            charset list of CODING has more than 3 elements and 0xEF
            otherwise, must be followed by a byte in 0x40..0xFC other than
            0x7F;
          - a byte in 0xA0..0xDF is accepted by itself;
          - any other byte is invalid.
        On an invalid byte, CATEGORY_MASK_SJIS is added to
        DETECT_INFO->rejected and 0 is returned.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
        jumps to `no_more_source' when the source is exhausted and uses
        `src', `src_end', `consumed_chars' and `multibytep' by name --
        confirm.  At that label, the category is rejected if the source
        ended after the first byte of a two-byte code in the last block;
        otherwise what was found is added to DETECT_INFO->found and 1 is
        returned.  */
4510
4511 static bool
4512 detect_coding_sjis (struct coding_system *coding,
4513                     struct coding_detection_info *detect_info)
4514 {
4515   const unsigned char *src = coding->source, *src_base;
4516   const unsigned char *src_end = coding->source + coding->src_bytes;
4517   bool multibytep = coding->src_multibyte;
4518   ptrdiff_t consumed_chars = 0;
       /* CATEGORY_MASK_SJIS once a byte of 0x80 or above was accepted.  */
4519   int found = 0;
4520   int c;
4521   Lisp_Object attrs, charset_list;
4522   int max_first_byte_of_2_byte_code;
4523
4524   CODING_GET_INFO (coding, attrs, charset_list);
4525   max_first_byte_of_2_byte_code
4526     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4527
4528   detect_info->checked |= CATEGORY_MASK_SJIS;
4529   /* A coding system of this category is always ASCII compatible.  */
4530   src += coding->head_ascii;
4531
4532   while (1)
4533     {
           /* Remember where the current code starts.  */
4534       src_base = src;
4535       ONE_MORE_BYTE (c);
4536       if (c < 0x80)
4537         continue;
4538       if ((c >= 0x81 && c <= 0x9F)
4539           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4540         {
               /* C is the first byte of a two-byte code; check the second
                  byte.  */
4541           ONE_MORE_BYTE (c);
4542           if (c < 0x40 || c == 0x7F || c > 0xFC)
4543             break;
4544           found = CATEGORY_MASK_SJIS;
4545         }
4546       else if (c >= 0xA0 && c < 0xE0)
4547         found = CATEGORY_MASK_SJIS;
4548       else
4549         break;
4550     }
       /* We come here only when an invalid byte is found.  */
4551   detect_info->rejected |= CATEGORY_MASK_SJIS;
4552   return 0;
4553
4554  no_more_source:
       /* SRC_BASE < SRC means that the source ended after the first byte
          of a two-byte code.  */
4555   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4556     {
4557       detect_info->rejected |= CATEGORY_MASK_SJIS;
4558       return 0;
4559     }
4560   detect_info->found |= found;
4561   return 1;
4562 }
4563
4564 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4565    Return true if a text is encoded in BIG5.

        The scan starts after the first CODING->head_ascii bytes of
        CODING->source.  CATEGORY_MASK_BIG5 is always added to
        DETECT_INFO->checked.  Every path that returns 0, including an
        invalid second byte, first adds the category to
        DETECT_INFO->rejected.  Otherwise 1 is returned, and the category
        is added to DETECT_INFO->found if at least one two-byte sequence
        was seen.  */
4566
4567 static bool
4568 detect_coding_big5 (struct coding_system *coding,
4569                     struct coding_detection_info *detect_info)
4570 {
       /* NOTE(review): SRC_END, MULTIBYTEP and CONSUMED_CHARS are not
          referenced directly below; presumably ONE_MORE_BYTE (defined
          elsewhere) uses them -- confirm against the macro.  */
4571   const unsigned char *src = coding->source, *src_base;
4572   const unsigned char *src_end = coding->source + coding->src_bytes;
4573   bool multibytep = coding->src_multibyte;
4574   ptrdiff_t consumed_chars = 0;
4575   int found = 0;
4576   int c;
4577
4578   detect_info->checked |= CATEGORY_MASK_BIG5;
4579   /* A coding system of this category is always ASCII compatible.  */
4580   src += coding->head_ascii;
4581
4582   while (1)
4583     {
           /* Remember where this sequence starts so that the end-of-source
              code below can tell whether a sequence was cut short.  */
4584       src_base = src;
4585       ONE_MORE_BYTE (c);
           /* Values below 0x80 need no further checking.  */
4586       if (c < 0x80)
4587         continue;
4588       if (c >= 0xA1)
4589         {
               /* First byte of a two-byte code.  An invalid second byte
                  leaves the loop so that the rejection is recorded below,
                  like every other failure in this function.  */
4590           ONE_MORE_BYTE (c);
4591           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4592             break;
4593           found = CATEGORY_MASK_BIG5;
4594         }
4595       else
4596         break;
4597     }
       /* The loop was left by `break': an invalid byte was seen.  */
4598   detect_info->rejected |= CATEGORY_MASK_BIG5;
4599   return 0;
4600
       /* NOTE(review): no `goto no_more_source' is visible in this
          function; the label is presumably the target of ONE_MORE_BYTE
          when the source is exhausted -- confirm against the macro.  */
4601  no_more_source:
4602   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4603     {
           /* Part of a sequence was read, but this is the last block, so
              the sequence cannot be completed.  */
4604       detect_info->rejected |= CATEGORY_MASK_BIG5;
4605       return 0;
4606     }
4607   detect_info->found |= found;
4608   return 1;
4609 }
4610
4611 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SJIS source bytes of CODING, starting at
        CODING->source + CODING->consumed, and append the results to
        CODING->charbuf.  The first three elements of the coding system's
        charset list are used as the roman, kana and kanji charsets, in
        that order; an optional fourth element is used for lead bytes
        above 0xEF.  A byte that does not start a valid sequence is stored
        on its own and counted in CODING->errors.  On exit
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated.  */
4612
4613 static void
4614 decode_coding_sjis (struct coding_system *coding)
4615 {
4616   const unsigned char *src = coding->source + coding->consumed;
4617   const unsigned char *src_end = coding->source + coding->src_bytes;
4618   const unsigned char *src_base;
4619   int *charbuf = coding->charbuf + coding->charbuf_used;
4620   /* We may produce one charset annotation in one loop and one more at
4621      the end.  */
4622   int *charbuf_end
4623     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4624   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4625   bool multibytep = coding->src_multibyte;
4626   struct charset *charset_roman, *charset_kanji, *charset_kana;
4627   struct charset *charset_kanji2;
4628   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
          LAST_ID describe the current run of a non-ASCII charset, which is
          reported with ADD_CHARSET_DATA when the charset changes.  */
4629   ptrdiff_t char_offset = coding->produced_char;
4630   ptrdiff_t last_offset = char_offset;
4631   int last_id = charset_ascii;
4632   bool eol_dos
4633     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 if none.  */
4634   int byte_after_cr = -1;
4635
4636   CODING_GET_INFO (coding, attrs, charset_list);
4637
       /* Pick the charsets off the list in order; the fourth is optional.  */
4638   val = charset_list;
4639   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4640   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4641   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4642   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4643
4644   while (1)
4645     {
4646       int c, c1;
4647       struct charset *charset;
4648
           /* Remember the state at the start of this sequence; it is what
              gets reported as consumed if decoding stops here, and what
              the invalid-code handler rewinds to.  */
4649       src_base = src;
4650       consumed_chars_base = consumed_chars;
4651
4652       if (charbuf >= charbuf_end)
4653         {
               /* No room left.  A byte read ahead after a CR has not been
                  processed yet, so step SRC_BASE back over it.  */
4654           if (byte_after_cr >= 0)
4655             src_base--;
4656           break;
4657         }
4658
           /* Use the byte read ahead on the previous iteration, if any.  */
4659       if (byte_after_cr >= 0)
4660         c = byte_after_cr, byte_after_cr = -1;
4661       else
4662         ONE_MORE_BYTE (c);
4663       if (c < 0)
4664         goto invalid_code;
4665       if (c < 0x80)
4666         {
               /* With DOS end-of-line conversion, read the byte following
                  a CR ahead; it is handled on the next iteration.  */
4667           if (eol_dos && c == '\r')
4668             ONE_MORE_BYTE (byte_after_cr);
4669           charset = charset_roman;
4670         }
4671       else if (c == 0x80 || c == 0xA0)
4672         goto invalid_code;
4673       else if (c >= 0xA1 && c <= 0xDF)
4674         {
4675           /* SJIS -> JISX0201-Kana */
4676           c &= 0x7F;
4677           charset = charset_kana;
4678         }
4679       else if (c <= 0xEF)
4680         {
               /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
4681           /* SJIS -> JISX0208 */
4682           ONE_MORE_BYTE (c1);
4683           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4684             goto invalid_code;
4685           c = (c << 8) | c1;
4686           SJIS_TO_JIS (c);
4687           charset = charset_kanji;
4688         }
4689       else if (c <= 0xFC && charset_kanji2)
4690         {
               /* Here C is in 0xF0..0xFC, valid only when the coding
                  system has a fourth charset.  */
4691           /* SJIS -> JISX0213-2 */
4692           ONE_MORE_BYTE (c1);
4693           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4694             goto invalid_code;
4695           c = (c << 8) | c1;
4696           SJIS_TO_JIS2 (c);
4697           charset = charset_kanji2;
4698         }
4699       else
4700         goto invalid_code;
           /* On a change to a different non-ASCII charset, report the run
              of the previous non-ASCII charset (if any) and start a new
              run at the current character offset.  */
4701       if (charset->id != charset_ascii
4702           && last_id != charset->id)
4703         {
4704           if (last_id != charset_ascii)
4705             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4706           last_id = charset->id;
4707           last_offset = char_offset;
4708         }
4709       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4710       *charbuf++ = c;
4711       char_offset++;
4712       continue;
4713
4714     invalid_code:
           /* Rewind to the start of the sequence, read just one byte again
              and store it by itself.  NOTE(review): a negative C is stored
              negated; presumably ONE_MORE_BYTE returns a multibyte
              character that way -- confirm against the macro.  */
4715       src = src_base;
4716       consumed_chars = consumed_chars_base;
4717       ONE_MORE_BYTE (c);
4718       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4719       char_offset++;
4720       coding->errors++;
4721     }
4722
       /* Reached by falling out of the loop; the label itself is
          presumably the target of ONE_MORE_BYTE when the source runs out
          (no explicit goto is visible here).  The *_BASE values are
          stored, so a partly read sequence is not reported as consumed.  */
4723  no_more_source:
4724   if (last_id != charset_ascii)
4725     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4726   coding->consumed_char += consumed_chars_base;
4727   coding->consumed = src_base - coding->source;
4728   coding->charbuf_used = charbuf - coding->charbuf;
4729 }
4730
     /* Decode BIG5 source bytes of CODING, starting at
        CODING->source + CODING->consumed, and append the results to
        CODING->charbuf.  The first two elements of the coding system's
        charset list are used as the roman and Big5 charsets, in that
        order.  A byte that does not start a valid sequence is stored on
        its own and counted in CODING->errors.  On exit CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated.  */
4731 static void
4732 decode_coding_big5 (struct coding_system *coding)
4733 {
4734   const unsigned char *src = coding->source + coding->consumed;
4735   const unsigned char *src_end = coding->source + coding->src_bytes;
4736   const unsigned char *src_base;
4737   int *charbuf = coding->charbuf + coding->charbuf_used;
4738   /* We may produce one charset annotation in one loop and one more at
4739      the end.  */
4740   int *charbuf_end
4741     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4742   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4743   bool multibytep = coding->src_multibyte;
4744   struct charset *charset_roman, *charset_big5;
4745   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
          LAST_ID describe the current run of a non-ASCII charset, which is
          reported with ADD_CHARSET_DATA when the charset changes.  */
4746   ptrdiff_t char_offset = coding->produced_char;
4747   ptrdiff_t last_offset = char_offset;
4748   int last_id = charset_ascii;
4749   bool eol_dos
4750     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 if none.  */
4751   int byte_after_cr = -1;
4752
4753   CODING_GET_INFO (coding, attrs, charset_list);
4754   val = charset_list;
4755   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4756   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4757
4758   while (1)
4759     {
4760       int c, c1;
4761       struct charset *charset;
4762
           /* Remember the state at the start of this sequence; it is what
              gets reported as consumed if decoding stops here, and what
              the invalid-code handler rewinds to.  */
4763       src_base = src;
4764       consumed_chars_base = consumed_chars;
4765
4766       if (charbuf >= charbuf_end)
4767         {
               /* No room left.  A byte read ahead after a CR has not been
                  processed yet, so step SRC_BASE back over it.  */
4768           if (byte_after_cr >= 0)
4769             src_base--;
4770           break;
4771         }
4772
           /* Use the byte read ahead on the previous iteration, if any.  */
4773       if (byte_after_cr >= 0)
4774         c = byte_after_cr, byte_after_cr = -1;
4775       else
4776         ONE_MORE_BYTE (c);
4777
4778       if (c < 0)
4779         goto invalid_code;
4780       if (c < 0x80)
4781         {
               /* With DOS end-of-line conversion, read the byte following
                  a CR ahead; it is handled on the next iteration.  */
4782           if (eol_dos && c == '\r')
4783             ONE_MORE_BYTE (byte_after_cr);
4784           charset = charset_roman;
4785         }
4786       else
4787         {
4788           /* BIG5 -> Big5 */
               /* First byte must be in 0xA1..0xFE; second byte must be in
                  0x40..0x7E or 0xA1..0xFE.  The two bytes are combined
                  into one 16-bit code.  */
4789           if (c < 0xA1 || c > 0xFE)
4790             goto invalid_code;
4791           ONE_MORE_BYTE (c1);
4792           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4793             goto invalid_code;
4794           c = c << 8 | c1;
4795           charset = charset_big5;
4796         }
           /* On a change to a different non-ASCII charset, report the run
              of the previous non-ASCII charset (if any) and start a new
              run at the current character offset.  */
4797       if (charset->id != charset_ascii
4798           && last_id != charset->id)
4799         {
4800           if (last_id != charset_ascii)
4801             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4802           last_id = charset->id;
4803           last_offset = char_offset;
4804         }
4805       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4806       *charbuf++ = c;
4807       char_offset++;
4808       continue;
4809
4810     invalid_code:
           /* Rewind to the start of the sequence, read just one byte again
              and store it by itself.  NOTE(review): a negative C is stored
              negated; presumably ONE_MORE_BYTE returns a multibyte
              character that way -- confirm against the macro.  */
4811       src = src_base;
4812       consumed_chars = consumed_chars_base;
4813       ONE_MORE_BYTE (c);
4814       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4815       char_offset++;
4816       coding->errors++;
4817     }
4818
       /* Reached by falling out of the loop; the label itself is
          presumably the target of ONE_MORE_BYTE when the source runs out
          (no explicit goto is visible here).  The *_BASE values are
          stored, so a partly read sequence is not reported as consumed.  */
4819  no_more_source:
4820   if (last_id != charset_ascii)
4821     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4822   coding->consumed_char += consumed_chars_base;
4823   coding->consumed = src_base - coding->source;
4824   coding->charbuf_used = charbuf - coding->charbuf;
4825 }
4826
4827 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4828    This function can encode charsets `ascii', `katakana-jisx0201',
4829    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4830    are sure that all these charsets are registered as official charset
4831    (i.e. do not have extended leading-codes).  Characters of other
4832    charsets are produced without any encoding.

        NOTE(review): the charset names above are not referenced by the
        code below, which takes the kana, kanji and optional second kanji
        charsets from the second, third and fourth elements of the coding
        system's charset list -- the paragraph above may be out of date.

        Encode the characters in CODING->charbuf and write the bytes at
        CODING->destination + CODING->produced.  The only explicit return
        value is 0.  */
4833
4834 static bool
4835 encode_coding_sjis (struct coding_system *coding)
4836 {
4837   bool multibytep = coding->dst_multibyte;
4838   int *charbuf = coding->charbuf;
4839   int *charbuf_end = charbuf + coding->charbuf_used;
4840   unsigned char *dst = coding->destination + coding->produced;
4841   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4842   int safe_room = 4;
4843   ptrdiff_t produced_chars = 0;
4844   Lisp_Object attrs, charset_list, val;
4845   bool ascii_compatible;
4846   struct charset *charset_kanji, *charset_kana;
4847   struct charset *charset_kanji2;
4848   int c;
4849
4850   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list; the following ones are kana,
          kanji and, optionally, the second kanji charset.  */
4851   val = XCDR (charset_list);
4852   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4853   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4854   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4855
4856   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4857
4858   while (charbuf < charbuf_end)
4859     {
           /* NOTE(review): ASSURE_DESTINATION is defined elsewhere and may
              change control flow (e.g. leave this loop) when the
              destination is too small -- confirm against the macro.  */
4860       ASSURE_DESTINATION (safe_room);
4861       c = *charbuf++;
4862       /* Now encode the character C.  */
4863       if (ASCII_CHAR_P (c) && ascii_compatible)
4864         EMIT_ONE_ASCII_BYTE (c);
4865       else if (CHAR_BYTE8_P (c))
4866         {
               /* A raw-byte character is written back as its byte.  */
4867           c = CHAR_TO_BYTE8 (c);
4868           EMIT_ONE_BYTE (c);
4869         }
4870       else
4871         {
4872           unsigned code;
4873           struct charset *charset;
4874           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4875                                &code, charset);
4876
               /* No charset was found for C: in safe-encoding mode use the
                  CODING_INHIBIT_CHARACTER_SUBSTITUTION code with the ASCII
                  charset, otherwise retry with the coding system's default
                  character.  */
4877           if (!charset)
4878             {
4879               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4880                 {
4881                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4882                   charset = CHARSET_FROM_ID (charset_ascii);
4883                 }
4884               else
4885                 {
4886                   c = coding->default_char;
4887                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4888                                        charset_list, &code, charset);
4889                 }
4890             }
4891           if (code == CHARSET_INVALID_CODE (charset))
4892             emacs_abort ();
4893           if (charset == charset_kanji)
4894             {
4895               int c1, c2;
4896               JIS_TO_SJIS (code);
4897               c1 = code >> 8, c2 = code & 0xFF;
4898               EMIT_TWO_BYTES (c1, c2);
4899             }
4900           else if (charset == charset_kana)
4901             EMIT_ONE_BYTE (code | 0x80);
4902           else if (charset_kanji2 && charset == charset_kanji2)
4903             {
4904               int c1, c2;
4905
                   /* Only codes whose high byte is one of the values
                      tested below are converted with JIS_TO_SJIS2; any
                      other code is emitted as the single byte
                      (code & 0x7F).  */
4906               c1 = code >> 8;
4907               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4908                   || c1 == 0x28
4909                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4910                 {
4911                   JIS_TO_SJIS2 (code);
4912                   c1 = code >> 8, c2 = code & 0xFF;
4913                   EMIT_TWO_BYTES (c1, c2);
4914                 }
4915               else
4916                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4917             }
4918           else
                 /* Any other charset: emit the low seven bits of CODE.  */
4919             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4920         }
4921     }
4922   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4923   coding->produced_char += produced_chars;
4924   coding->produced = dst - coding->destination;
4925   return 0;
4926 }
4927
     /* Encode the characters in CODING->charbuf as BIG5 and write the
        bytes at CODING->destination + CODING->produced.  The Big5 charset
        is the second element of the coding system's charset list.  The
        only explicit return value is 0.  */
4928 static bool
4929 encode_coding_big5 (struct coding_system *coding)
4930 {
4931   bool multibytep = coding->dst_multibyte;
4932   int *charbuf = coding->charbuf;
4933   int *charbuf_end = charbuf + coding->charbuf_used;
4934   unsigned char *dst = coding->destination + coding->produced;
4935   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4936   int safe_room = 4;
4937   ptrdiff_t produced_chars = 0;
4938   Lisp_Object attrs, charset_list, val;
4939   bool ascii_compatible;
4940   struct charset *charset_big5;
4941   int c;
4942
4943   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list; the next one is Big5.  */
4944   val = XCDR (charset_list);
4945   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4946   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4947
4948   while (charbuf < charbuf_end)
4949     {
           /* NOTE(review): ASSURE_DESTINATION is defined elsewhere and may
              change control flow (e.g. leave this loop) when the
              destination is too small -- confirm against the macro.  */
4950       ASSURE_DESTINATION (safe_room);
4951       c = *charbuf++;
4952       /* Now encode the character C.  */
4953       if (ASCII_CHAR_P (c) && ascii_compatible)
4954         EMIT_ONE_ASCII_BYTE (c);
4955       else if (CHAR_BYTE8_P (c))
4956         {
               /* A raw-byte character is written back as its byte.  */
4957           c = CHAR_TO_BYTE8 (c);
4958           EMIT_ONE_BYTE (c);
4959         }
4960       else
4961         {
4962           unsigned code;
4963           struct charset *charset;
4964           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4965                                &code, charset);
4966
               /* No charset was found for C: in safe-encoding mode use the
                  CODING_INHIBIT_CHARACTER_SUBSTITUTION code with the ASCII
                  charset, otherwise retry with the coding system's default
                  character.  */
4967           if (! charset)
4968             {
4969               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4970                 {
4971                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4972                   charset = CHARSET_FROM_ID (charset_ascii);
4973                 }
4974               else
4975                 {
4976                   c = coding->default_char;
4977                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4978                                        charset_list, &code, charset);
4979                 }
4980             }
4981           if (code == CHARSET_INVALID_CODE (charset))
4982             emacs_abort ();
4983           if (charset == charset_big5)
4984             {
4985               int c1, c2;
4986
                   /* Split the 16-bit code into its two bytes.  */
4987               c1 = code >> 8, c2 = code & 0xFF;
4988               EMIT_TWO_BYTES (c1, c2);
4989             }
4990           else
                 /* Any other charset: emit the low seven bits of CODE.  */
4991             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4992         }
4993     }
4994   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4995   coding->produced_char += produced_chars;
4996   coding->produced = dst - coding->destination;
4997   return 0;
4998 }
4999
5000 \f
5001 /*** 9. CCL handlers ***/
5002
5003 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5004    Return true if a text is encoded in a coding system of which
5005    encoder/decoder are written in CCL program.

        CATEGORY_MASK_CCL is always added to DETECT_INFO->checked.  Each
        source byte is looked up in the CCL category's table of valid
        bytes: an invalid byte adds the category to DETECT_INFO->rejected
        and returns 0; reaching the end of the source returns 1, adding the
        category to DETECT_INFO->found if some byte had a table value
        greater than 1.  */
5006
5007 static bool
5008 detect_coding_ccl (struct coding_system *coding,
5009                    struct coding_detection_info *detect_info)
5010 {
5011   const unsigned char *src = coding->source, *src_base;
5012   const unsigned char *src_end = coding->source + coding->src_bytes;
5013   bool multibytep = coding->src_multibyte;
5014   ptrdiff_t consumed_chars = 0;
5015   int found = 0;
5016   unsigned char *valids;
5017   ptrdiff_t head_ascii = coding->head_ascii;
5018   Lisp_Object attrs;
5019
5020   detect_info->checked |= CATEGORY_MASK_CCL;
5021
       /* From here on CODING refers to the coding system of the CCL
          category, not to the argument; the source pointers and
          HEAD_ASCII were taken from the argument above.  */
5022   coding = &coding_categories[coding_category_ccl];
5023   valids = CODING_CCL_VALIDS (coding);
5024   attrs = CODING_ID_ATTRS (coding->id);
       /* Skip the leading ASCII bytes only if the CCL coding system is
          ASCII compatible.  */
5025   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5026     src += head_ascii;
5027
5028   while (1)
5029     {
5030       int c;
5031
5032       src_base = src;
5033       ONE_MORE_BYTE (c);
           /* A negative value or a byte whose table entry is zero is
              invalid; an entry greater than 1 counts as evidence that the
              text is in this coding system.  */
5034       if (c < 0 || ! valids[c])
5035         break;
5036       if ((valids[c] > 1))
5037         found = CATEGORY_MASK_CCL;
5038     }
5039   detect_info->rejected |= CATEGORY_MASK_CCL;
5040   return 0;
5041
       /* NOTE(review): no `goto no_more_source' is visible in this
          function; the label is presumably the target of ONE_MORE_BYTE
          when the source is exhausted -- confirm against the macro.  */
5042  no_more_source:
5043   detect_info->found |= found;
5044   return 1;
5045 }
5046
     /* Decode the source of CODING by running its CCL program.  The source
        is converted into chunks of at most 1024 integers, each chunk is
        passed to ccl_driver, and the output goes directly into
        CODING->charbuf.  The final CCL status is translated into a
        conversion result, and CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  */
5047 static void
5048 decode_coding_ccl (struct coding_system *coding)
5049 {
5050   const unsigned char *src = coding->source + coding->consumed;
5051   const unsigned char *src_end = coding->source + coding->src_bytes;
5052   int *charbuf = coding->charbuf + coding->charbuf_used;
5053   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5054   ptrdiff_t consumed_chars = 0;
5055   bool multibytep = coding->src_multibyte;
5056   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* SOURCE_CHARBUF holds one chunk of input for ccl_driver.  For a
          multibyte source, SOURCE_BYTEIDX[I] is the byte offset from SRC
          of the I-th element of the chunk; it has one extra slot for the
          offset just past the last element.  */
5057   int source_charbuf[1024];
5058   int source_byteidx[1025];
5059   Lisp_Object attrs, charset_list;
5060
5061   CODING_GET_INFO (coding, attrs, charset_list);
5062
5063   while (1)
5064     {
5065       const unsigned char *p = src;
5066       ptrdiff_t offset;
5067       int i = 0;
5068
           /* Fill the chunk: characters for a multibyte source, plain
              bytes otherwise.  */
5069       if (multibytep)
5070         {
5071           while (i < 1024 && p < src_end)
5072             {
5073               source_byteidx[i] = p - src;
5074               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5075             }
5076           source_byteidx[i] = p - src;
5077         }
5078       else
5079         while (i < 1024 && p < src_end)
5080           source_charbuf[i++] = *p++;
5081
           /* Tell the CCL program when this chunk reaches the end of the
              last block.  */
5082       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5083         ccl->last_block = 1;
5084       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5085       charset_map_loaded = 0;
5086       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5087                   charset_list);
           /* If a charset map was loaded and the source moved, shift all
              source pointers by the reported offset.  */
5088       if (charset_map_loaded
5089           && (offset = coding_change_source (coding)))
5090         {
5091           p += offset;
5092           src += offset;
5093           src_end += offset;
5094         }
5095       charbuf += ccl->produced;
           /* Advance SRC by what the program consumed; for a multibyte
              source the consumed count is translated into bytes.  */
5096       if (multibytep)
5097         src += source_byteidx[ccl->consumed];
5098       else
5099         src += ccl->consumed;
5100       consumed_chars += ccl->consumed;
           /* Go round again only if the chunk did not reach the end of
              the source and the program stopped for lack of source.  */
5101       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5102         break;
5103     }
5104
       /* Translate the final CCL status into a conversion result.  */
5105   switch (ccl->status)
5106     {
5107     case CCL_STAT_SUSPEND_BY_SRC:
5108       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5109       break;
5110     case CCL_STAT_SUSPEND_BY_DST:
5111       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5112       break;
5113     case CCL_STAT_QUIT:
5114     case CCL_STAT_INVALID_CMD:
5115       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5116       break;
5117     default:
5118       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5119       break;
5120     }
5121   coding->consumed_char += consumed_chars;
5122   coding->consumed = src - coding->source;
5123   coding->charbuf_used = charbuf - coding->charbuf;
5124 }
5125
     /* Encode the characters in CODING->charbuf by running the coding
        system's CCL program.  ccl_driver writes into a local buffer of
        1024 integers; the low eight bits of each element are then written
        to the destination.  The final CCL status is translated into a
        conversion result.  The only explicit return value is 0.  */
5126 static bool
5127 encode_coding_ccl (struct coding_system *coding)
5128 {
5129   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5130   bool multibytep = coding->dst_multibyte;
5131   int *charbuf = coding->charbuf;
5132   int *charbuf_end = charbuf + coding->charbuf_used;
5133   unsigned char *dst = coding->destination + coding->produced;
5134   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5135   int destination_charbuf[1024];
5136   ptrdiff_t produced_chars = 0;
5137   int i;
5138   Lisp_Object attrs, charset_list;
5139
5140   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters have been
          consumed and this is the last block.  */
5141   if (coding->consumed_char == coding->src_chars
5142       && coding->mode & CODING_MODE_LAST_BLOCK)
5143     ccl->last_block = 1;
5144
5145   do
5146     {
5147       ptrdiff_t offset;
5148
5149       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5150       charset_map_loaded = 0;
5151       ccl_driver (ccl, charbuf, destination_charbuf,
5152                   charbuf_end - charbuf, 1024, charset_list);
           /* If a charset map was loaded and the destination moved, shift
              DST by the reported offset.  */
5153       if (charset_map_loaded
5154           && (offset = coding_change_destination (coding)))
5155         dst += offset;
5156       if (multibytep)
5157         {
               /* NOTE(review): twice the produced count is requested,
                  presumably because EMIT_ONE_BYTE can write two bytes per
                  element into a multibyte destination, and presumably it
                  also maintains PRODUCED_CHARS -- confirm against the
                  macro.  */
5158           ASSURE_DESTINATION (ccl->produced * 2);
5159           for (i = 0; i < ccl->produced; i++)
5160             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5161         }
5162       else
5163         {
5164           ASSURE_DESTINATION (ccl->produced);
5165           for (i = 0; i < ccl->produced; i++)
5166             *dst++ = destination_charbuf[i] & 0xFF;
5167           produced_chars += ccl->produced;
5168         }
5169       charbuf += ccl->consumed;
           /* Stop at once if the program quit or hit an invalid
              command.  */
5170       if (ccl->status == CCL_STAT_QUIT
5171           || ccl->status == CCL_STAT_INVALID_CMD)
5172         break;
5173     }
5174   while (charbuf < charbuf_end);
5175
       /* Translate the final CCL status into a conversion result.  */
5176   switch (ccl->status)
5177     {
5178     case CCL_STAT_SUSPEND_BY_SRC:
5179       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5180       break;
5181     case CCL_STAT_SUSPEND_BY_DST:
5182       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5183       break;
5184     case CCL_STAT_QUIT:
5185     case CCL_STAT_INVALID_CMD:
5186       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5187       break;
5188     default:
5189       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5190       break;
5191     }
5192
5193   coding->produced_char += produced_chars;
5194   coding->produced = dst - coding->destination;
5195   return 0;
5196 }
5197
5198 \f
5199 /*** 10, 11. no-conversion handlers ***/
5200
5201 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Raw text needs no conversion here: CODING->chars_at_source is set
        and the whole source is reported as consumed.  With DOS end-of-line
        conversion, a CR at the very end of a non-empty source is left
        unconsumed and CODING_RESULT_INSUFFICIENT_SRC is recorded
        (presumably so that a CR LF pair split across two blocks can be
        handled together); otherwise CODING_RESULT_SUCCESS is recorded.  */
5202
5203 static void
5204 decode_coding_raw_text (struct coding_system *coding)
5205 {
5206   bool eol_dos
5207     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5208
5209   coding->chars_at_source = 1;
5210   coding->consumed_char = coding->src_chars;
5211   coding->consumed = coding->src_bytes;
       /* Check for an empty source first: indexing with src_bytes - 1
          would otherwise read before the start of the source.  */
5212   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5213     {
5214       coding->consumed_char--;
5215       coding->consumed--;
5216       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5217     }
5218   else
5219     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5220 }
5221
     /* Copy the contents of CODING->charbuf to the destination without
        charset conversion.  There are four cases, depending on whether the
        destination and the source are multibyte.  CODING_RESULT_SUCCESS is
        recorded at the end, and the only explicit return value is 0.  */
5222 static bool
5223 encode_coding_raw_text (struct coding_system *coding)
5224 {
5225   bool multibytep = coding->dst_multibyte;
5226   int *charbuf = coding->charbuf;
5227   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5228   unsigned char *dst = coding->destination + coding->produced;
5229   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5230   ptrdiff_t produced_chars = 0;
5231   int c;
5232
5233   if (multibytep)
5234     {
5235       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5236
           /* Multibyte destination, multibyte source: ASCII and raw-byte
              characters are emitted as single bytes; any other character
              is converted to its multibyte string and each byte of that
              string is emitted separately.  */
5237       if (coding->src_multibyte)
5238         while (charbuf < charbuf_end)
5239           {
5240             ASSURE_DESTINATION (safe_room);
5241             c = *charbuf++;
5242             if (ASCII_CHAR_P (c))
5243               EMIT_ONE_ASCII_BYTE (c);
5244             else if (CHAR_BYTE8_P (c))
5245               {
5246                 c = CHAR_TO_BYTE8 (c);
5247                 EMIT_ONE_BYTE (c);
5248               }
5249             else
5250               {
5251                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5252
5253                 CHAR_STRING_ADVANCE (c, p1);
5254                 do
5255                   {
5256                     EMIT_ONE_BYTE (*p0);
5257                     p0++;
5258                   }
5259                 while (p0 < p1);
5260               }
5261           }
5262       else
             /* Multibyte destination, unibyte source: every element is
                emitted as one byte.  */
5263         while (charbuf < charbuf_end)
5264           {
5265             ASSURE_DESTINATION (safe_room);
5266             c = *charbuf++;
5267             EMIT_ONE_BYTE (c);
5268           }
5269     }
5270   else
5271     {
5272       if (coding->src_multibyte)
5273         {
5274           int safe_room = MAX_MULTIBYTE_LENGTH;
5275
               /* Unibyte destination, multibyte source: ASCII and raw-byte
                  characters are stored as single bytes; any other
                  character is stored as its multibyte string.  */
5276           while (charbuf < charbuf_end)
5277             {
5278               ASSURE_DESTINATION (safe_room);
5279               c = *charbuf++;
5280               if (ASCII_CHAR_P (c))
5281                 *dst++ = c;
5282               else if (CHAR_BYTE8_P (c))
5283                 *dst++ = CHAR_TO_BYTE8 (c);
5284               else
5285                 CHAR_STRING_ADVANCE (c, dst);
5286             }
5287         }
5288       else
5289         {
               /* Unibyte destination, unibyte source: request room for
                  everything at once, then copy element by element.  */
5290           ASSURE_DESTINATION (charbuf_end - charbuf);
5291           while (charbuf < charbuf_end && dst < dst_end)
5292             *dst++ = *charbuf++;
5293         }
           /* For a unibyte destination the number of characters produced
              is the number of bytes written by this call.  */
5294       produced_chars = dst - (coding->destination + coding->produced);
5295     }
5296   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5297   coding->produced_char += produced_chars;
5298   coding->produced = dst - coding->destination;
5299   return 0;
5300 }
5301
5302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5303    Return true if a text is encoded in a charset-based coding system.

        CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.  The
        source bytes are checked against the `charset valids' table of the
        coding system of the charset category.  An invalid byte, or a
        source that ends where further bytes of a multi-dimensional code
        are expected, adds the category to DETECT_INFO->rejected and
        returns 0.  Otherwise 1 is returned, and the category is added to
        DETECT_INFO->found if a byte of 0x80 or above was accepted.  */
5304
5305 static bool
5306 detect_coding_charset (struct coding_system *coding,
5307                        struct coding_detection_info *detect_info)
5308 {
5309   const unsigned char *src = coding->source, *src_base;
5310   const unsigned char *src_end = coding->source + coding->src_bytes;
5311   bool multibytep = coding->src_multibyte;
5312   ptrdiff_t consumed_chars = 0;
5313   Lisp_Object attrs, valids, name;
5314   int found = 0;
5315   ptrdiff_t head_ascii = coding->head_ascii;
5316   bool check_latin_extra = 0;
5317
5318   detect_info->checked |= CATEGORY_MASK_CHARSET;
5319
       /* From here on CODING refers to the coding system of the charset
          category, not to the argument; the source pointers and
          HEAD_ASCII were taken from the argument above.  */
5320   coding = &coding_categories[coding_category_charset];
5321   attrs = CODING_ID_ATTRS (coding->id);
5322   valids = AREF (attrs, coding_attr_charset_valids);
5323   name = CODING_ID_NAME (coding->id);
       /* For coding systems whose name starts with "iso-8859-" or
          "iso-latin-", bytes in 0x80..0x9F get the extra check against
          Vlatin_extra_code_table below.  */
5324   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5325                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5326       || strncmp (SSDATA (SYMBOL_NAME (name)),
5327                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5328     check_latin_extra = 1;
5329
5330   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5331     src += head_ascii;
5332
5333   while (1)
5334     {
5335       int c;
5336       Lisp_Object val;
5337       struct charset *charset;
5338       int dim, idx;
5339
5340       src_base = src;
5341       ONE_MORE_BYTE (c);
           /* A negative value is skipped without any check.  */
5342       if (c < 0)
5343         continue;
           /* VAL is nil for a byte that cannot start a code; otherwise it
              is a charset ID or a list of charset IDs.  */
5344       val = AREF (valids, c);
5345       if (NILP (val))
5346         break;
5347       if (c >= 0x80)
5348         {
               /* With CHECK_LATIN_EXTRA, a byte in 0x80..0x9F is accepted
                  only if Vlatin_extra_code_table is a vector with a
                  non-nil entry for it.  */
5349           if (c < 0xA0
5350               && check_latin_extra
5351               && (!VECTORP (Vlatin_extra_code_table)
5352                   || NILP (AREF (Vlatin_extra_code_table, c))))
5353             break;
5354           found = CATEGORY_MASK_CHARSET;
5355         }
5356       if (INTEGERP (val))
5357         {
               /* A single charset: each remaining byte of the code must
                  lie within the range that the charset's code_space gives
                  for its position.  */
5358           charset = CHARSET_FROM_ID (XFASTINT (val));
5359           dim = CHARSET_DIMENSION (charset);
5360           for (idx = 1; idx < dim; idx++)
5361             {
5362               if (src == src_end)
5363                 goto too_short;
5364               ONE_MORE_BYTE (c);
5365               if (c < charset->code_space[(dim - 1 - idx) * 4]
5366                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5367                 break;
5368             }
               /* The inner loop stopped early: a byte was out of range.  */
5369           if (idx < dim)
5370             break;
5371         }
5372       else
5373         {
               /* A list of charsets: try them in order.  IDX is kept
                  across candidates, so bytes already checked are not read
                  again.  VAL is set to nil once a candidate matches, so a
                  cons left in VAL afterwards means a byte failed the range
                  check.  */
5374           idx = 1;
5375           for (; CONSP (val); val = XCDR (val))
5376             {
5377               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5378               dim = CHARSET_DIMENSION (charset);
5379               while (idx < dim)
5380                 {
5381                   if (src == src_end)
5382                     goto too_short;
5383                   ONE_MORE_BYTE (c);
5384                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5385                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5386                     break;
5387                   idx++;
5388                 }
5389               if (idx == dim)
5390                 {
5391                   val = Qnil;
5392                   break;
5393                 }
5394             }
5395           if (CONSP (val))
5396             break;
5397         }
5398     }
       /* Reached by `goto too_short' and by every `break' out of the main
          loop.  */
5399  too_short:
5400   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5401   return 0;
5402
       /* NOTE(review): no `goto no_more_source' is visible in this
          function; the label is presumably the target of ONE_MORE_BYTE
          when the source is exhausted -- confirm against the macro.  */
5403  no_more_source:
5404   detect_info->found |= found;
5405   return 1;
5406 }
5407
5408 static void
5409 decode_coding_charset (struct coding_system *coding)
5410 {
5411   const unsigned char *src = coding->source + coding->consumed;
5412   const unsigned char *src_end = coding->source + coding->src_bytes;
5413   const unsigned char *src_base;
5414   int *charbuf = coding->charbuf + coding->charbuf_used;
5415   /* We may produce one charset annotation in one loop and one more at
5416      the end.  */
5417   int *charbuf_end
5418     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5419   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5420   bool multibytep = coding->src_multibyte;
5421   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5422   Lisp_Object valids;
5423   ptrdiff_t char_offset = coding->produced_char;
5424   ptrdiff_t last_offset = char_offset;
5425   int last_id = charset_ascii;
5426   bool eol_dos
5427     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5428   int byte_after_cr = -1;
5429
5430   valids = AREF (attrs, coding_attr_charset_valids);
5431
5432   while (1)
5433     {
5434       int c;
5435       Lisp_Object val;
5436       struct charset *charset;
5437       int dim;
5438       int len = 1;
5439       unsigned code;
5440
5441       src_base = src;
5442       consumed_chars_base = consumed_chars;
5443
5444       if (charbuf >= charbuf_end)
5445         {
5446           if (byte_after_cr >= 0)
5447             src_base--;
5448           break;
5449         }
5450
5451       if (byte_after_cr >= 0)
5452         {
5453           c = byte_after_cr;
5454           byte_after_cr = -1;
5455         }
5456       else
5457         {
5458           ONE_MORE_BYTE (c);
5459           if (eol_dos && c == '\r')
5460             ONE_MORE_BYTE (byte_after_cr);
5461         }
5462       if (c < 0)
5463         goto invalid_code;
5464       code = c;
5465
5466       val = AREF (valids, c);
5467       if (! INTEGERP (val) && ! CONSP (val))
5468         goto invalid_code;
5469       if (INTEGERP (val))
5470         {
5471           charset = CHARSET_FROM_ID (XFASTINT (val));
5472           dim = CHARSET_DIMENSION (charset);
5473           while (len < dim)
5474             {
5475               ONE_MORE_BYTE (c);
5476               code = (code << 8) | c;
5477               len++;
5478             }
5479           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5480                               charset, code, c);
5481         }
5482       else
5483         {
5484           /* VAL is a list of charset IDs.  It is assured that the
5485              list is sorted by charset dimensions (smaller one
5486              comes first).  */
5487           while (CONSP (val))
5488             {
5489               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5490               dim = CHARSET_DIMENSION (charset);
5491               while (len < dim)
5492                 {
5493                   ONE_MORE_BYTE (c);
5494                   code = (code << 8) | c;
5495                   len++;
5496                 }
5497               CODING_DECODE_CHAR (coding, src, src_base,
5498                                   src_end, charset, code, c);
5499               if (c >= 0)
5500                 break;
5501               val = XCDR (val);
5502             }
5503         }
5504       if (c < 0)
5505         goto invalid_code;
5506       if (charset->id != charset_ascii
5507           && last_id != charset->id)
5508         {
5509           if (last_id != charset_ascii)
5510             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5511           last_id = charset->id;
5512           last_offset = char_offset;
5513         }
5514
5515       *charbuf++ = c;
5516       char_offset++;
5517       continue;
5518
5519     invalid_code:
5520       src = src_base;
5521       consumed_chars = consumed_chars_base;
5522       ONE_MORE_BYTE (c);
5523       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5524       char_offset++;
5525       coding->errors++;
5526     }
5527
5528  no_more_source:
5529   if (last_id != charset_ascii)
5530     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5531   coding->consumed_char += consumed_chars_base;
5532   coding->consumed = src_base - coding->source;
5533   coding->charbuf_used = charbuf - coding->charbuf;
5534 }
5535
5536 static bool
5537 encode_coding_charset (struct coding_system *coding)
5538 {
5539   bool multibytep = coding->dst_multibyte;
5540   int *charbuf = coding->charbuf;
5541   int *charbuf_end = charbuf + coding->charbuf_used;
5542   unsigned char *dst = coding->destination + coding->produced;
5543   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5544   int safe_room = MAX_MULTIBYTE_LENGTH;
5545   ptrdiff_t produced_chars = 0;
5546   Lisp_Object attrs, charset_list;
5547   bool ascii_compatible;
5548   int c;
5549
5550   CODING_GET_INFO (coding, attrs, charset_list);
5551   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5552
5553   while (charbuf < charbuf_end)
5554     {
5555       struct charset *charset;
5556       unsigned code;
5557
5558       ASSURE_DESTINATION (safe_room);
5559       c = *charbuf++;
5560       if (ascii_compatible && ASCII_CHAR_P (c))
5561         EMIT_ONE_ASCII_BYTE (c);
5562       else if (CHAR_BYTE8_P (c))
5563         {
5564           c = CHAR_TO_BYTE8 (c);
5565           EMIT_ONE_BYTE (c);
5566         }
5567       else
5568         {
5569           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5570                                &code, charset);
5571
5572           if (charset)
5573             {
5574               if (CHARSET_DIMENSION (charset) == 1)
5575                 EMIT_ONE_BYTE (code);
5576               else if (CHARSET_DIMENSION (charset) == 2)
5577                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5578               else if (CHARSET_DIMENSION (charset) == 3)
5579                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5580               else
5581                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5582                                  (code >> 8) & 0xFF, code & 0xFF);
5583             }
5584           else
5585             {
5586               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5587                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5588               else
5589                 c = coding->default_char;
5590               EMIT_ONE_BYTE (c);
5591             }
5592         }
5593     }
5594
5595   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5596   coding->produced_char += produced_chars;
5597   coding->produced = dst - coding->destination;
5598   return 0;
5599 }
5600
5601 \f
5602 /*** 10. C library functions ***/
5603
5604 /* Setup coding context CODING from information about CODING_SYSTEM.
5605    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5606    CODING_SYSTEM is invalid, signal an error.  */
5607
5608 void
5609 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5610 {
5611   Lisp_Object attrs;
5612   Lisp_Object eol_type;
5613   Lisp_Object coding_type;
5614   Lisp_Object val;
5615
5616   if (NILP (coding_system))
5617     coding_system = Qundecided;
5618
5619   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5620
5621   attrs = CODING_ID_ATTRS (coding->id);
5622   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5623
5624   coding->mode = 0;
5625   coding->head_ascii = -1;
5626   if (VECTORP (eol_type))
5627     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5628                             | CODING_REQUIRE_DETECTION_MASK);
5629   else if (! EQ (eol_type, Qunix))
5630     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5631                             | CODING_REQUIRE_ENCODING_MASK);
5632   else
5633     coding->common_flags = 0;
5634   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5635     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5636   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5637     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5638   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5639     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5640
5641   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5642   coding->max_charset_id = SCHARS (val) - 1;
5643   coding->safe_charsets = SDATA (val);
5644   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5645   coding->carryover_bytes = 0;
5646
5647   coding_type = CODING_ATTR_TYPE (attrs);
5648   if (EQ (coding_type, Qundecided))
5649     {
5650       coding->detector = NULL;
5651       coding->decoder = decode_coding_raw_text;
5652       coding->encoder = encode_coding_raw_text;
5653       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5654     }
5655   else if (EQ (coding_type, Qiso_2022))
5656     {
5657       int i;
5658       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5659
5660       /* Invoke graphic register 0 to plane 0.  */
5661       CODING_ISO_INVOCATION (coding, 0) = 0;
5662       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5663       CODING_ISO_INVOCATION (coding, 1)
5664         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5665       /* Setup the initial status of designation.  */
5666       for (i = 0; i < 4; i++)
5667         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5668       /* Not single shifting initially.  */
5669       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5670       /* Beginning of buffer should also be regarded as bol. */
5671       CODING_ISO_BOL (coding) = 1;
5672       coding->detector = detect_coding_iso_2022;
5673       coding->decoder = decode_coding_iso_2022;
5674       coding->encoder = encode_coding_iso_2022;
5675       if (flags & CODING_ISO_FLAG_SAFE)
5676         coding->mode |= CODING_MODE_SAFE_ENCODING;
5677       coding->common_flags
5678         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5679             | CODING_REQUIRE_FLUSHING_MASK);
5680       if (flags & CODING_ISO_FLAG_COMPOSITION)
5681         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5682       if (flags & CODING_ISO_FLAG_DESIGNATION)
5683         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5684       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5685         {
5686           setup_iso_safe_charsets (attrs);
5687           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5688           coding->max_charset_id = SCHARS (val) - 1;
5689           coding->safe_charsets = SDATA (val);
5690         }
5691       CODING_ISO_FLAGS (coding) = flags;
5692       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5693       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5694       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5695       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5696     }
5697   else if (EQ (coding_type, Qcharset))
5698     {
5699       coding->detector = detect_coding_charset;
5700       coding->decoder = decode_coding_charset;
5701       coding->encoder = encode_coding_charset;
5702       coding->common_flags
5703         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5704     }
5705   else if (EQ (coding_type, Qutf_8))
5706     {
5707       val = AREF (attrs, coding_attr_utf_bom);
5708       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5709                                    : EQ (val, Qt) ? utf_with_bom
5710                                    : utf_without_bom);
5711       coding->detector = detect_coding_utf_8;
5712       coding->decoder = decode_coding_utf_8;
5713       coding->encoder = encode_coding_utf_8;
5714       coding->common_flags
5715         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5716       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5717         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5718     }
5719   else if (EQ (coding_type, Qutf_16))
5720     {
5721       val = AREF (attrs, coding_attr_utf_bom);
5722       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5723                                     : EQ (val, Qt) ? utf_with_bom
5724                                     : utf_without_bom);
5725       val = AREF (attrs, coding_attr_utf_16_endian);
5726       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5727                                        : utf_16_little_endian);
5728       CODING_UTF_16_SURROGATE (coding) = 0;
5729       coding->detector = detect_coding_utf_16;
5730       coding->decoder = decode_coding_utf_16;
5731       coding->encoder = encode_coding_utf_16;
5732       coding->common_flags
5733         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5734       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5735         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5736     }
5737   else if (EQ (coding_type, Qccl))
5738     {
5739       coding->detector = detect_coding_ccl;
5740       coding->decoder = decode_coding_ccl;
5741       coding->encoder = encode_coding_ccl;
5742       coding->common_flags
5743         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5744             | CODING_REQUIRE_FLUSHING_MASK);
5745     }
5746   else if (EQ (coding_type, Qemacs_mule))
5747     {
5748       coding->detector = detect_coding_emacs_mule;
5749       coding->decoder = decode_coding_emacs_mule;
5750       coding->encoder = encode_coding_emacs_mule;
5751       coding->common_flags
5752         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5753       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5754           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5755         {
5756           Lisp_Object tail, safe_charsets;
5757           int max_charset_id = 0;
5758
5759           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5760                tail = XCDR (tail))
5761             if (max_charset_id < XFASTINT (XCAR (tail)))
5762               max_charset_id = XFASTINT (XCAR (tail));
5763           safe_charsets = make_uninit_string (max_charset_id + 1);
5764           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5765           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5766                tail = XCDR (tail))
5767             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5768           coding->max_charset_id = max_charset_id;
5769           coding->safe_charsets = SDATA (safe_charsets);
5770         }
5771       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5772       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5773     }
5774   else if (EQ (coding_type, Qshift_jis))
5775     {
5776       coding->detector = detect_coding_sjis;
5777       coding->decoder = decode_coding_sjis;
5778       coding->encoder = encode_coding_sjis;
5779       coding->common_flags
5780         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5781     }
5782   else if (EQ (coding_type, Qbig5))
5783     {
5784       coding->detector = detect_coding_big5;
5785       coding->decoder = decode_coding_big5;
5786       coding->encoder = encode_coding_big5;
5787       coding->common_flags
5788         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5789     }
5790   else                          /* EQ (coding_type, Qraw_text) */
5791     {
5792       coding->detector = NULL;
5793       coding->decoder = decode_coding_raw_text;
5794       coding->encoder = encode_coding_raw_text;
5795       if (! EQ (eol_type, Qunix))
5796         {
5797           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5798           if (! VECTORP (eol_type))
5799             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5800         }
5801
5802     }
5803
5804   return;
5805 }
5806
5807 /* Return a list of charsets supported by CODING.  */
5808
5809 Lisp_Object
5810 coding_charset_list (struct coding_system *coding)
5811 {
5812   Lisp_Object attrs, charset_list;
5813
5814   CODING_GET_INFO (coding, attrs, charset_list);
5815   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5816     {
5817       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5818
5819       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5820         charset_list = Viso_2022_charset_list;
5821     }
5822   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5823     {
5824       charset_list = Vemacs_mule_charset_list;
5825     }
5826   return charset_list;
5827 }
5828
5829
5830 /* Return a list of charsets supported by CODING-SYSTEM.  */
5831
5832 Lisp_Object
5833 coding_system_charset_list (Lisp_Object coding_system)
5834 {
5835   ptrdiff_t id;
5836   Lisp_Object attrs, charset_list;
5837
5838   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5839   attrs = CODING_ID_ATTRS (id);
5840
5841   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5842     {
5843       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5844
5845       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5846         charset_list = Viso_2022_charset_list;
5847       else
5848         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5849     }
5850   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5851     {
5852       charset_list = Vemacs_mule_charset_list;
5853     }
5854   else
5855     {
5856       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5857     }
5858   return charset_list;
5859 }
5860
5861
5862 /* Return raw-text or one of its subsidiaries that has the same
5863    eol_type as CODING-SYSTEM.  */
5864
5865 Lisp_Object
5866 raw_text_coding_system (Lisp_Object coding_system)
5867 {
5868   Lisp_Object spec, attrs;
5869   Lisp_Object eol_type, raw_text_eol_type;
5870
5871   if (NILP (coding_system))
5872     return Qraw_text;
5873   spec = CODING_SYSTEM_SPEC (coding_system);
5874   attrs = AREF (spec, 0);
5875
5876   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5877     return coding_system;
5878
5879   eol_type = AREF (spec, 2);
5880   if (VECTORP (eol_type))
5881     return Qraw_text;
5882   spec = CODING_SYSTEM_SPEC (Qraw_text);
5883   raw_text_eol_type = AREF (spec, 2);
5884   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5885           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5886           : AREF (raw_text_eol_type, 2));
5887 }
5888
5889
5890 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5891    the subsidiary that has the same eol-spec as PARENT (if it is not
5892    nil and specifies end-of-line format) or the system's setting
5893    (system_eol_type).  */
5894
5895 Lisp_Object
5896 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5897 {
5898   Lisp_Object spec, eol_type;
5899
5900   if (NILP (coding_system))
5901     coding_system = Qraw_text;
5902   spec = CODING_SYSTEM_SPEC (coding_system);
5903   eol_type = AREF (spec, 2);
5904   if (VECTORP (eol_type))
5905     {
5906       Lisp_Object parent_eol_type;
5907
5908       if (! NILP (parent))
5909         {
5910           Lisp_Object parent_spec;
5911
5912           parent_spec = CODING_SYSTEM_SPEC (parent);
5913           parent_eol_type = AREF (parent_spec, 2);
5914           if (VECTORP (parent_eol_type))
5915             parent_eol_type = system_eol_type;
5916         }
5917       else
5918         parent_eol_type = system_eol_type;
5919       if (EQ (parent_eol_type, Qunix))
5920         coding_system = AREF (eol_type, 0);
5921       else if (EQ (parent_eol_type, Qdos))
5922         coding_system = AREF (eol_type, 1);
5923       else if (EQ (parent_eol_type, Qmac))
5924         coding_system = AREF (eol_type, 2);
5925     }
5926   return coding_system;
5927 }
5928
5929
5930 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5931    decided for writing to a process.  If not, complement them, and
5932    return a new coding system.  */
5933
5934 Lisp_Object
5935 complement_process_encoding_system (Lisp_Object coding_system)
5936 {
5937   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5938   Lisp_Object spec, attrs;
5939   int i;
5940
5941   for (i = 0; i < 3; i++)
5942     {
5943       if (i == 1)
5944         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5945       else if (i == 2)
5946         coding_system = preferred_coding_system ();
5947       spec = CODING_SYSTEM_SPEC (coding_system);
5948       if (NILP (spec))
5949         continue;
5950       attrs = AREF (spec, 0);
5951       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5952         coding_base = CODING_ATTR_BASE_NAME (attrs);
5953       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5954         eol_base = coding_system;
5955       if (! NILP (coding_base) && ! NILP (eol_base))
5956         break;
5957     }
5958
5959   if (i > 0)
5960     /* The original CODING_SYSTEM didn't specify text-conversion or
5961        eol-conversion.  Be sure that we return a fully complemented
5962        coding system.  */
5963     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5964   return coding_system;
5965 }
5966
5967
5968 /* Emacs has a mechanism to automatically detect a coding system if it
5969    is one of Emacs' internal format, ISO2022, SJIS, BIG5, and so on.
5970    But it is impossible to distinguish some coding systems accurately
5971    because they use the same range of codes.  So, at first, coding
5972    systems are grouped into the categories listed below:
5973
5974    o coding-category-emacs-mule
5975
5976         The category for a coding system which has the same code range
5977         as Emacs' internal format.  Assigned the coding-system (Lisp
5978         symbol) `emacs-mule' by default.
5979
5980    o coding-category-sjis
5981
5982         The category for a coding system which has the same code range
5983         as SJIS.  Assigned the coding-system (Lisp
5984         symbol) `japanese-shift-jis' by default.
5985
5986    o coding-category-iso-7
5987
5988         The category for a coding system which has the same code range
5989         as ISO2022 of 7-bit environment.  This doesn't use any locking
5990         shift and single shift functions.  This can encode/decode all
5991         charsets.  Assigned the coding-system (Lisp symbol)
5992         `iso-2022-7bit' by default.
5993
5994    o coding-category-iso-7-tight
5995
5996         Same as coding-category-iso-7 except that this can
5997         encode/decode only the specified charsets.
5998
5999    o coding-category-iso-8-1
6000
6001         The category for a coding system which has the same code range
6002         as ISO2022 of 8-bit environment and graphic plane 1 used only
6003         for DIMENSION1 charset.  This doesn't use any locking shift
6004         and single shift functions.  Assigned the coding-system (Lisp
6005         symbol) `iso-latin-1' by default.
6006
6007    o coding-category-iso-8-2
6008
6009         The category for a coding system which has the same code range
6010         as ISO2022 of 8-bit environment and graphic plane 1 used only
6011         for DIMENSION2 charset.  This doesn't use any locking shift
6012         and single shift functions.  Assigned the coding-system (Lisp
6013         symbol) `japanese-iso-8bit' by default.
6014
6015    o coding-category-iso-7-else
6016
6017         The category for a coding system which has the same code range
6018         as ISO2022 of 7-bit environment but uses locking shift or
6019         single shift functions.  Assigned the coding-system (Lisp
6020         symbol) `iso-2022-7bit-lock' by default.
6021
6022    o coding-category-iso-8-else
6023
6024         The category for a coding system which has the same code range
6025         as ISO2022 of 8-bit environment but uses locking shift or
6026         single shift functions.  Assigned the coding-system (Lisp
6027         symbol) `iso-2022-8bit-ss2' by default.
6028
6029    o coding-category-big5
6030
6031         The category for a coding system which has the same code range
6032         as BIG5.  Assigned the coding-system (Lisp symbol)
6033         `cn-big5' by default.
6034
6035    o coding-category-utf-8
6036
6037         The category for a coding system which has the same code range
6038         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6039         symbol) `utf-8' by default.
6040
6041    o coding-category-utf-16-be
6042
6043         The category for a coding system in which a text has a
6044         Unicode signature (cf. Unicode Standard) in the order of BIG
6045         endian at the head.  Assigned the coding-system (Lisp symbol)
6046         `utf-16-be' by default.
6047
6048    o coding-category-utf-16-le
6049
6050         The category for a coding system in which a text has a
6051         Unicode signature (cf. Unicode Standard) in the order of
6052         LITTLE endian at the head.  Assigned the coding-system (Lisp
6053         symbol) `utf-16-le' by default.
6054
6055    o coding-category-ccl
6056
6057         The category for a coding system of which encoder/decoder is
6058         written in CCL programs.  The default value is nil, i.e., no
6059         coding system is assigned.
6060
6061    o coding-category-binary
6062
6063         The category for a coding system not categorized in any of the
6064         above.  Assigned the coding-system (Lisp symbol)
6065         `no-conversion' by default.
6066
6067    Each of them is a Lisp symbol whose value is an actual
6068    `coding-system' (also a Lisp symbol) assigned by a user.  What
6069    Emacs actually does is detect the category of a coding system and
6070    then use the `coding-system' assigned to that category.  If Emacs
6071    can't narrow the candidates down to a single category, it selects
6072    the category of the highest priority.  Priorities of categories are
6073    also specified by a user in a Lisp variable `coding-category-list'.
6074
6075 */
6076
/* Values describing which end-of-line formats have been seen in a
   text.  Apart from EOL_SEEN_NONE they are distinct bits, so that
   several of them can be combined with "logical or" (as check_ascii
   and check_utf_8 do for coding->eol_seen).  */
#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4


/* Declared here because check_ascii calls it before its
   definition.  */
static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
                                           int eol_seen);
6085
6086
6087 /* Return the number of ASCII characters at the head of the source.
6088    By side effects, set coding->head_ascii and coding->eol_seen.  The
6089    value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
6090    EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
6091    all the source bytes are ASCII.  */
6092
6093 static int
6094 check_ascii (struct coding_system *coding)
6095 {
6096   const unsigned char *src, *end;
6097   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6098   int eol_seen;
6099
6100   eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
6101               : EQ (eol_type, Qunix) ? EOL_SEEN_LF
6102               : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6103               : EOL_SEEN_CR);
6104   coding_set_source (coding);
6105   src = coding->source;
6106   end = src + coding->src_bytes;
6107
6108   if (inhibit_eol_conversion
6109       || eol_seen != EOL_SEEN_NONE)
6110     {
6111       /* We don't have to check EOL format.  */
6112       while (src < end && !( *src & 0x80)) src++;
6113       if (inhibit_eol_conversion)
6114         {
6115           eol_seen = EOL_SEEN_LF;
6116           adjust_coding_eol_type (coding, eol_seen);
6117         }
6118     }
6119   else
6120     {
6121       end--;                /* We look ahead one byte for "CR LF".  */
6122       while (src < end)
6123         {
6124           int c = *src;
6125
6126           if (c & 0x80)
6127             break;
6128           src++;
6129           if (c == '\r')
6130             {
6131               if (*src == '\n')
6132                 {
6133                   eol_seen |= EOL_SEEN_CRLF;
6134                   src++;
6135                 }
6136               else
6137                 eol_seen |= EOL_SEEN_CR;
6138             }
6139           else if (c == '\n')
6140             eol_seen |= EOL_SEEN_LF;
6141         }
6142       if (src == end)
6143         {
6144           int c = *src;
6145
6146           /* All bytes but the last one C are ASCII.  */
6147           if (! (c & 0x80))
6148             {
6149               if (c == '\r')
6150                 eol_seen |= EOL_SEEN_CR;
6151               else if (c  == '\n')
6152                 eol_seen |= EOL_SEEN_LF;
6153               src++;
6154             }
6155         }
6156     }
6157   coding->head_ascii = src - coding->source;
6158   coding->eol_seen = eol_seen;
6159   return (coding->head_ascii);
6160 }
6161
6162
6163 /* Return the number of characters at the source if all the bytes are
6164    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6165    effects, update coding->eol_seen.  The value of coding->eol_seen is
6166    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6167    the value is reliable only when all the source bytes are valid
6168    UTF-8.  */
6169
6170 static int
6171 check_utf_8 (struct coding_system *coding)
6172 {
6173   const unsigned char *src, *end;
6174   int eol_seen = coding->eol_seen;
6175   int nchars = coding->head_ascii;
6176
6177   if (coding->head_ascii < 0)
6178     check_ascii (coding);
6179   else
6180     coding_set_source (coding);
6181   src = coding->source + coding->head_ascii;
6182   /* We look ahead one byte for CR LF.  */
6183   end = coding->source + coding->src_bytes - 1;
6184
6185   while (src < end)
6186     {
6187       int c = *src;
6188
6189       if (UTF_8_1_OCTET_P (*src))
6190         {
6191           src++;
6192           if (c < 0x20)
6193             {
6194               if (c == '\r')
6195                 {
6196                   if (*src == '\n')
6197                     {
6198                       eol_seen |= EOL_SEEN_CRLF;
6199                       src++;
6200                       nchars++;
6201                     }
6202                   else
6203                     eol_seen |= EOL_SEEN_CR;
6204                 }
6205               else if (c == '\n')
6206                 eol_seen |= EOL_SEEN_LF;
6207             }
6208         }
6209       else if (UTF_8_2_OCTET_LEADING_P (c))
6210         {
6211           if (c < 0xC2          /* overlong sequence */
6212               || src + 1 >= end
6213               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6214             return -1;
6215           src += 2;
6216         }
6217       else if (UTF_8_3_OCTET_LEADING_P (c))
6218         {
6219           if (src + 2 >= end
6220               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6221                     && UTF_8_EXTRA_OCTET_P (src[2])))
6222             return -1;
6223           c = (((c & 0xF) << 12)
6224                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6225           if (c < 0x800                       /* overlong sequence */
6226               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6227             return -1;
6228           src += 3;
6229         }
6230       else if (UTF_8_4_OCTET_LEADING_P (c))
6231         {
6232           if (src + 3 >= end
6233               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6234                     && UTF_8_EXTRA_OCTET_P (src[2])
6235                     && UTF_8_EXTRA_OCTET_P (src[3])))
6236             return -1;
6237           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6238                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6239           if (c < 0x10000       /* overlong sequence */
6240               || c >= 0x110000) /* non-Unicode character  */
6241             return -1;
6242           src += 4;
6243         }
6244       else
6245         return -1;
6246       nchars++;
6247     }
6248
6249   if (src == end)
6250     {
6251       if (! UTF_8_1_OCTET_P (*src))
6252         return -1;
6253       nchars++;
6254       if (*src == '\r')
6255         eol_seen |= EOL_SEEN_CR;
6256       else if (*src  == '\n')
6257         eol_seen |= EOL_SEEN_LF;
6258     }
6259   coding->eol_seen = eol_seen;
6260   return nchars;
6261 }
6262
6263
6264 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6265    SOURCE is encoded.  If CATEGORY is one of
6266    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6267    two-byte, else they are encoded by one-byte.
6268
6269    Return one of EOL_SEEN_XXX.  */
6270
6271 #define MAX_EOL_CHECK_COUNT 3
6272
6273 static int
6274 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6275             enum coding_category category)
6276 {
6277   const unsigned char *src = source, *src_end = src + src_bytes;
6278   unsigned char c;
6279   int total  = 0;
6280   int eol_seen = EOL_SEEN_NONE;
6281
6282   if ((1 << category) & CATEGORY_MASK_UTF_16)
6283     {
6284       bool msb = category == (coding_category_utf_16_le
6285                               | coding_category_utf_16_le_nosig);
6286       bool lsb = !msb;
6287
6288       while (src + 1 < src_end)
6289         {
6290           c = src[lsb];
6291           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6292             {
6293               int this_eol;
6294
6295               if (c == '\n')
6296                 this_eol = EOL_SEEN_LF;
6297               else if (src + 3 >= src_end
6298                        || src[msb + 2] != 0
6299                        || src[lsb + 2] != '\n')
6300                 this_eol = EOL_SEEN_CR;
6301               else
6302                 {
6303                   this_eol = EOL_SEEN_CRLF;
6304                   src += 2;
6305                 }
6306
6307               if (eol_seen == EOL_SEEN_NONE)
6308                 /* This is the first end-of-line.  */
6309                 eol_seen = this_eol;
6310               else if (eol_seen != this_eol)
6311                 {
6312                   /* The found type is different from what found before.
6313                      Allow for stray ^M characters in DOS EOL files.  */
6314                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6315                       || (eol_seen == EOL_SEEN_CRLF
6316                           && this_eol == EOL_SEEN_CR))
6317                     eol_seen = EOL_SEEN_CRLF;
6318                   else
6319                     {
6320                       eol_seen = EOL_SEEN_LF;
6321                       break;
6322                     }
6323                 }
6324               if (++total == MAX_EOL_CHECK_COUNT)
6325                 break;
6326             }
6327           src += 2;
6328         }
6329     }
6330   else
6331     while (src < src_end)
6332       {
6333         c = *src++;
6334         if (c == '\n' || c == '\r')
6335           {
6336             int this_eol;
6337
6338             if (c == '\n')
6339               this_eol = EOL_SEEN_LF;
6340             else if (src >= src_end || *src != '\n')
6341               this_eol = EOL_SEEN_CR;
6342             else
6343               this_eol = EOL_SEEN_CRLF, src++;
6344
6345             if (eol_seen == EOL_SEEN_NONE)
6346               /* This is the first end-of-line.  */
6347               eol_seen = this_eol;
6348             else if (eol_seen != this_eol)
6349               {
6350                 /* The found type is different from what found before.
6351                    Allow for stray ^M characters in DOS EOL files.  */
6352                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6353                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6354                   eol_seen = EOL_SEEN_CRLF;
6355                 else
6356                   {
6357                     eol_seen = EOL_SEEN_LF;
6358                     break;
6359                   }
6360               }
6361             if (++total == MAX_EOL_CHECK_COUNT)
6362               break;
6363           }
6364       }
6365   return eol_seen;
6366 }
6367
6368
6369 static Lisp_Object
6370 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6371 {
6372   Lisp_Object eol_type;
6373
6374   eol_type = CODING_ID_EOL_TYPE (coding->id);
6375   if (! VECTORP (eol_type))
6376     /* Already adjusted.  */
6377     return eol_type;
6378   if (eol_seen & EOL_SEEN_LF)
6379     {
6380       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6381       eol_type = Qunix;
6382     }
6383   else if (eol_seen & EOL_SEEN_CRLF)
6384     {
6385       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6386       eol_type = Qdos;
6387     }
6388   else if (eol_seen & EOL_SEEN_CR)
6389     {
6390       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6391       eol_type = Qmac;
6392     }
6393   return eol_type;
6394 }
6395
6396 /* Detect how a text specified in CODING is encoded.  If a coding
6397    system is detected, update fields of CODING by the detected coding
6398    system.  */
6399
6400 static void
6401 detect_coding (struct coding_system *coding)
6402 {
6403   const unsigned char *src, *src_end;
6404   unsigned int saved_mode = coding->mode;
6405
6406   coding->consumed = coding->consumed_char = 0;
6407   coding->produced = coding->produced_char = 0;
6408   coding_set_source (coding);
6409
6410   src_end = coding->source + coding->src_bytes;
6411
6412   /* If we have not yet decided the text encoding type, detect it
6413      now.  */
6414   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6415     {
6416       int c, i;
6417       struct coding_detection_info detect_info;
6418       bool null_byte_found = 0, eight_bit_found = 0;
6419
6420       coding->head_ascii = 0;
6421       coding->eol_seen = EOL_SEEN_NONE;
6422       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6423       for (src = coding->source; src < src_end; src++)
6424         {
6425           c = *src;
6426           if (c & 0x80)
6427             {
6428               eight_bit_found = 1;
6429               if (null_byte_found)
6430                 break;
6431             }
6432           else if (c < 0x20)
6433             {
6434               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6435                   && ! inhibit_iso_escape_detection
6436                   && ! detect_info.checked)
6437                 {
6438                   if (detect_coding_iso_2022 (coding, &detect_info))
6439                     {
6440                       /* We have scanned the whole data.  */
6441                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6442                         {
6443                           /* We didn't find an 8-bit code.  We may
6444                              have found a null-byte, but it's very
6445                              rare that a binary file conforms to
6446                              ISO-2022.  */
6447                           src = src_end;
6448                           coding->head_ascii = src - coding->source;
6449                         }
6450                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6451                       break;
6452                     }
6453                 }
6454               else if (! c && !inhibit_null_byte_detection)
6455                 {
6456                   null_byte_found = 1;
6457                   if (eight_bit_found)
6458                     break;
6459                 }
6460               else if (! disable_ascii_optimization
6461                        && ! inhibit_eol_conversion)
6462                 {
6463                   if (c == '\r')
6464                     {
6465                       if (src < src_end && src[1] == '\n')
6466                         {
6467                           coding->eol_seen |= EOL_SEEN_CRLF;
6468                           src++;
6469                           if (! eight_bit_found)
6470                             coding->head_ascii++;
6471                         }
6472                       else
6473                         coding->eol_seen |= EOL_SEEN_CR;
6474                     }
6475                   else if (c == '\n')
6476                     {
6477                       coding->eol_seen |= EOL_SEEN_LF;
6478                     }
6479                 }
6480
6481               if (! eight_bit_found)
6482                 coding->head_ascii++;
6483             }
6484           else if (! eight_bit_found)
6485             coding->head_ascii++;
6486         }
6487
6488       if (null_byte_found || eight_bit_found
6489           || coding->head_ascii < coding->src_bytes
6490           || detect_info.found)
6491         {
6492           enum coding_category category;
6493           struct coding_system *this;
6494
6495           if (coding->head_ascii == coding->src_bytes)
6496             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6497             for (i = 0; i < coding_category_raw_text; i++)
6498               {
6499                 category = coding_priorities[i];
6500                 this = coding_categories + category;
6501                 if (detect_info.found & (1 << category))
6502                   break;
6503               }
6504           else
6505             {
6506               if (null_byte_found)
6507                 {
6508                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6509                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6510                 }
6511               for (i = 0; i < coding_category_raw_text; i++)
6512                 {
6513                   category = coding_priorities[i];
6514                   this = coding_categories + category;
6515                   /* Some of this->detector (e.g. detect_coding_sjis)
6516                      require this information.  */
6517                   coding->id = this->id;
6518                   if (this->id < 0)
6519                     {
6520                       /* No coding system of this category is defined.  */
6521                       detect_info.rejected |= (1 << category);
6522                     }
6523                   else if (category >= coding_category_raw_text)
6524                     continue;
6525                   else if (detect_info.checked & (1 << category))
6526                     {
6527                       if (detect_info.found & (1 << category))
6528                         break;
6529                     }
6530                   else if ((*(this->detector)) (coding, &detect_info)
6531                            && detect_info.found & (1 << category))
6532                     {
6533                       if (category == coding_category_utf_16_auto)
6534                         {
6535                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6536                             category = coding_category_utf_16_le;
6537                           else
6538                             category = coding_category_utf_16_be;
6539                         }
6540                       break;
6541                     }
6542                 }
6543             }
6544
6545           if (i < coding_category_raw_text)
6546             setup_coding_system (CODING_ID_NAME (this->id), coding);
6547           else if (null_byte_found)
6548             setup_coding_system (Qno_conversion, coding);
6549           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6550                    == CATEGORY_MASK_ANY)
6551             setup_coding_system (Qraw_text, coding);
6552           else if (detect_info.rejected)
6553             for (i = 0; i < coding_category_raw_text; i++)
6554               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6555                 {
6556                   this = coding_categories + coding_priorities[i];
6557                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6558                   break;
6559                 }
6560         }
6561     }
6562   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6563            == coding_category_utf_8_auto)
6564     {
6565       Lisp_Object coding_systems;
6566       struct coding_detection_info detect_info;
6567
6568       coding_systems
6569         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6570       detect_info.found = detect_info.rejected = 0;
6571       if (check_ascii (coding) == coding->src_bytes)
6572         {
6573           int head_ascii = coding->head_ascii;
6574
6575           if (coding->eol_seen != EOL_SEEN_NONE)
6576             adjust_coding_eol_type (coding, coding->eol_seen);
6577           setup_coding_system (XCDR (coding_systems), coding);
6578           coding->head_ascii = head_ascii;
6579         }
6580       else
6581         {
6582           if (CONSP (coding_systems)
6583               && detect_coding_utf_8 (coding, &detect_info))
6584             {
6585               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6586                 setup_coding_system (XCAR (coding_systems), coding);
6587               else
6588                 setup_coding_system (XCDR (coding_systems), coding);
6589             }
6590         }
6591     }
6592   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6593            == coding_category_utf_16_auto)
6594     {
6595       Lisp_Object coding_systems;
6596       struct coding_detection_info detect_info;
6597
6598       coding_systems
6599         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6600       detect_info.found = detect_info.rejected = 0;
6601       coding->head_ascii = 0;
6602       coding->eol_seen = EOL_SEEN_NONE;
6603       if (CONSP (coding_systems)
6604           && detect_coding_utf_16 (coding, &detect_info))
6605         {
6606           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6607             setup_coding_system (XCAR (coding_systems), coding);
6608           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6609             setup_coding_system (XCDR (coding_systems), coding);
6610         }
6611     }
6612   coding->mode = saved_mode;
6613 }
6614
6615
6616 static void
6617 decode_eol (struct coding_system *coding)
6618 {
6619   Lisp_Object eol_type;
6620   unsigned char *p, *pbeg, *pend;
6621
6622   eol_type = CODING_ID_EOL_TYPE (coding->id);
6623   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6624     return;
6625
6626   if (NILP (coding->dst_object))
6627     pbeg = coding->destination;
6628   else
6629     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6630   pend = pbeg + coding->produced;
6631
6632   if (VECTORP (eol_type))
6633     {
6634       int eol_seen = EOL_SEEN_NONE;
6635
6636       for (p = pbeg; p < pend; p++)
6637         {
6638           if (*p == '\n')
6639             eol_seen |= EOL_SEEN_LF;
6640           else if (*p == '\r')
6641             {
6642               if (p + 1 < pend && *(p + 1) == '\n')
6643                 {
6644                   eol_seen |= EOL_SEEN_CRLF;
6645                   p++;
6646                 }
6647               else
6648                 eol_seen |= EOL_SEEN_CR;
6649             }
6650         }
6651       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6652       if ((eol_seen & EOL_SEEN_CRLF) != 0
6653           && (eol_seen & EOL_SEEN_CR) != 0
6654           && (eol_seen & EOL_SEEN_LF) == 0)
6655         eol_seen = EOL_SEEN_CRLF;
6656       else if (eol_seen != EOL_SEEN_NONE
6657           && eol_seen != EOL_SEEN_LF
6658           && eol_seen != EOL_SEEN_CRLF
6659           && eol_seen != EOL_SEEN_CR)
6660         eol_seen = EOL_SEEN_LF;
6661       if (eol_seen != EOL_SEEN_NONE)
6662         eol_type = adjust_coding_eol_type (coding, eol_seen);
6663     }
6664
6665   if (EQ (eol_type, Qmac))
6666     {
6667       for (p = pbeg; p < pend; p++)
6668         if (*p == '\r')
6669           *p = '\n';
6670     }
6671   else if (EQ (eol_type, Qdos))
6672     {
6673       ptrdiff_t n = 0;
6674
6675       if (NILP (coding->dst_object))
6676         {
6677           /* Start deleting '\r' from the tail to minimize the memory
6678              movement.  */
6679           for (p = pend - 2; p >= pbeg; p--)
6680             if (*p == '\r')
6681               {
6682                 memmove (p, p + 1, pend-- - p - 1);
6683                 n++;
6684               }
6685         }
6686       else
6687         {
6688           ptrdiff_t pos_byte = coding->dst_pos_byte;
6689           ptrdiff_t pos = coding->dst_pos;
6690           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6691
6692           while (pos < pos_end)
6693             {
6694               p = BYTE_POS_ADDR (pos_byte);
6695               if (*p == '\r' && p[1] == '\n')
6696                 {
6697                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6698                   n++;
6699                   pos_end--;
6700                 }
6701               pos++;
6702               if (coding->dst_multibyte)
6703                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6704               else
6705                 pos_byte++;
6706             }
6707         }
6708       coding->produced -= n;
6709       coding->produced_char -= n;
6710     }
6711 }
6712
6713
6714 /* Return a translation table (or list of them) from coding system
6715    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6716    not ENCODEP). */
6717
6718 static Lisp_Object
6719 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6720 {
6721   Lisp_Object standard, translation_table;
6722   Lisp_Object val;
6723
6724   if (NILP (Venable_character_translation))
6725     {
6726       if (max_lookup)
6727         *max_lookup = 0;
6728       return Qnil;
6729     }
6730   if (encodep)
6731     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6732       standard = Vstandard_translation_table_for_encode;
6733   else
6734     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6735       standard = Vstandard_translation_table_for_decode;
6736   if (NILP (translation_table))
6737     translation_table = standard;
6738   else
6739     {
6740       if (SYMBOLP (translation_table))
6741         translation_table = Fget (translation_table, Qtranslation_table);
6742       else if (CONSP (translation_table))
6743         {
6744           translation_table = Fcopy_sequence (translation_table);
6745           for (val = translation_table; CONSP (val); val = XCDR (val))
6746             if (SYMBOLP (XCAR (val)))
6747               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6748         }
6749       if (CHAR_TABLE_P (standard))
6750         {
6751           if (CONSP (translation_table))
6752             translation_table = nconc2 (translation_table,
6753                                         Fcons (standard, Qnil));
6754           else
6755             translation_table = Fcons (translation_table,
6756                                        Fcons (standard, Qnil));
6757         }
6758     }
6759
6760   if (max_lookup)
6761     {
6762       *max_lookup = 1;
6763       if (CHAR_TABLE_P (translation_table)
6764           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6765         {
6766           val = XCHAR_TABLE (translation_table)->extras[1];
6767           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6768             *max_lookup = XFASTINT (val);
6769         }
6770       else if (CONSP (translation_table))
6771         {
6772           Lisp_Object tail;
6773
6774           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6775             if (CHAR_TABLE_P (XCAR (tail))
6776                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6777               {
6778                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6779                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6780                   *max_lookup = XFASTINT (tailval);
6781               }
6782         }
6783     }
6784   return translation_table;
6785 }
6786
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set TRANS to the result.  C and TRANS must be
   lvalues.

   Whenever a lookup yields a character, C is replaced by that
   character and TRANS is reset to Qnil.  For a list, the tables are
   consulted in order (elements that are not char-tables are
   skipped), each with the current value of C, and the walk stops at
   the first table that yields a value that is neither nil nor a
   character; TRANS is then that value.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        tail = table;                                           \
        while (CONSP (tail))                                    \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tail)))                     \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tail), c);        \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tail = XCDR (tail);                                 \
          }                                                     \
      }                                                         \
  } while (0)
6811
6812
6813 /* Return a translation of character(s) at BUF according to TRANS.
6814    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6815    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6816    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6817    translation is found, and Qnil if not found..
6818    If BUF is too short to lookup characters in FROM, return Qt.  */
6819
6820 static Lisp_Object
6821 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6822 {
6823
6824   if (INTEGERP (trans))
6825     return trans;
6826   for (; CONSP (trans); trans = XCDR (trans))
6827     {
6828       Lisp_Object val = XCAR (trans);
6829       Lisp_Object from = XCAR (val);
6830       ptrdiff_t len = ASIZE (from);
6831       ptrdiff_t i;
6832
6833       for (i = 0; i < len; i++)
6834         {
6835           if (buf + i == buf_end)
6836             return Qt;
6837           if (XINT (AREF (from, i)) != buf[i])
6838             break;
6839         }
6840       if (i == len)
6841         return val;
6842     }
6843   return Qnil;
6844 }
6845
6846
/* Store the text decoded so far at CODING->destination +
   CODING->produced, enlarging the destination with alloc_destination
   when it is too small.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf (CODING->charbuf_used elements) and translated
   through TRANSLATION_TABLE; negative elements start annotation data
   and are skipped.  Otherwise the CODING->consumed bytes at
   CODING->source are copied, converted byte by byte when
   CODING->src_multibyte and CODING->dst_multibyte differ.

   LAST_BLOCK zero means more source may follow: when a multi-character
   translation cannot be decided because CODING->charbuf ends too
   early, processing stops there.

   CODING->produced and CODING->produced_char are updated, and
   insert_from_gap is called when CODING->dst_object is a buffer.
   Return the number of elements of CODING->charbuf left unprocessed
   (always 0 when CODING->chars_at_source is nonzero).  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               bool last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced;
  ptrdiff_t produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination are the same object: the
             destination may grow only up to the end of the source
             bytes already consumed.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf;
          ptrdiff_t i;

          if (c >= 0)
            {
              /* Number of characters consumed from BUF and produced
                 at DST by this step.  */
              ptrdiff_t from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is ([FROM-CHAR ...] . TO).  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  else if (EQ (trans, Qt) && ! last_block)
                    /* BUF is too short to decide the translation;
                       leave the rest as carryover.  */
                    break;
                }

              if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
                {
                  /* Make sure the size requested below cannot
                     overflow.  */
                  if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
                       / MAX_MULTIBYTE_LENGTH)
                      < to_nchars)
                    memory_full (SIZE_MAX);
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* C already holds the first character; the
                     following ones come from the vector TRANS.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* NOTE(review): multibytep, consumed_chars and src_base
                 are not referenced directly here; presumably they are
                 used by ONE_MORE_BYTE, which is defined elsewhere and
                 presumably jumps to no_more_source when the source is
                 exhausted -- confirm against the macro.  */
              bool multibytep = 1;
              ptrdiff_t consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          /* Remember the position as an offset: the
                             source pointers are recomputed after the
                             allocation.  */
                          ptrdiff_t offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->consumed;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            while (src < src_end)
              {
                /* NOTE(review): multibytep is presumably read by
                   EMIT_ONE_BYTE (defined elsewhere) -- confirm.  */
                bool multibytep = 1;
                int c = *src++;

                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        ptrdiff_t offset = src - coding->source;
                        ptrdiff_t more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->consumed;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Source and destination have the same multibyteness: the
             bytes are copied verbatim.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              ptrdiff_t require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  ptrdiff_t offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->consumed;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes stored by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced, 0);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
7045
7046 /* Compose text in CODING->object according to the annotation data at
7047    CHARBUF.  CHARBUF is an array:
7048      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7049  */
7050
7051 static void
7052 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7053 {
7054   int len;
7055   ptrdiff_t to;
7056   enum composition_method method;
7057   Lisp_Object components;
7058
7059   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7060   to = pos + charbuf[2];
7061   method = (enum composition_method) (charbuf[4]);
7062
7063   if (method == COMPOSITION_RELATIVE)
7064     components = Qnil;
7065   else
7066     {
7067       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7068       int i, j;
7069
7070       if (method == COMPOSITION_WITH_RULE)
7071         len = charbuf[2] * 3 - 2;
7072       charbuf += MAX_ANNOTATION_LENGTH;
7073       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7074       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7075         {
7076           if (charbuf[i] >= 0)
7077             args[j] = make_number (charbuf[i]);
7078           else
7079             {
7080               i++;
7081               args[j] = make_number (charbuf[i] % 0x100);
7082             }
7083         }
7084       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7085     }
7086   compose_text (pos, to, components, Qnil, coding->dst_object);
7087 }
7088
7089
7090 /* Put `charset' property on text in CODING->object according to
7091    the annotation data at CHARBUF.  CHARBUF is an array:
7092      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7093  */
7094
7095 static void
7096 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7097 {
7098   ptrdiff_t from = pos - charbuf[2];
7099   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7100
7101   Fput_text_property (make_number (from), make_number (pos),
7102                       Qcharset, CHARSET_NAME (charset),
7103                       coding->dst_object);
7104 }
7105
7106
/* Number of `int' elements in the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Make CODING->charbuf point to a work area of CHARBUF_SIZE `int'
   elements obtained with SAFE_ALLOCA, and record that size in
   CODING->charbuf_size.  CODING is a pointer expression; it is
   evaluated twice.
   NOTE(review): SAFE_ALLOCA presumably requires USE_SAFE_ALLOCA in
   the calling function -- confirm against its definition.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));      \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
7114
7115
7116 static void
7117 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7118 {
7119   int *charbuf = coding->charbuf;
7120   int *charbuf_end = charbuf + coding->charbuf_used;
7121
7122   if (NILP (coding->dst_object))
7123     return;
7124
7125   while (charbuf < charbuf_end)
7126     {
7127       if (*charbuf >= 0)
7128         pos++, charbuf++;
7129       else
7130         {
7131           int len = -*charbuf;
7132
7133           if (len > 2)
7134             switch (charbuf[1])
7135               {
7136               case CODING_ANNOTATE_COMPOSITION_MASK:
7137                 produce_composition (coding, charbuf, pos);
7138                 break;
7139               case CODING_ANNOTATE_CHARSET_MASK:
7140                 produce_charset (coding, charbuf, pos);
7141                 break;
7142               }
7143           charbuf += len;
7144         }
7145     }
7146 }
7147
7148 /* Decode the data at CODING->src_object into CODING->dst_object.
7149    CODING->src_object is a buffer, a string, or nil.
7150    CODING->dst_object is a buffer.
7151
7152    If CODING->src_object is a buffer, it must be the current buffer.
7153    In this case, if CODING->src_pos is positive, it is a position of
7154    the source text in the buffer, otherwise, the source text is in the
7155    gap area of the buffer, and CODING->src_pos specifies the offset of
7156    the text from GPT (which must be the same as PT).  If this is the
7157    same buffer as CODING->dst_object, CODING->src_pos must be
7158    negative.
7159
7160    If CODING->src_object is a string, CODING->src_pos is an index to
7161    that string.
7162
7163    If CODING->src_object is nil, CODING->source must already point to
7164    the non-relocatable memory area.  In this case, CODING->src_pos is
7165    an offset from CODING->source.
7166
7167    The decoded data is inserted at the current point of the buffer
7168    CODING->dst_object.
7169 */
7170
7171 static void
7172 decode_coding (struct coding_system *coding)
7173 {
7174   Lisp_Object attrs;
7175   Lisp_Object undo_list;
7176   Lisp_Object translation_table;
7177   struct ccl_spec cclspec;
7178   int carryover;
7179   int i;
7180
7181   USE_SAFE_ALLOCA;
7182
7183   if (BUFFERP (coding->src_object)
7184       && coding->src_pos > 0
7185       && coding->src_pos < GPT
7186       && coding->src_pos + coding->src_chars > GPT)
7187     move_gap_both (coding->src_pos, coding->src_pos_byte);
7188
7189   undo_list = Qt;
7190   if (BUFFERP (coding->dst_object))
7191     {
7192       set_buffer_internal (XBUFFER (coding->dst_object));
7193       if (GPT != PT)
7194         move_gap_both (PT, PT_BYTE);
7195
7196       /* We must disable undo_list in order to record the whole insert
7197          transaction via record_insert at the end.  But doing so also
7198          disables the recording of the first change to the undo_list.
7199          Therefore we check for first change here and record it via
7200          record_first_change if needed.  */
7201       if (MODIFF <= SAVE_MODIFF)
7202         record_first_change ();
7203
7204       undo_list = BVAR (current_buffer, undo_list);
7205       bset_undo_list (current_buffer, Qt);
7206     }
7207
7208   coding->consumed = coding->consumed_char = 0;
7209   coding->produced = coding->produced_char = 0;
7210   coding->chars_at_source = 0;
7211   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7212   coding->errors = 0;
7213
7214   ALLOC_CONVERSION_WORK_AREA (coding);
7215
7216   attrs = CODING_ID_ATTRS (coding->id);
7217   translation_table = get_translation_table (attrs, 0, NULL);
7218
7219   carryover = 0;
7220   if (coding->decoder == decode_coding_ccl)
7221     {
7222       coding->spec.ccl = &cclspec;
7223       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7224     }
7225   do
7226     {
7227       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7228
7229       coding_set_source (coding);
7230       coding->annotated = 0;
7231       coding->charbuf_used = carryover;
7232       (*(coding->decoder)) (coding);
7233       coding_set_destination (coding);
7234       carryover = produce_chars (coding, translation_table, 0);
7235       if (coding->annotated)
7236         produce_annotation (coding, pos);
7237       for (i = 0; i < carryover; i++)
7238         coding->charbuf[i]
7239           = coding->charbuf[coding->charbuf_used - carryover + i];
7240     }
7241   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7242          || (coding->consumed < coding->src_bytes
7243              && (coding->result == CODING_RESULT_SUCCESS
7244                  || coding->result == CODING_RESULT_INVALID_SRC)));
7245
7246   if (carryover > 0)
7247     {
7248       coding_set_destination (coding);
7249       coding->charbuf_used = carryover;
7250       produce_chars (coding, translation_table, 1);
7251     }
7252
7253   coding->carryover_bytes = 0;
7254   if (coding->consumed < coding->src_bytes)
7255     {
7256       int nbytes = coding->src_bytes - coding->consumed;
7257       const unsigned char *src;
7258
7259       coding_set_source (coding);
7260       coding_set_destination (coding);
7261       src = coding->source + coding->consumed;
7262
7263       if (coding->mode & CODING_MODE_LAST_BLOCK)
7264         {
7265           /* Flush out unprocessed data as binary chars.  We are sure
7266              that the number of data is less than the size of
7267              coding->charbuf.  */
7268           coding->charbuf_used = 0;
7269           coding->chars_at_source = 0;
7270
7271           while (nbytes-- > 0)
7272             {
7273               int c = *src++;
7274
7275               if (c & 0x80)
7276                 c = BYTE8_TO_CHAR (c);
7277               coding->charbuf[coding->charbuf_used++] = c;
7278             }
7279           produce_chars (coding, Qnil, 1);
7280         }
7281       else
7282         {
7283           /* Record unprocessed bytes in coding->carryover.  We are
7284              sure that the number of data is less than the size of
7285              coding->carryover.  */
7286           unsigned char *p = coding->carryover;
7287
7288           if (nbytes > sizeof coding->carryover)
7289             nbytes = sizeof coding->carryover;
7290           coding->carryover_bytes = nbytes;
7291           while (nbytes-- > 0)
7292             *p++ = *src++;
7293         }
7294       coding->consumed = coding->src_bytes;
7295     }
7296
7297   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7298       && !inhibit_eol_conversion)
7299     decode_eol (coding);
7300   if (BUFFERP (coding->dst_object))
7301     {
7302       bset_undo_list (current_buffer, undo_list);
7303       record_insert (coding->dst_pos, coding->produced_char);
7304     }
7305
7306   SAFE_FREE ();
7307 }
7308
7309
7310 /* Extract an annotation datum from a composition starting at POS and
7311    ending before LIMIT of CODING->src_object (buffer or string), store
7312    the data in BUF, set *STOP to a starting position of the next
7313    composition (if any) or to LIMIT, and return the address of the
7314    next element of BUF.
7315
7316    If such an annotation is not found, set *STOP to a starting
7317    position of a composition after POS (if any) or to LIMIT, and
7318    return BUF.  */
7319
7320 static int *
7321 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7322                                struct coding_system *coding, int *buf,
7323                                ptrdiff_t *stop)
7324 {
7325   ptrdiff_t start, end;
7326   Lisp_Object prop;
7327
7328   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7329       || end > limit)
7330     *stop = limit;
7331   else if (start > pos)
7332     *stop = start;
7333   else
7334     {
7335       if (start == pos)
7336         {
7337           /* We found a composition.  Store the corresponding
7338              annotation data in BUF.  */
7339           int *head = buf;
7340           enum composition_method method = COMPOSITION_METHOD (prop);
7341           int nchars = COMPOSITION_LENGTH (prop);
7342
7343           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7344           if (method != COMPOSITION_RELATIVE)
7345             {
7346               Lisp_Object components;
7347               ptrdiff_t i, len, i_byte;
7348
7349               components = COMPOSITION_COMPONENTS (prop);
7350               if (VECTORP (components))
7351                 {
7352                   len = ASIZE (components);
7353                   for (i = 0; i < len; i++)
7354                     *buf++ = XINT (AREF (components, i));
7355                 }
7356               else if (STRINGP (components))
7357                 {
7358                   len = SCHARS (components);
7359                   i = i_byte = 0;
7360                   while (i < len)
7361                     {
7362                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7363                       buf++;
7364                     }
7365                 }
7366               else if (INTEGERP (components))
7367                 {
7368                   len = 1;
7369                   *buf++ = XINT (components);
7370                 }
7371               else if (CONSP (components))
7372                 {
7373                   for (len = 0; CONSP (components);
7374                        len++, components = XCDR (components))
7375                     *buf++ = XINT (XCAR (components));
7376                 }
7377               else
7378                 emacs_abort ();
7379               *head -= len;
7380             }
7381         }
7382
7383       if (find_composition (end, limit, &start, &end, &prop,
7384                             coding->src_object)
7385           && end <= limit)
7386         *stop = start;
7387       else
7388         *stop = limit;
7389     }
7390   return buf;
7391 }
7392
7393
7394 /* Extract an annotation datum from a text property `charset' at POS of
7395    CODING->src_object (buffer of string), store the data in BUF, set
7396    *STOP to the position where the value of `charset' property changes
7397    (limiting by LIMIT), and return the address of the next element of
7398    BUF.
7399
7400    If the property value is nil, set *STOP to the position where the
7401    property value is non-nil (limiting by LIMIT), and return BUF.  */
7402
7403 static int *
7404 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7405                            struct coding_system *coding, int *buf,
7406                            ptrdiff_t *stop)
7407 {
7408   Lisp_Object val, next;
7409   int id;
7410
7411   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7412   if (! NILP (val) && CHARSETP (val))
7413     id = XINT (CHARSET_SYMBOL_ID (val));
7414   else
7415     id = -1;
7416   ADD_CHARSET_DATA (buf, 0, id);
7417   next = Fnext_single_property_change (make_number (pos), Qcharset,
7418                                        coding->src_object,
7419                                        make_number (limit));
7420   *stop = XINT (next);
7421   return buf;
7422 }
7423
7424
/* Fill CODING->charbuf with characters read from CODING->source, for
   use by the encoder that encode_coding calls afterwards.

   Reading starts CODING->consumed bytes (CODING->consumed_char
   characters) into the source and goes on until the source text is
   exhausted or the charbuf is nearly full.  Along the way:

     - if CODING->common_flags asks for charset annotation and
       CODING->src_object is non-nil, annotation data is stored at
       each position where handle_charset_annotation reports a stop
       (composition annotation is switched off here, see below);
     - '\n' is replaced according to the EOL type of the coding
       system, unless inhibit_eol_conversion is set or the EOL type is
       still a vector;
     - characters are mapped through TRANSLATION_TABLE when it is
       non-nil; MAX_LOOKUP is the size of the buffer used to look
       ahead in the source when a translation is looked up.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe what has been read and stored, and
   CODING->chars_at_source is 0.  */
7425 static void
7426 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7427                int max_lookup)
7428 {
7429   int *buf = coding->charbuf;
7430   int *buf_end = coding->charbuf + coding->charbuf_size;
7431   const unsigned char *src = coding->source + coding->consumed;
7432   const unsigned char *src_end = coding->source + coding->src_bytes;
7433   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7434   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7435   bool multibytep = coding->src_multibyte;
7436   Lisp_Object eol_type;
7437   int c;
7438   ptrdiff_t stop, stop_composition, stop_charset;
7439   int *lookup_buf = NULL;
7440
       /* The look-ahead buffer is needed only when there is a table to
          consult.  */
7441   if (! NILP (translation_table))
7442     lookup_buf = alloca (sizeof (int) * max_lookup);
7443
       /* A vector EOL type is treated like Qunix, i.e. newlines are left
          alone.  */
7444   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7445   if (VECTORP (eol_type))
7446     eol_type = Qunix;
7447
7448   /* Note: composition handling is not yet implemented.  */
7449   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7450
       /* STOP is the next position at which an annotation handler has to
          be consulted; END_POS means "never".  */
7451   if (NILP (coding->src_object))
7452     stop = stop_composition = stop_charset = end_pos;
7453   else
7454     {
7455       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7456         stop = stop_composition = pos;
7457       else
7458         stop = stop_composition = end_pos;
7459       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7460         stop = stop_charset = pos;
7461       else
7462         stop_charset = end_pos;
7463     }
7464
7465   /* Compensate for CRLF and conversion.  */
7466   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7467   while (buf < buf_end)
7468     {
7469       Lisp_Object trans;
7470
7471       if (pos == stop)
7472         {
7473           if (pos == end_pos)
7474             break;
7475           if (pos == stop_composition)
7476             buf = handle_composition_annotation (pos, end_pos, coding,
7477                                                  buf, &stop_composition);
7478           if (pos == stop_charset)
7479             buf = handle_charset_annotation (pos, end_pos, coding,
7480                                              buf, &stop_charset);
7481           stop = (stop_composition < stop_charset
7482                   ? stop_composition : stop_charset);
7483         }
7484
           /* Fetch the next character C.  For a unibyte source, the
              raw-text and CCL encoders get each byte as is; otherwise a
              byte sequence that MULTIBYTE_LENGTH accepts is read as one
              character, and any other byte becomes an eight-bit
              character.  */
7485       if (! multibytep)
7486         {
7487           int bytes;
7488
7489           if (coding->encoder == encode_coding_raw_text
7490               || coding->encoder == encode_coding_ccl)
7491             c = *src++, pos++;
7492           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7493             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7494           else
7495             c = BYTE8_TO_CHAR (*src), src++, pos++;
7496         }
7497       else
7498         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7499       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7500         c = '\n';
           /* EOL conversion: for Qdos a '\r' is emitted in front of the
              '\n'; for any other non-unix type the '\n' itself becomes
              '\r'.  */
7501       if (! EQ (eol_type, Qunix))
7502         {
7503           if (c == '\n')
7504             {
7505               if (EQ (eol_type, Qdos))
7506                 *buf++ = '\r';
7507               else
7508                 c = '\r';
7509             }
7510         }
7511
7512       trans = Qnil;
7513       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7514       if (NILP (trans))
7515         *buf++ = c;
7516       else
7517         {
7518           ptrdiff_t from_nchars = 1, to_nchars = 1;
7519           int *lookup_buf_end;
7520           const unsigned char *p = src;
7521           int i;
7522
               /* Collect C and up to MAX_LOOKUP - 1 following characters
                  so that get_translation can match a sequence.  */
7523           lookup_buf[0] = c;
7524           for (i = 1; i < max_lookup && p < src_end; i++)
7525             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7526           lookup_buf_end = lookup_buf + i;
7527           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7528           if (INTEGERP (trans))
7529             c = XINT (trans);
7530           else if (CONSP (trans))
7531             {
                   /* (FROM . TO): FROM is a vector whose length is the
                      number of source characters replaced; TO is either
                      a single character or a vector of characters.  */
7532               from_nchars = ASIZE (XCAR (trans));
7533               trans = XCDR (trans);
7534               if (INTEGERP (trans))
7535                 c = XINT (trans);
7536               else
7537                 {
7538                   to_nchars = ASIZE (trans);
7539                   if (buf_end - buf < to_nchars)
7540                     break;
7541                   c = XINT (AREF (trans, 0));
7542                 }
7543             }
7544           else
7545             break;
7546           *buf++ = c;
7547           for (i = 1; i < to_nchars; i++)
7548             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining source characters that the
                  translation consumed.  */
7549           for (i = 1; i < from_nchars; i++, pos++)
7550             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7551         }
7552     }
7553
7554   coding->consumed = src - coding->source;
7555   coding->consumed_char = pos - coding->src_pos;
7556   coding->charbuf_used = buf - coding->charbuf;
7557   coding->chars_at_source = 0;
7558 }
7559
7560
7561 /* Encode the text at CODING->src_object into CODING->dst_object.
7562    CODING->src_object is a buffer or a string.
7563    CODING->dst_object is a buffer or nil.
7564
7565    If CODING->src_object is a buffer, it must be the current buffer.
7566    In this case, if CODING->src_pos is positive, it is a position of
7567    the source text in the buffer, otherwise. the source text is in the
7568    gap area of the buffer, and coding->src_pos specifies the offset of
7569    the text from GPT (which must be the same as PT).  If this is the
7570    same buffer as CODING->dst_object, CODING->src_pos must be
7571    negative and CODING should not have `pre-write-conversion'.
7572
7573    If CODING->src_object is a string, CODING should not have
7574    `pre-write-conversion'.
7575
7576    If CODING->dst_object is a buffer, the encoded data is inserted at
7577    the current point of that buffer.
7578
7579    If CODING->dst_object is nil, the encoded data is placed at the
7580    memory area specified by CODING->destination.  */
7581
7582 static void
7583 encode_coding (struct coding_system *coding)
7584 {
7585   Lisp_Object attrs;
7586   Lisp_Object translation_table;
7587   int max_lookup;
7588   struct ccl_spec cclspec;
7589
7590   USE_SAFE_ALLOCA;
7591
7592   attrs = CODING_ID_ATTRS (coding->id);
7593   if (coding->encoder == encode_coding_raw_text)
7594     translation_table = Qnil, max_lookup = 0;
7595   else
7596     translation_table = get_translation_table (attrs, 1, &max_lookup);
7597
7598   if (BUFFERP (coding->dst_object))
7599     {
7600       set_buffer_internal (XBUFFER (coding->dst_object));
7601       coding->dst_multibyte
7602         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7603     }
7604
7605   coding->consumed = coding->consumed_char = 0;
7606   coding->produced = coding->produced_char = 0;
7607   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7608   coding->errors = 0;
7609
7610   ALLOC_CONVERSION_WORK_AREA (coding);
7611
7612   if (coding->encoder == encode_coding_ccl)
7613     {
7614       coding->spec.ccl = &cclspec;
7615       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7616     }
7617   do {
7618     coding_set_source (coding);
7619     consume_chars (coding, translation_table, max_lookup);
7620     coding_set_destination (coding);
7621     (*(coding->encoder)) (coding);
7622   } while (coding->consumed_char < coding->src_chars);
7623
7624   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7625     insert_from_gap (coding->produced_char, coding->produced, 0);
7626
7627   SAFE_FREE ();
7628 }
7629
7630
7631 /* Name of the work buffer for code conversion.  It is also the base
        name from which make_conversion_work_buffer generates the names of
        additional work buffers.  */
7632 static Lisp_Object Vcode_conversion_workbuf_name;
7633
7634 /* A working buffer used by the top level conversion.  It has the
7635    name Vcode_conversion_workbuf_name and is kept for reuse instead of
7636    being killed after each conversion (make_conversion_work_buffer
7637    creates it again if it is found dead).  The other working buffers
7638    are killed by code_conversion_restore once their use is finished.  */
7639 static Lisp_Object Vcode_conversion_reused_workbuf;
7640
7641 /* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
        make_conversion_work_buffer and cleared by code_conversion_restore.  */
7642 static bool reused_workbuf_in_use;
7643
7644
7645 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7646    multibyteness of returning buffer.  */
7647
7648 static Lisp_Object
7649 make_conversion_work_buffer (bool multibyte)
7650 {
7651   Lisp_Object name, workbuf;
7652   struct buffer *current;
7653
7654   if (reused_workbuf_in_use)
7655     {
7656       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7657       workbuf = Fget_buffer_create (name);
7658     }
7659   else
7660     {
7661       reused_workbuf_in_use = 1;
7662       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7663         Vcode_conversion_reused_workbuf
7664           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7665       workbuf = Vcode_conversion_reused_workbuf;
7666     }
7667   current = current_buffer;
7668   set_buffer_internal (XBUFFER (workbuf));
7669   /* We can't allow modification hooks to run in the work buffer.  For
7670      instance, directory_files_internal assumes that file decoding
7671      doesn't compile new regexps.  */
7672   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7673   Ferase_buffer ();
7674   bset_undo_list (current_buffer, Qt);
7675   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7676   set_buffer_internal (current);
7677   return workbuf;
7678 }
7679
7680
7681 static Lisp_Object
7682 code_conversion_restore (Lisp_Object arg)
7683 {
7684   Lisp_Object current, workbuf;
7685   struct gcpro gcpro1;
7686
7687   GCPRO1 (arg);
7688   current = XCAR (arg);
7689   workbuf = XCDR (arg);
7690   if (! NILP (workbuf))
7691     {
7692       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7693         reused_workbuf_in_use = 0;
7694       else
7695         Fkill_buffer (workbuf);
7696     }
7697   set_buffer_internal (XBUFFER (current));
7698   UNGCPRO;
7699   return Qnil;
7700 }
7701
7702 Lisp_Object
7703 code_conversion_save (bool with_work_buf, bool multibyte)
7704 {
7705   Lisp_Object workbuf = Qnil;
7706
7707   if (with_work_buf)
7708     workbuf = make_conversion_work_buffer (multibyte);
7709   record_unwind_protect (code_conversion_restore,
7710                          Fcons (Fcurrent_buffer (), workbuf));
7711   return workbuf;
7712 }
7713
/* Decode, by coding context CODING, CHARS characters (BYTES bytes) of
   text that lies in the gap area of the current buffer, and insert
   the result at point.  The current buffer is both the source and
   the destination; the source is described to decode_coding by
   negative CODING->src_pos and CODING->src_pos_byte.

   When ASCII optimization is not disabled, the source is unibyte, the
   coding system is ASCII compatible and has neither a
   post-read-conversion function nor a decoding translation table,
   and check_ascii (or check_utf_8 for a utf-8 coding system) returns
   a non-negative character count, the bytes are not run through the
   decoder at all: EOL conversion is done in place and the text is
   inserted directly from the gap.

   Otherwise the text is decoded by decode_coding as the last block,
   and the post-read-conversion function, if any, is called on the
   result.  CODING->produced and CODING->produced_char are set in
   both cases.  */
7714 void
7715 decode_coding_gap (struct coding_system *coding,
7716                    ptrdiff_t chars, ptrdiff_t bytes)
7717 {
7718   ptrdiff_t count = SPECPDL_INDEX ();
7719   Lisp_Object attrs;
7720
7721   coding->src_object = Fcurrent_buffer ();
7722   coding->src_chars = chars;
7723   coding->src_bytes = bytes;
7724   coding->src_pos = -chars;
7725   coding->src_pos_byte = -bytes;
7726   coding->src_multibyte = chars < bytes;
7727   coding->dst_object = coding->src_object;
7728   coding->dst_pos = PT;
7729   coding->dst_pos_byte = PT_BYTE;
7730   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7731
7732   if (CODING_REQUIRE_DETECTION (coding))
7733     detect_coding (coding);
7734   attrs = CODING_ID_ATTRS (coding->id);
7735   if (! disable_ascii_optimization
7736       && ! coding->src_multibyte
7737       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7738       && NILP (CODING_ATTR_POST_READ (attrs))
7739       && NILP (get_translation_table (attrs, 0, NULL)))
7740     {
           /* A negative CODING->head_ascii means the source has to be
              scanned by check_ascii here.  */
7741       chars = coding->head_ascii;
7742       if (chars < 0)
7743         chars = check_ascii (coding);
           /* Not all of the bytes were counted: only utf-8 gets a second
              chance, via check_utf_8; anything else goes to the general
              decoder below.  */
7744       if (chars != bytes)
7745         {
7746           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7747             chars = check_utf_8 (coding);
7748           else
7749             chars = -1;
7750         }
7751       if (chars >= 0)
7752         {
7753           if (coding->eol_seen != EOL_SEEN_NONE)
7754             adjust_coding_eol_type (coding, coding->eol_seen);
7755
7756           if (coding->eol_seen == EOL_SEEN_CR)
7757             {
                   /* Replace every '\r' by '\n' in place.  */
7758               unsigned char *src_end = GAP_END_ADDR;
7759               unsigned char *src = src_end - coding->src_bytes;
7760
7761               while (src < src_end)
7762                 {
7763                   if (*src++ == '\r')
7764                     src[-1] = '\n';
7765                 }
7766             }
7767           else if (coding->eol_seen == EOL_SEEN_CRLF)
7768             {
                   /* Copy the text backward onto itself, ending at
                      GAP_END_ADDR, and drop the '\r' of each "\r\n"
                      pair.  DIFF is the number of bytes dropped.  */
7769               unsigned char *src = GAP_END_ADDR;
7770               unsigned char *src_beg = src - coding->src_bytes;
7771               unsigned char *dst = src;
7772               ptrdiff_t diff;
7773
7774               while (src_beg < src)
7775                 {
7776                   *--dst = *--src;
7777                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7778                     src--;
7779                 }
7780               diff = dst - src;
7781               bytes -= diff;
7782               chars -= diff;
7783             }
7784           coding->produced = bytes;
7785           coding->produced_char = chars;
7786           insert_from_gap (chars, bytes, 1);
               /* Nothing has been pushed on the specpdl on this path,
                  so there is nothing to unbind.  */
7787           return;
7788         }
7789     }
       /* General case.  Have the current buffer restored when COUNT is
          unbound at the end.  */
7790   code_conversion_save (0, 0);
7791
7792   coding->mode |= CODING_MODE_LAST_BLOCK;
       /* NOTE(review): inhibit_shrinking presumably keeps the gap, which
          holds the source bytes, from being shrunk while decoding --
          confirm against the buffer text code.  */
7793   current_buffer->text->inhibit_shrinking = 1;
7794   decode_coding (coding);
7795   current_buffer->text->inhibit_shrinking = 0;
7796
7797   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7798     {
7799       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7800       Lisp_Object val;
7801
           /* Call the post-read-conversion function with point at the
              start of the decoded text and the number of decoded
              characters as its argument, then account for any change it
              made to the size of the buffer.  */
7802       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7803       val = call1 (CODING_ATTR_POST_READ (attrs),
7804                    make_number (coding->produced_char));
7805       CHECK_NATNUM (val);
7806       coding->produced_char += Z - prev_Z;
7807       coding->produced += Z_BYTE - prev_Z_BYTE;
7808     }
7809
7810   unbind_to (count, Qnil);
7811 }
7812
7813
7814 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7815    SRC_OBJECT into DST_OBJECT by coding context CODING.
7816
7817    SRC_OBJECT is a buffer, a string, or Qnil.
7818
7819    If it is a buffer, the text is at point of the buffer.  FROM and TO
7820    are positions in the buffer.
7821
7822    If it is a string, the text is at the beginning of the string.
7823    FROM and TO are indices to the string.
7824
7825    If it is nil, the text is at coding->source.  FROM and TO are
7826    indices to coding->source.
7827
7828    DST_OBJECT is a buffer, Qt, or Qnil.
7829
7830    If it is a buffer, the decoded text is inserted at point of the
7831    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7832    is deleted.
7833
7834    If it is Qt, a string is made from the decoded text, and
7835    set in CODING->dst_object.
7836
7837    If it is Qnil, the decoded text is stored at CODING->destination.
7838    The caller must allocate CODING->dst_bytes bytes at
7839    CODING->destination by xmalloc.  If the decoded text is longer than
7840    CODING->dst_bytes, CODING->destination is relocated by xrealloc.

        If the coding system has a post-read-conversion function, it is
        called on the decoded text with point at its beginning, and
        CODING->produced and CODING->produced_char are adjusted by the
        change in buffer size it caused.

        The value of Vdeactivate_mark is the same on return as on entry.
7841  */
7842
7843 void
7844 decode_coding_object (struct coding_system *coding,
7845                       Lisp_Object src_object,
7846                       ptrdiff_t from, ptrdiff_t from_byte,
7847                       ptrdiff_t to, ptrdiff_t to_byte,
7848                       Lisp_Object dst_object)
7849 {
7850   ptrdiff_t count = SPECPDL_INDEX ();
7851   unsigned char *destination IF_LINT (= NULL);
7852   ptrdiff_t dst_bytes IF_LINT (= 0);
7853   ptrdiff_t chars = to - from;
7854   ptrdiff_t bytes = to_byte - from_byte;
7855   Lisp_Object attrs;
7856   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7857   bool need_marker_adjustment = 0;
7858   Lisp_Object old_deactivate_mark;
7859
7860   old_deactivate_mark = Vdeactivate_mark;
7861
       /* Remember the caller's memory area; it is used again after
          decoding if the result has to be copied out of a work buffer.  */
7862   if (NILP (dst_object))
7863     {
7864       destination = coding->destination;
7865       dst_bytes = coding->dst_bytes;
7866     }
7867
7868   coding->src_object = src_object;
7869   coding->src_chars = chars;
7870   coding->src_bytes = bytes;
7871   coding->src_multibyte = chars < bytes;
7872
7873   if (STRINGP (src_object))
7874     {
7875       coding->src_pos = from;
7876       coding->src_pos_byte = from_byte;
7877     }
7878   else if (BUFFERP (src_object))
7879     {
7880       set_buffer_internal (XBUFFER (src_object));
7881       if (from != GPT)
7882         move_gap_both (from, from_byte);
7883       if (EQ (src_object, dst_object))
7884         {
7885           struct Lisp_Marker *tail;
7886
               /* Flag the markers that sit exactly at the edge of the
                  text about to be replaced (FROM for markers with
                  insertion_type set, TO for the others), so that they can
                  be repositioned once the decoded text is in place.  */
7887           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7888             {
7889               tail->need_adjustment
7890                 = tail->charpos == (tail->insertion_type ? from : to);
7891               need_marker_adjustment |= tail->need_adjustment;
7892             }
7893           saved_pt = PT, saved_pt_byte = PT_BYTE;
7894           TEMP_SET_PT_BOTH (from, from_byte);
               /* Delete the source text and let decode_coding read it
                  from the gap area, which is what the negative src_pos
                  set below tells it.  NOTE(review): inhibit_shrinking
                  presumably keeps the deleted bytes available in the gap
                  -- confirm.  It is reset near the end of this
                  function.  */
7895           current_buffer->text->inhibit_shrinking = 1;
7896           del_range_both (from, from_byte, to, to_byte, 1);
7897           coding->src_pos = -chars;
7898           coding->src_pos_byte = -bytes;
7899         }
7900       else
7901         {
7902           coding->src_pos = from;
7903           coding->src_pos_byte = from_byte;
7904         }
7905     }
7906
7907   if (CODING_REQUIRE_DETECTION (coding))
7908     detect_coding (coding);
7909   attrs = CODING_ID_ATTRS (coding->id);
7910
       /* Choose the destination.  A work buffer is used when a string is
          wanted, and also when the result goes to memory but a
          post-read-conversion function has to run on it first.  Every
          branch calls code_conversion_save, so the current buffer is
          restored by the unbind_to at the end.  */
7911   if (EQ (dst_object, Qt)
7912       || (! NILP (CODING_ATTR_POST_READ (attrs))
7913           && NILP (dst_object)))
7914     {
7915       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7916       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7917       coding->dst_pos = BEG;
7918       coding->dst_pos_byte = BEG_BYTE;
7919     }
7920   else if (BUFFERP (dst_object))
7921     {
7922       code_conversion_save (0, 0);
7923       coding->dst_object = dst_object;
7924       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7925       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7926       coding->dst_multibyte
7927         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7928     }
7929   else
7930     {
7931       code_conversion_save (0, 0);
7932       coding->dst_object = Qnil;
7933       /* Most callers presume this will return a multibyte result, and they
7934          won't use `binary' or `raw-text' anyway, so let's not worry about
7935          CODING_FOR_UNIBYTE.  */
7936       coding->dst_multibyte = 1;
7937     }
7938
7939   decode_coding (coding);
7940
7941   if (BUFFERP (coding->dst_object))
7942     set_buffer_internal (XBUFFER (coding->dst_object));
7943
7944   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7945     {
7946       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7947       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7948       Lisp_Object val;
7949
           /* Run the post-read-conversion function with point at the
              start of the decoded text and the number of decoded
              characters as its argument, then account for any change it
              made to the size of the buffer.  */
7950       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7951       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7952               old_deactivate_mark);
7953       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7954                         make_number (coding->produced_char));
7955       UNGCPRO;
7956       CHECK_NATNUM (val);
7957       coding->produced_char += Z - prev_Z;
7958       coding->produced += Z_BYTE - prev_Z_BYTE;
7959     }
7960
7961   if (EQ (dst_object, Qt))
7962     {
7963       coding->dst_object = Fbuffer_string ();
7964     }
7965   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7966     {
           /* The result was decoded into a work buffer because of
              post-read-conversion.  It is copied to the caller's memory
              area only when that area (DST_BYTES bytes) is smaller than
              the result; the area is enlarged first.  */
7967       set_buffer_internal (XBUFFER (coding->dst_object));
7968       if (dst_bytes < coding->produced)
7969         {
7970           eassert (coding->produced > 0);
7971           destination = xrealloc (destination, coding->produced);
               /* Make the decoded text contiguous in memory before it is
                  copied.  */
7972           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7973             move_gap_both (BEGV, BEGV_BYTE);
7974           memcpy (destination, BEGV_ADDR, coding->produced);
7975           coding->destination = destination;
7976         }
7977     }
7978
7979   if (saved_pt >= 0)
7980     {
7981       /* This is the case of:
7982          (BUFFERP (src_object) && EQ (src_object, dst_object))
7983          As we have moved PT while replacing the original buffer
7984          contents, we must recover it now.  */
7985       set_buffer_internal (XBUFFER (src_object));
7986       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the replaced text stays where it was, point
              inside it goes to FROM, and point after it is shifted by
              the change in length (counted in bytes only, when the
              buffer is unibyte).  */
7987       if (saved_pt < from)
7988         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7989       else if (saved_pt < from + chars)
7990         TEMP_SET_PT_BOTH (from, from_byte);
7991       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7992         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7993                           saved_pt_byte + (coding->produced - bytes));
7994       else
7995         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7996                           saved_pt_byte + (coding->produced - bytes));
7997
7998       if (need_marker_adjustment)
7999         {
8000           struct Lisp_Marker *tail;
8001
               /* Put the markers flagged above at the start of the new
                  text (insertion_type set) or at its end (the others).  */
8002           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8003             if (tail->need_adjustment)
8004               {
8005                 tail->need_adjustment = 0;
8006                 if (tail->insertion_type)
8007                   {
8008                     tail->bytepos = from_byte;
8009                     tail->charpos = from;
8010                   }
8011                 else
8012                   {
8013                     tail->bytepos = from_byte + coding->produced;
8014                     tail->charpos
8015                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8016                          ? tail->bytepos : from + coding->produced_char);
8017                   }
8018               }
8019         }
8020     }
8021
8022   Vdeactivate_mark = old_deactivate_mark;
8023   unbind_to (count, coding->dst_object);
8024 }
8025
8026
8027 void
8028 encode_coding_object (struct coding_system *coding,
8029                       Lisp_Object src_object,
8030                       ptrdiff_t from, ptrdiff_t from_byte,
8031                       ptrdiff_t to, ptrdiff_t to_byte,
8032                       Lisp_Object dst_object)
8033 {
8034   ptrdiff_t count = SPECPDL_INDEX ();
8035   ptrdiff_t chars = to - from;
8036   ptrdiff_t bytes = to_byte - from_byte;
8037   Lisp_Object attrs;
8038   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8039   bool need_marker_adjustment = 0;
8040   bool kill_src_buffer = 0;
8041   Lisp_Object old_deactivate_mark;
8042
8043   old_deactivate_mark = Vdeactivate_mark;
8044
8045   coding->src_object = src_object;
8046   coding->src_chars = chars;
8047   coding->src_bytes = bytes;
8048   coding->src_multibyte = chars < bytes;
8049
8050   attrs = CODING_ID_ATTRS (coding->id);
8051
8052   if (EQ (src_object, dst_object))
8053     {
8054       struct Lisp_Marker *tail;
8055
8056       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8057         {
8058           tail->need_adjustment
8059             = tail->charpos == (tail->insertion_type ? from : to);
8060           need_marker_adjustment |= tail->need_adjustment;
8061         }
8062     }
8063
8064   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8065     {
8066       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8067       set_buffer_internal (XBUFFER (coding->src_object));
8068       if (STRINGP (src_object))
8069         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8070       else if (BUFFERP (src_object))
8071         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8072       else
8073         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8074
8075       if (EQ (src_object, dst_object))
8076         {
8077           set_buffer_internal (XBUFFER (src_object));
8078           saved_pt = PT, saved_pt_byte = PT_BYTE;
8079           del_range_both (from, from_byte, to, to_byte, 1);
8080           set_buffer_internal (XBUFFER (coding->src_object));
8081         }
8082
8083       {
8084         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8085
8086         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8087                 old_deactivate_mark);
8088         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8089                     make_number (BEG), make_number (Z));
8090         UNGCPRO;
8091       }
8092       if (XBUFFER (coding->src_object) != current_buffer)
8093         kill_src_buffer = 1;
8094       coding->src_object = Fcurrent_buffer ();
8095       if (BEG != GPT)
8096         move_gap_both (BEG, BEG_BYTE);
8097       coding->src_chars = Z - BEG;
8098       coding->src_bytes = Z_BYTE - BEG_BYTE;
8099       coding->src_pos = BEG;
8100       coding->src_pos_byte = BEG_BYTE;
8101       coding->src_multibyte = Z < Z_BYTE;
8102     }
8103   else if (STRINGP (src_object))
8104     {
8105       code_conversion_save (0, 0);
8106       coding->src_pos = from;
8107       coding->src_pos_byte = from_byte;
8108     }
8109   else if (BUFFERP (src_object))
8110     {
8111       code_conversion_save (0, 0);
8112       set_buffer_internal (XBUFFER (src_object));
8113       if (EQ (src_object, dst_object))
8114         {
8115           saved_pt = PT, saved_pt_byte = PT_BYTE;
8116           coding->src_object = del_range_1 (from, to, 1, 1);
8117           coding->src_pos = 0;
8118           coding->src_pos_byte = 0;
8119         }
8120       else
8121         {
8122           if (from < GPT && to >= GPT)
8123             move_gap_both (from, from_byte);
8124           coding->src_pos = from;
8125           coding->src_pos_byte = from_byte;
8126         }
8127     }
8128   else
8129     code_conversion_save (0, 0);
8130
8131   if (BUFFERP (dst_object))
8132     {
8133       coding->dst_object = dst_object;
8134       if (EQ (src_object, dst_object))
8135         {
8136           coding->dst_pos = from;
8137           coding->dst_pos_byte = from_byte;
8138         }
8139       else
8140         {
8141           struct buffer *current = current_buffer;
8142
8143           set_buffer_temp (XBUFFER (dst_object));
8144           coding->dst_pos = PT;
8145           coding->dst_pos_byte = PT_BYTE;
8146           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8147           set_buffer_temp (current);
8148         }
8149       coding->dst_multibyte
8150         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8151     }
8152   else if (EQ (dst_object, Qt))
8153     {
8154       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8155       coding->dst_object = Qnil;
8156       coding->destination = xmalloc (dst_bytes);
8157       coding->dst_bytes = dst_bytes;
8158       coding->dst_multibyte = 0;
8159     }
8160   else
8161     {
8162       coding->dst_object = Qnil;
8163       coding->dst_multibyte = 0;
8164     }
8165
8166   encode_coding (coding);
8167
8168   if (EQ (dst_object, Qt))
8169     {
8170       if (BUFFERP (coding->dst_object))
8171         coding->dst_object = Fbuffer_string ();
8172       else
8173         {
8174           coding->dst_object
8175             = make_unibyte_string ((char *) coding->destination,
8176                                    coding->produced);
8177           xfree (coding->destination);
8178         }
8179     }
8180
8181   if (saved_pt >= 0)
8182     {
8183       /* This is the case of:
8184          (BUFFERP (src_object) && EQ (src_object, dst_object))
8185          As we have moved PT while replacing the original buffer
8186          contents, we must recover it now.  */
8187       set_buffer_internal (XBUFFER (src_object));
8188       if (saved_pt < from)
8189         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8190       else if (saved_pt < from + chars)
8191         TEMP_SET_PT_BOTH (from, from_byte);
8192       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8193         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8194                           saved_pt_byte + (coding->produced - bytes));
8195       else
8196         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8197                           saved_pt_byte + (coding->produced - bytes));
8198
8199       if (need_marker_adjustment)
8200         {
8201           struct Lisp_Marker *tail;
8202
8203           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8204             if (tail->need_adjustment)
8205               {
8206                 tail->need_adjustment = 0;
8207                 if (tail->insertion_type)
8208                   {
8209                     tail->bytepos = from_byte;
8210                     tail->charpos = from;
8211                   }
8212                 else
8213                   {
8214                     tail->bytepos = from_byte + coding->produced;
8215                     tail->charpos
8216                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8217                          ? tail->bytepos : from + coding->produced_char);
8218                   }
8219               }
8220         }
8221     }
8222
8223   if (kill_src_buffer)
8224     Fkill_buffer (coding->src_object);
8225
8226   Vdeactivate_mark = old_deactivate_mark;
8227   unbind_to (count, Qnil);
8228 }
8229
8230
8231 Lisp_Object
8232 preferred_coding_system (void)
8233 {
8234   int id = coding_categories[coding_priorities[0]].id;
8235
8236   return CODING_ID_NAME (id);
8237 }
8238
8239 #if defined (WINDOWSNT) || defined (CYGWIN)
8240
8241 Lisp_Object
8242 from_unicode (Lisp_Object str)
8243 {
8244   CHECK_STRING (str);
8245   if (!STRING_MULTIBYTE (str) &&
8246       SBYTES (str) & 1)
8247     {
8248       str = Fsubstring (str, make_number (0), make_number (-1));
8249     }
8250
8251   return code_convert_string_norecord (str, Qutf_16le, 0);
8252 }
8253
8254 Lisp_Object
8255 from_unicode_buffer (const wchar_t* wstr)
8256 {
8257     return from_unicode (
8258         make_unibyte_string (
8259             (char*) wstr,
8260             /* we get one of the two final 0 bytes for free. */
8261             1 + sizeof (wchar_t) * wcslen (wstr)));
8262 }
8263
8264 wchar_t *
8265 to_unicode (Lisp_Object str, Lisp_Object *buf)
8266 {
8267   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8268   /* We need to make another copy (in addition to the one made by
8269      code_convert_string_norecord) to ensure that the final string is
8270      _doubly_ zero terminated --- that is, that the string is
8271      terminated by two zero bytes and one utf-16le null character.
8272      Because strings are already terminated with a single zero byte,
8273      we just add one additional zero. */
8274   str = make_uninit_string (SBYTES (*buf) + 1);
8275   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8276   SDATA (str) [SBYTES (*buf)] = '\0';
8277   *buf = str;
8278   return WCSDATA (*buf);
8279 }
8280
8281 #endif /* WINDOWSNT || CYGWIN */
8282
8283 \f
8284 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8286
8287 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8288        doc: /* Return t if OBJECT is nil or a coding-system.
8289 See the documentation of `define-coding-system' for information
8290 about coding-system objects.  */)
8291   (Lisp_Object object)
8292 {
8293   if (NILP (object)
8294       || CODING_SYSTEM_ID (object) >= 0)
8295     return Qt;
8296   if (! SYMBOLP (object)
8297       || NILP (Fget (object, Qcoding_system_define_form)))
8298     return Qnil;
8299   return Qt;
8300 }
8301
8302 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8303        Sread_non_nil_coding_system, 1, 1, 0,
8304        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8305   (Lisp_Object prompt)
8306 {
8307   Lisp_Object val;
8308   do
8309     {
8310       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8311                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8312     }
8313   while (SCHARS (val) == 0);
8314   return (Fintern (val, Qnil));
8315 }
8316
8317 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8318        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8319 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8320 Ignores case when completing coding systems (all Emacs coding systems
8321 are lower-case).  */)
8322   (Lisp_Object prompt, Lisp_Object default_coding_system)
8323 {
8324   Lisp_Object val;
8325   ptrdiff_t count = SPECPDL_INDEX ();
8326
8327   if (SYMBOLP (default_coding_system))
8328     default_coding_system = SYMBOL_NAME (default_coding_system);
8329   specbind (Qcompletion_ignore_case, Qt);
8330   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8331                           Qt, Qnil, Qcoding_system_history,
8332                           default_coding_system, Qnil);
8333   unbind_to (count, Qnil);
8334   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8335 }
8336
8337 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8338        1, 1, 0,
8339        doc: /* Check validity of CODING-SYSTEM.
8340 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8341 It is valid if it is nil or a symbol defined as a coding system by the
8342 function `define-coding-system'.  */)
8343   (Lisp_Object coding_system)
8344 {
8345   Lisp_Object define_form;
8346
8347   define_form = Fget (coding_system, Qcoding_system_define_form);
8348   if (! NILP (define_form))
8349     {
8350       Fput (coding_system, Qcoding_system_define_form, Qnil);
8351       safe_eval (define_form);
8352     }
8353   if (!NILP (Fcoding_system_p (coding_system)))
8354     return coding_system;
8355   xsignal1 (Qcoding_system_error, coding_system);
8356 }
8357
8358 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.
   SRC_CHARS is recorded in the working coding structure as the
   character count of that text.  If HIGHEST, return the coding
   system of the highest priority among the detected coding systems
   (or nil if the list of candidates is empty).  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP, it is assumed that the bytes are in correct
   multibyte form but contains only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.  */

Lisp_Object
detect_coding_system (const unsigned char *src,
                      ptrdiff_t src_chars, ptrdiff_t src_bytes,
                      bool highest, bool multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* List of candidates.  It holds coding-system ids until the
     eol-format pass at the end replaces each id by a name.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  bool null_byte_found = 0, eight_bit_found = 0;

  /* nil means `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  /* Describe the text to examine in CODING; the detectors called
     below receive it through this structure.  */
  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category IF_LINT (= 0);
      struct coding_system *this IF_LINT (= NULL);
      int c, i;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         coding.head_ascii counts the bytes passed over before the
         first eight-bit byte is seen.  The scan stops early once both
         a null byte and an eight-bit byte have been seen, or when the
         ISO-2022 detector reports that it scanned everything.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is tried only once: it is
                 skipped as soon as detect_info.checked is nonzero.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      /* Reject every category outside the
                         ISO-escape mask.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      /* Run the per-category detectors unless the text turned out to
         be plain ASCII with nothing found so far.  */
      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, mark everything outside
                     the UTF-16 mask as already checked and rejected,
                     so only the UTF-16 categories remain to be
                     tried.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Try the categories in priority order.  When HIGHEST,
                 stop at the first one found; CATEGORY and THIS are
                 then used below to build the result.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector
                         call.  */
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For utf-16-auto, narrow CATEGORY to the
                         endianness that was found.  Note that THIS is
                         left pointing at the utf-16-auto entry.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      /* Turn the detection masks into the list VAL of ids.  */
      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected, or a null byte was seen: answer
             no-conversion.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = Fcons (make_number (id), Qnil);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing rejected and nothing found: answer with the
             coding system of the `undecided' category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = Fcons (make_number (id), Qnil);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS are the ones the loops above
                 stopped at.  */
              detect_info.found = 1 << category;
              val = Fcons (make_number (this->id), Qnil);
            }
          else
            /* Nothing was positively found: take the first category,
               in priority order, that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = Fcons (make_number (id), Qnil);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Both loops walk the priorities backwards and push onto
             the front of VAL, so each group ends up in priority
             order.  First the categories that were neither rejected
             nor found ...  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = Fcons (make_number (id), val);
                }
            }
          /* ... then the categories that were found, which therefore
             come first in the list.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide between UTF-8 with and without signature.  VAL
         stays nil if the detector returns zero.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide among the UTF-16 variants.  VAL stays nil if the
         detector returns zero.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else
    {
      /* The text-format is already specified; keep it as the only
         candidate.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = Fcons (make_number (coding.id), Qnil);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 is the initial value of each of these; only the three
       EOL_SEEN_* values tested in the loop below select a subsidiary
       coding system.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol-format of the requested coding system is a vector,
           so it has to be detected from the text, separately for
           non-UTF-16, UTF-16 BE and UTF-16 LE candidates.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format was given explicitly; use it for every
           candidate.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each id in VAL, in place, by a coding-system name: the
       subsidiary matching the eol-format when the candidate's own
       eol-type is a vector and an eol-format is known, otherwise the
       name recorded for the id.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            /* Elements 0, 1 and 2 of the vector are used for LF,
               CRLF and CR respectively.  */
            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8679
8680
8681 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8682        2, 3, 0,
8683        doc: /* Detect coding system of the text in the region between START and END.
8684 Return a list of possible coding systems ordered by priority.
8685 The coding systems to try and their priorities follows what
8686 the function `coding-system-priority-list' (which see) returns.
8687
8688 If only ASCII characters are found (except for such ISO-2022 control
8689 characters as ESC), it returns a list of single element `undecided'
8690 or its subsidiary coding system according to a detected end-of-line
8691 format.
8692
8693 If optional argument HIGHEST is non-nil, return the coding system of
8694 highest priority.  */)
8695   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8696 {
8697   ptrdiff_t from, to;
8698   ptrdiff_t from_byte, to_byte;
8699
8700   validate_region (&start, &end);
8701   from = XINT (start), to = XINT (end);
8702   from_byte = CHAR_TO_BYTE (from);
8703   to_byte = CHAR_TO_BYTE (to);
8704
8705   if (from < GPT && to >= GPT)
8706     move_gap_both (to, to_byte);
8707
8708   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8709                                to - from, to_byte - from_byte,
8710                                !NILP (highest),
8711                                !NILP (BVAR (current_buffer
8712                                       , enable_multibyte_characters)),
8713                                Qnil);
8714 }
8715
8716 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8717        1, 2, 0,
8718        doc: /* Detect coding system of the text in STRING.
8719 Return a list of possible coding systems ordered by priority.
8720 The coding systems to try and their priorities follows what
8721 the function `coding-system-priority-list' (which see) returns.
8722
8723 If only ASCII characters are found (except for such ISO-2022 control
8724 characters as ESC), it returns a list of single element `undecided'
8725 or its subsidiary coding system according to a detected end-of-line
8726 format.
8727
8728 If optional argument HIGHEST is non-nil, return the coding system of
8729 highest priority.  */)
8730   (Lisp_Object string, Lisp_Object highest)
8731 {
8732   CHECK_STRING (string);
8733
8734   return detect_coding_system (SDATA (string),
8735                                SCHARS (string), SBYTES (string),
8736                                !NILP (highest), STRING_MULTIBYTE (string),
8737                                Qnil);
8738 }
8739
8740
8741 static bool
8742 char_encodable_p (int c, Lisp_Object attrs)
8743 {
8744   Lisp_Object tail;
8745   struct charset *charset;
8746   Lisp_Object translation_table;
8747
8748   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8749   if (! NILP (translation_table))
8750     c = translate_char (translation_table, c);
8751   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8752        CONSP (tail); tail = XCDR (tail))
8753     {
8754       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8755       if (CHAR_CHARSET_P (c, charset))
8756         break;
8757     }
8758   return (! NILP (tail));
8759 }
8760
8761
8762 /* Return a list of coding systems that safely encode the text between
8763    START and END.  If EXCLUDE is non-nil, it is a list of coding
8764    systems not to check.  The returned list doesn't contain any such
8765    coding systems.  In any case, if the text contains only ASCII or is
8766    unibyte, return t.  */
8767
8768 DEFUN ("find-coding-systems-region-internal",
8769        Ffind_coding_systems_region_internal,
8770        Sfind_coding_systems_region_internal, 2, 3, 0,
8771        doc: /* Internal use only.  */)
8772   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8773 {
8774   Lisp_Object coding_attrs_list, safe_codings;
8775   ptrdiff_t start_byte, end_byte;
8776   const unsigned char *p, *pbeg, *pend;
8777   int c;
8778   Lisp_Object tail, elt, work_table;
8779
8780   if (STRINGP (start))
8781     {
8782       if (!STRING_MULTIBYTE (start)
8783           || SCHARS (start) == SBYTES (start))
8784         return Qt;
8785       start_byte = 0;
8786       end_byte = SBYTES (start);
8787     }
8788   else
8789     {
8790       CHECK_NUMBER_COERCE_MARKER (start);
8791       CHECK_NUMBER_COERCE_MARKER (end);
8792       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8793         args_out_of_range (start, end);
8794       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8795         return Qt;
8796       start_byte = CHAR_TO_BYTE (XINT (start));
8797       end_byte = CHAR_TO_BYTE (XINT (end));
8798       if (XINT (end) - XINT (start) == end_byte - start_byte)
8799         return Qt;
8800
8801       if (XINT (start) < GPT && XINT (end) > GPT)
8802         {
8803           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8804             move_gap_both (XINT (start), start_byte);
8805           else
8806             move_gap_both (XINT (end), end_byte);
8807         }
8808     }
8809
8810   coding_attrs_list = Qnil;
8811   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8812     if (NILP (exclude)
8813         || NILP (Fmemq (XCAR (tail), exclude)))
8814       {
8815         Lisp_Object attrs;
8816
8817         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8818         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8819             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8820           {
8821             ASET (attrs, coding_attr_trans_tbl,
8822                   get_translation_table (attrs, 1, NULL));
8823             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8824           }
8825       }
8826
8827   if (STRINGP (start))
8828     p = pbeg = SDATA (start);
8829   else
8830     p = pbeg = BYTE_POS_ADDR (start_byte);
8831   pend = p + (end_byte - start_byte);
8832
8833   while (p < pend && ASCII_BYTE_P (*p)) p++;
8834   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8835
8836   work_table = Fmake_char_table (Qnil, Qnil);
8837   while (p < pend)
8838     {
8839       if (ASCII_BYTE_P (*p))
8840         p++;
8841       else
8842         {
8843           c = STRING_CHAR_ADVANCE (p);
8844           if (!NILP (char_table_ref (work_table, c)))
8845             /* This character was already checked.  Ignore it.  */
8846             continue;
8847
8848           charset_map_loaded = 0;
8849           for (tail = coding_attrs_list; CONSP (tail);)
8850             {
8851               elt = XCAR (tail);
8852               if (NILP (elt))
8853                 tail = XCDR (tail);
8854               else if (char_encodable_p (c, elt))
8855                 tail = XCDR (tail);
8856               else if (CONSP (XCDR (tail)))
8857                 {
8858                   XSETCAR (tail, XCAR (XCDR (tail)));
8859                   XSETCDR (tail, XCDR (XCDR (tail)));
8860                 }
8861               else
8862                 {
8863                   XSETCAR (tail, Qnil);
8864                   tail = XCDR (tail);
8865                 }
8866             }
8867           if (charset_map_loaded)
8868             {
8869               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8870
8871               if (STRINGP (start))
8872                 pbeg = SDATA (start);
8873               else
8874                 pbeg = BYTE_POS_ADDR (start_byte);
8875               p = pbeg + p_offset;
8876               pend = pbeg + pend_offset;
8877             }
8878           char_table_set (work_table, c, Qt);
8879         }
8880     }
8881
8882   safe_codings = list2 (Qraw_text, Qno_conversion);
8883   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8884     if (! NILP (XCAR (tail)))
8885       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8886
8887   return safe_codings;
8888 }
8889
8890
8891 DEFUN ("unencodable-char-position", Funencodable_char_position,
8892        Sunencodable_char_position, 3, 5, 0,
8893        doc: /*
8894 Return position of first un-encodable character in a region.
8895 START and END specify the region and CODING-SYSTEM specifies the
8896 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8897
8898 If optional 4th argument COUNT is non-nil, it specifies at most how
8899 many un-encodable characters to search.  In this case, the value is a
8900 list of positions.
8901
8902 If optional 5th argument STRING is non-nil, it is a string to search
8903 for un-encodable characters.  In that case, START and END are indexes
8904 to the string.  */)
8905   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8906 {
8907   EMACS_INT n;
8908   struct coding_system coding;
8909   Lisp_Object attrs, charset_list, translation_table;
8910   Lisp_Object positions;
8911   ptrdiff_t from, to;
8912   const unsigned char *p, *stop, *pend;
8913   bool ascii_compatible;
8914
8915   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8916   attrs = CODING_ID_ATTRS (coding.id);
8917   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8918     return Qnil;
8919   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8920   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8921   translation_table = get_translation_table (attrs, 1, NULL);
8922
8923   if (NILP (string))
8924     {
8925       validate_region (&start, &end);
8926       from = XINT (start);
8927       to = XINT (end);
8928       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8929           || (ascii_compatible
8930               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8931         return Qnil;
8932       p = CHAR_POS_ADDR (from);
8933       pend = CHAR_POS_ADDR (to);
8934       if (from < GPT && to >= GPT)
8935         stop = GPT_ADDR;
8936       else
8937         stop = pend;
8938     }
8939   else
8940     {
8941       CHECK_STRING (string);
8942       CHECK_NATNUM (start);
8943       CHECK_NATNUM (end);
8944       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8945         args_out_of_range_3 (string, start, end);
8946       from = XINT (start);
8947       to = XINT (end);
8948       if (! STRING_MULTIBYTE (string))
8949         return Qnil;
8950       p = SDATA (string) + string_char_to_byte (string, from);
8951       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8952       if (ascii_compatible && (to - from) == (pend - p))
8953         return Qnil;
8954     }
8955
8956   if (NILP (count))
8957     n = 1;
8958   else
8959     {
8960       CHECK_NATNUM (count);
8961       n = XINT (count);
8962     }
8963
8964   positions = Qnil;
8965   charset_map_loaded = 0;
8966   while (1)
8967     {
8968       int c;
8969
8970       if (ascii_compatible)
8971         while (p < stop && ASCII_BYTE_P (*p))
8972           p++, from++;
8973       if (p >= stop)
8974         {
8975           if (p >= pend)
8976             break;
8977           stop = pend;
8978           p = GAP_END_ADDR;
8979         }
8980
8981       c = STRING_CHAR_ADVANCE (p);
8982       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8983           && ! char_charset (translate_char (translation_table, c),
8984                              charset_list, NULL))
8985         {
8986           positions = Fcons (make_number (from), positions);
8987           n--;
8988           if (n == 0)
8989             break;
8990         }
8991
8992       from++;
8993       if (charset_map_loaded && NILP (string))
8994         {
8995           p = CHAR_POS_ADDR (from);
8996           pend = CHAR_POS_ADDR (to);
8997           if (from < GPT && to >= GPT)
8998             stop = GPT_ADDR;
8999           else
9000             stop = pend;
9001           charset_map_loaded = 0;
9002         }
9003     }
9004
9005   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9006 }
9007
9008
9009 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9010        Scheck_coding_systems_region, 3, 3, 0,
9011        doc: /* Check if the region is encodable by coding systems.
9012
9013 START and END are buffer positions specifying the region.
9014 CODING-SYSTEM-LIST is a list of coding systems to check.
9015
9016 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9017 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9018 whole region, POS0, POS1, ... are buffer positions where non-encodable
9019 characters are found.
9020
9021 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9022 value is nil.
9023
9024 START may be a string.  In that case, check if the string is
9025 encodable, and the value contains indices to the string instead of
9026 buffer positions.  END is ignored.
9027
9028 If the current buffer (or START if it is a string) is unibyte, the value
9029 is nil.  */)
9030   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9031 {
9032   Lisp_Object list;
9033   ptrdiff_t start_byte, end_byte;
9034   ptrdiff_t pos;
9035   const unsigned char *p, *pbeg, *pend;
9036   int c;
9037   Lisp_Object tail, elt, attrs;
9038
9039   if (STRINGP (start))
9040     {
9041       if (!STRING_MULTIBYTE (start)
9042           || SCHARS (start) == SBYTES (start))
9043         return Qnil;
9044       start_byte = 0;
9045       end_byte = SBYTES (start);
9046       pos = 0;
9047     }
9048   else
9049     {
9050       CHECK_NUMBER_COERCE_MARKER (start);
9051       CHECK_NUMBER_COERCE_MARKER (end);
9052       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9053         args_out_of_range (start, end);
9054       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9055         return Qnil;
9056       start_byte = CHAR_TO_BYTE (XINT (start));
9057       end_byte = CHAR_TO_BYTE (XINT (end));
9058       if (XINT (end) - XINT (start) == end_byte - start_byte)
9059         return Qnil;
9060
9061       if (XINT (start) < GPT && XINT (end) > GPT)
9062         {
9063           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9064             move_gap_both (XINT (start), start_byte);
9065           else
9066             move_gap_both (XINT (end), end_byte);
9067         }
9068       pos = XINT (start);
9069     }
9070
9071   list = Qnil;
9072   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9073     {
9074       elt = XCAR (tail);
9075       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9076       ASET (attrs, coding_attr_trans_tbl,
9077             get_translation_table (attrs, 1, NULL));
9078       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9079     }
9080
9081   if (STRINGP (start))
9082     p = pbeg = SDATA (start);
9083   else
9084     p = pbeg = BYTE_POS_ADDR (start_byte);
9085   pend = p + (end_byte - start_byte);
9086
9087   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9088   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9089
9090   while (p < pend)
9091     {
9092       if (ASCII_BYTE_P (*p))
9093         p++;
9094       else
9095         {
9096           c = STRING_CHAR_ADVANCE (p);
9097
9098           charset_map_loaded = 0;
9099           for (tail = list; CONSP (tail); tail = XCDR (tail))
9100             {
9101               elt = XCDR (XCAR (tail));
9102               if (! char_encodable_p (c, XCAR (elt)))
9103                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9104             }
9105           if (charset_map_loaded)
9106             {
9107               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9108
9109               if (STRINGP (start))
9110                 pbeg = SDATA (start);
9111               else
9112                 pbeg = BYTE_POS_ADDR (start_byte);
9113               p = pbeg + p_offset;
9114               pend = pbeg + pend_offset;
9115             }
9116         }
9117       pos++;
9118     }
9119
9120   tail = list;
9121   list = Qnil;
9122   for (; CONSP (tail); tail = XCDR (tail))
9123     {
9124       elt = XCAR (tail);
9125       if (CONSP (XCDR (XCDR (elt))))
9126         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9127                       list);
9128     }
9129
9130   return list;
9131 }
9132
9133
9134 static Lisp_Object
9135 code_convert_region (Lisp_Object start, Lisp_Object end,
9136                      Lisp_Object coding_system, Lisp_Object dst_object,
9137                      bool encodep, bool norecord)
9138 {
9139   struct coding_system coding;
9140   ptrdiff_t from, from_byte, to, to_byte;
9141   Lisp_Object src_object;
9142
9143   if (NILP (coding_system))
9144     coding_system = Qno_conversion;
9145   else
9146     CHECK_CODING_SYSTEM (coding_system);
9147   src_object = Fcurrent_buffer ();
9148   if (NILP (dst_object))
9149     dst_object = src_object;
9150   else if (! EQ (dst_object, Qt))
9151     CHECK_BUFFER (dst_object);
9152
9153   validate_region (&start, &end);
9154   from = XFASTINT (start);
9155   from_byte = CHAR_TO_BYTE (from);
9156   to = XFASTINT (end);
9157   to_byte = CHAR_TO_BYTE (to);
9158
9159   setup_coding_system (coding_system, &coding);
9160   coding.mode |= CODING_MODE_LAST_BLOCK;
9161
9162   if (encodep)
9163     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9164                           dst_object);
9165   else
9166     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9167                           dst_object);
9168   if (! norecord)
9169     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9170
9171   return (BUFFERP (dst_object)
9172           ? make_number (coding.produced_char)
9173           : coding.dst_object);
9174 }
9175
9176
9177 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9178        3, 4, "r\nzCoding system: ",
9179        doc: /* Decode the current region from the specified coding system.
9180 When called from a program, takes four arguments:
9181         START, END, CODING-SYSTEM, and DESTINATION.
9182 START and END are buffer positions.
9183
9184 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9185 If nil, the region between START and END is replaced by the decoded text.
9186 If buffer, the decoded text is inserted in that buffer after point (point
9187 does not move).
9188 In those cases, the length of the decoded text is returned.
9189 If DESTINATION is t, the decoded text is returned.
9190
9191 This function sets `last-coding-system-used' to the precise coding system
9192 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9193 not fully specified.)  */)
9194   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9195 {
9196   return code_convert_region (start, end, coding_system, destination, 0, 0);
9197 }
9198
9199 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9200        3, 4, "r\nzCoding system: ",
9201        doc: /* Encode the current region by specified coding system.
9202 When called from a program, takes four arguments:
9203         START, END, CODING-SYSTEM and DESTINATION.
9204 START and END are buffer positions.
9205
9206 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9207 If nil, the region between START and END is replace by the encoded text.
9208 If buffer, the encoded text is inserted in that buffer after point (point
9209 does not move).
9210 In those cases, the length of the encoded text is returned.
9211 If DESTINATION is t, the encoded text is returned.
9212
9213 This function sets `last-coding-system-used' to the precise coding system
9214 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9215 not fully specified.)  */)
9216   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9217 {
9218   return code_convert_region (start, end, coding_system, destination, 1, 0);
9219 }
9220
9221 Lisp_Object
9222 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9223                      Lisp_Object dst_object, bool encodep, bool nocopy,
9224                      bool norecord)
9225 {
9226   struct coding_system coding;
9227   ptrdiff_t chars, bytes;
9228
9229   CHECK_STRING (string);
9230   if (NILP (coding_system))
9231     {
9232       if (! norecord)
9233         Vlast_coding_system_used = Qno_conversion;
9234       if (NILP (dst_object))
9235         return (nocopy ? Fcopy_sequence (string) : string);
9236     }
9237
9238   if (NILP (coding_system))
9239     coding_system = Qno_conversion;
9240   else
9241     CHECK_CODING_SYSTEM (coding_system);
9242   if (NILP (dst_object))
9243     dst_object = Qt;
9244   else if (! EQ (dst_object, Qt))
9245     CHECK_BUFFER (dst_object);
9246
9247   setup_coding_system (coding_system, &coding);
9248   coding.mode |= CODING_MODE_LAST_BLOCK;
9249   chars = SCHARS (string);
9250   bytes = SBYTES (string);
9251   if (encodep)
9252     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9253   else
9254     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9255   if (! norecord)
9256     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9257
9258   return (BUFFERP (dst_object)
9259           ? make_number (coding.produced_char)
9260           : coding.dst_object);
9261 }
9262
9263
9264 /* Encode or decode STRING according to CODING_SYSTEM.
9265    Do not set Vlast_coding_system_used.
9266
9267    This function is called only from macros DECODE_FILE and
9268    ENCODE_FILE, thus we ignore character composition.  */
9269
9270 Lisp_Object
9271 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9272                               bool encodep)
9273 {
9274   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9275 }
9276
9277
9278 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9279        2, 4, 0,
9280        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9281
9282 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9283 if the decoding operation is trivial.
9284
9285 Optional fourth arg BUFFER non-nil means that the decoded text is
9286 inserted in that buffer after point (point does not move).  In this
9287 case, the return value is the length of the decoded text.
9288
9289 This function sets `last-coding-system-used' to the precise coding system
9290 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9291 not fully specified.)  */)
9292   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9293 {
9294   return code_convert_string (string, coding_system, buffer,
9295                               0, ! NILP (nocopy), 0);
9296 }
9297
9298 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9299        2, 4, 0,
9300        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9301
9302 Optional third arg NOCOPY non-nil means it is OK to return STRING
9303 itself if the encoding operation is trivial.
9304
9305 Optional fourth arg BUFFER non-nil means that the encoded text is
9306 inserted in that buffer after point (point does not move).  In this
9307 case, the return value is the length of the encoded text.
9308
9309 This function sets `last-coding-system-used' to the precise coding system
9310 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9311 not fully specified.)  */)
9312   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9313 {
9314   return code_convert_string (string, coding_system, buffer,
9315                               1, ! NILP (nocopy), 0);
9316 }
9317
9318 \f
9319 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9320        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9321 Return the corresponding character.  */)
9322   (Lisp_Object code)
9323 {
9324   Lisp_Object spec, attrs, val;
9325   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9326   EMACS_INT ch;
9327   int c;
9328
9329   CHECK_NATNUM (code);
9330   ch = XFASTINT (code);
9331   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9332   attrs = AREF (spec, 0);
9333
9334   if (ASCII_BYTE_P (ch)
9335       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9336     return code;
9337
9338   val = CODING_ATTR_CHARSET_LIST (attrs);
9339   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9340   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9341   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9342
9343   if (ch <= 0x7F)
9344     {
9345       c = ch;
9346       charset = charset_roman;
9347     }
9348   else if (ch >= 0xA0 && ch < 0xDF)
9349     {
9350       c = ch - 0x80;
9351       charset = charset_kana;
9352     }
9353   else
9354     {
9355       EMACS_INT c1 = ch >> 8;
9356       int c2 = ch & 0xFF;
9357
9358       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9359           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9360         error ("Invalid code: %"pI"d", ch);
9361       c = ch;
9362       SJIS_TO_JIS (c);
9363       charset = charset_kanji;
9364     }
9365   c = DECODE_CHAR (charset, c);
9366   if (c < 0)
9367     error ("Invalid code: %"pI"d", ch);
9368   return make_number (c);
9369 }
9370
9371
9372 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9373        doc: /* Encode a Japanese character CH to shift_jis encoding.
9374 Return the corresponding code in SJIS.  */)
9375   (Lisp_Object ch)
9376 {
9377   Lisp_Object spec, attrs, charset_list;
9378   int c;
9379   struct charset *charset;
9380   unsigned code;
9381
9382   CHECK_CHARACTER (ch);
9383   c = XFASTINT (ch);
9384   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9385   attrs = AREF (spec, 0);
9386
9387   if (ASCII_CHAR_P (c)
9388       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9389     return ch;
9390
9391   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9392   charset = char_charset (c, charset_list, &code);
9393   if (code == CHARSET_INVALID_CODE (charset))
9394     error ("Can't encode by shift_jis encoding: %c", c);
9395   JIS_TO_SJIS (code);
9396
9397   return make_number (code);
9398 }
9399
9400 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9401        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9402 Return the corresponding character.  */)
9403   (Lisp_Object code)
9404 {
9405   Lisp_Object spec, attrs, val;
9406   struct charset *charset_roman, *charset_big5, *charset;
9407   EMACS_INT ch;
9408   int c;
9409
9410   CHECK_NATNUM (code);
9411   ch = XFASTINT (code);
9412   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9413   attrs = AREF (spec, 0);
9414
9415   if (ASCII_BYTE_P (ch)
9416       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9417     return code;
9418
9419   val = CODING_ATTR_CHARSET_LIST (attrs);
9420   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9421   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9422
9423   if (ch <= 0x7F)
9424     {
9425       c = ch;
9426       charset = charset_roman;
9427     }
9428   else
9429     {
9430       EMACS_INT b1 = ch >> 8;
9431       int b2 = ch & 0x7F;
9432       if (b1 < 0xA1 || b1 > 0xFE
9433           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9434         error ("Invalid code: %"pI"d", ch);
9435       c = ch;
9436       charset = charset_big5;
9437     }
9438   c = DECODE_CHAR (charset, c);
9439   if (c < 0)
9440     error ("Invalid code: %"pI"d", ch);
9441   return make_number (c);
9442 }
9443
9444 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9445        doc: /* Encode the Big5 character CH to BIG5 coding system.
9446 Return the corresponding character code in Big5.  */)
9447   (Lisp_Object ch)
9448 {
9449   Lisp_Object spec, attrs, charset_list;
9450   struct charset *charset;
9451   int c;
9452   unsigned code;
9453
9454   CHECK_CHARACTER (ch);
9455   c = XFASTINT (ch);
9456   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9457   attrs = AREF (spec, 0);
9458   if (ASCII_CHAR_P (c)
9459       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9460     return ch;
9461
9462   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9463   charset = char_charset (c, charset_list, &code);
9464   if (code == CHARSET_INVALID_CODE (charset))
9465     error ("Can't encode by Big5 encoding: %c", c);
9466
9467   return make_number (code);
9468 }
9469
9470 \f
9471 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9472        Sset_terminal_coding_system_internal, 1, 2, 0,
9473        doc: /* Internal use only.  */)
9474   (Lisp_Object coding_system, Lisp_Object terminal)
9475 {
9476   struct terminal *term = get_terminal (terminal, 1);
9477   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9478   CHECK_SYMBOL (coding_system);
9479   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9480   /* We had better not send unsafe characters to terminal.  */
9481   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9482   /* Character composition should be disabled.  */
9483   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9484   terminal_coding->src_multibyte = 1;
9485   terminal_coding->dst_multibyte = 0;
9486   tset_charset_list
9487     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9488             ? coding_charset_list (terminal_coding)
9489             : Fcons (make_number (charset_ascii), Qnil)));
9490   return Qnil;
9491 }
9492
9493 DEFUN ("set-safe-terminal-coding-system-internal",
9494        Fset_safe_terminal_coding_system_internal,
9495        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9496        doc: /* Internal use only.  */)
9497   (Lisp_Object coding_system)
9498 {
9499   CHECK_SYMBOL (coding_system);
9500   setup_coding_system (Fcheck_coding_system (coding_system),
9501                        &safe_terminal_coding);
9502   /* Character composition should be disabled.  */
9503   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9504   safe_terminal_coding.src_multibyte = 1;
9505   safe_terminal_coding.dst_multibyte = 0;
9506   return Qnil;
9507 }
9508
9509 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9510        Sterminal_coding_system, 0, 1, 0,
9511        doc: /* Return coding system specified for terminal output on the given terminal.
9512 TERMINAL may be a terminal object, a frame, or nil for the selected
9513 frame's terminal device.  */)
9514   (Lisp_Object terminal)
9515 {
9516   struct coding_system *terminal_coding
9517     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9518   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9519
9520   /* For backward compatibility, return nil if it is `undecided'.  */
9521   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9522 }
9523
9524 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9525        Sset_keyboard_coding_system_internal, 1, 2, 0,
9526        doc: /* Internal use only.  */)
9527   (Lisp_Object coding_system, Lisp_Object terminal)
9528 {
9529   struct terminal *t = get_terminal (terminal, 1);
9530   CHECK_SYMBOL (coding_system);
9531   if (NILP (coding_system))
9532     coding_system = Qno_conversion;
9533   else
9534     Fcheck_coding_system (coding_system);
9535   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9536   /* Character composition should be disabled.  */
9537   TERMINAL_KEYBOARD_CODING (t)->common_flags
9538     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9539   return Qnil;
9540 }
9541
9542 DEFUN ("keyboard-coding-system",
9543        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9544        doc: /* Return coding system specified for decoding keyboard input.  */)
9545   (Lisp_Object terminal)
9546 {
9547   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9548                          (get_terminal (terminal, 1))->id);
9549 }
9550
9551 \f
9552 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9553        Sfind_operation_coding_system,  1, MANY, 0,
9554        doc: /* Choose a coding system for an operation based on the target name.
9555 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9556 DECODING-SYSTEM is the coding system to use for decoding
9557 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9558 for encoding (in case OPERATION does encoding).
9559
9560 The first argument OPERATION specifies an I/O primitive:
9561   For file I/O, `insert-file-contents' or `write-region'.
9562   For process I/O, `call-process', `call-process-region', or `start-process'.
9563   For network I/O, `open-network-stream'.
9564
9565 The remaining arguments should be the same arguments that were passed
9566 to the primitive.  Depending on which primitive, one of those arguments
9567 is selected as the TARGET.  For example, if OPERATION does file I/O,
9568 whichever argument specifies the file name is TARGET.
9569
9570 TARGET has a meaning which depends on OPERATION:
9571   For file I/O, TARGET is a file name (except for the special case below).
9572   For process I/O, TARGET is a process name.
9573   For network I/O, TARGET is a service name or a port number.
9574
9575 This function looks up what is specified for TARGET in
9576 `file-coding-system-alist', `process-coding-system-alist',
9577 or `network-coding-system-alist' depending on OPERATION.
9578 They may specify a coding system, a cons of coding systems,
9579 or a function symbol to call.
9580 In the last case, we call the function with one argument,
9581 which is a list of all the arguments given to this function.
9582 If the function can't decide a coding system, it can return
9583 `undecided' so that the normal code-detection is performed.
9584
9585 If OPERATION is `insert-file-contents', the argument corresponding to
9586 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9587 file name to look up, and BUFFER is a buffer that contains the file's
9588 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9589 function to call for FILENAME, that function should examine the
9590 contents of BUFFER instead of reading the file.
9591
9592 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9593   (ptrdiff_t nargs, Lisp_Object *args)
9594 {
9595   Lisp_Object operation, target_idx, target, val;
9596   register Lisp_Object chain;
9597
9598   if (nargs < 2)
9599     error ("Too few arguments");
9600   operation = args[0];
9601   if (!SYMBOLP (operation)
9602       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9603     error ("Invalid first argument");
9604   if (nargs <= 1 + XFASTINT (target_idx))
9605     error ("Too few arguments for operation `%s'",
9606            SDATA (SYMBOL_NAME (operation)));
9607   target = args[XFASTINT (target_idx) + 1];
9608   if (!(STRINGP (target)
9609         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9610             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9611         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9612     error ("Invalid argument %"pI"d of operation `%s'",
9613            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9614   if (CONSP (target))
9615     target = XCAR (target);
9616
9617   chain = ((EQ (operation, Qinsert_file_contents)
9618             || EQ (operation, Qwrite_region))
9619            ? Vfile_coding_system_alist
9620            : (EQ (operation, Qopen_network_stream)
9621               ? Vnetwork_coding_system_alist
9622               : Vprocess_coding_system_alist));
9623   if (NILP (chain))
9624     return Qnil;
9625
9626   for (; CONSP (chain); chain = XCDR (chain))
9627     {
9628       Lisp_Object elt;
9629
9630       elt = XCAR (chain);
9631       if (CONSP (elt)
9632           && ((STRINGP (target)
9633                && STRINGP (XCAR (elt))
9634                && fast_string_match (XCAR (elt), target) >= 0)
9635               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9636         {
9637           val = XCDR (elt);
9638           /* Here, if VAL is both a valid coding system and a valid
9639              function symbol, we return VAL as a coding system.  */
9640           if (CONSP (val))
9641             return val;
9642           if (! SYMBOLP (val))
9643             return Qnil;
9644           if (! NILP (Fcoding_system_p (val)))
9645             return Fcons (val, val);
9646           if (! NILP (Ffboundp (val)))
9647             {
9648               /* We use call1 rather than safe_call1
9649                  so as to get bug reports about functions called here
9650                  which don't handle the current interface.  */
9651               val = call1 (val, Flist (nargs, args));
9652               if (CONSP (val))
9653                 return val;
9654               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9655                 return Fcons (val, val);
9656             }
9657           return Qnil;
9658         }
9659     }
9660   return Qnil;
9661 }
9662
9663 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9664        Sset_coding_system_priority, 0, MANY, 0,
9665        doc: /* Assign higher priority to the coding systems given as arguments.
9666 If multiple coding systems belong to the same category,
9667 all but the first one are ignored.
9668
9669 usage: (set-coding-system-priority &rest coding-systems)  */)
9670   (ptrdiff_t nargs, Lisp_Object *args)
9671 {
9672   ptrdiff_t i, j;
9673   bool changed[coding_category_max];
9674   enum coding_category priorities[coding_category_max];
9675
9676   memset (changed, 0, sizeof changed);
9677
9678   for (i = j = 0; i < nargs; i++)
9679     {
9680       enum coding_category category;
9681       Lisp_Object spec, attrs;
9682
9683       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9684       attrs = AREF (spec, 0);
9685       category = XINT (CODING_ATTR_CATEGORY (attrs));
9686       if (changed[category])
9687         /* Ignore this coding system because a coding system of the
9688            same category already had a higher priority.  */
9689         continue;
9690       changed[category] = 1;
9691       priorities[j++] = category;
9692       if (coding_categories[category].id >= 0
9693           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9694         setup_coding_system (args[i], &coding_categories[category]);
9695       Fset (AREF (Vcoding_category_table, category), args[i]);
9696     }
9697
9698   /* Now we have decided top J priorities.  Reflect the order of the
9699      original priorities to the remaining priorities.  */
9700
9701   for (i = j, j = 0; i < coding_category_max; i++, j++)
9702     {
9703       while (j < coding_category_max
9704              && changed[coding_priorities[j]])
9705         j++;
9706       if (j == coding_category_max)
9707         emacs_abort ();
9708       priorities[i] = coding_priorities[j];
9709     }
9710
9711   memcpy (coding_priorities, priorities, sizeof priorities);
9712
9713   /* Update `coding-category-list'.  */
9714   Vcoding_category_list = Qnil;
9715   for (i = coding_category_max; i-- > 0; )
9716     Vcoding_category_list
9717       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9718                Vcoding_category_list);
9719
9720   return Qnil;
9721 }
9722
9723 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9724        Scoding_system_priority_list, 0, 1, 0,
9725        doc: /* Return a list of coding systems ordered by their priorities.
9726 The list contains a subset of coding systems; i.e. coding systems
9727 assigned to each coding category (see `coding-category-list').
9728
9729 HIGHESTP non-nil means just return the highest priority one.  */)
9730   (Lisp_Object highestp)
9731 {
9732   int i;
9733   Lisp_Object val;
9734
9735   for (i = 0, val = Qnil; i < coding_category_max; i++)
9736     {
9737       enum coding_category category = coding_priorities[i];
9738       int id = coding_categories[category].id;
9739       Lisp_Object attrs;
9740
9741       if (id < 0)
9742         continue;
9743       attrs = CODING_ID_ATTRS (id);
9744       if (! NILP (highestp))
9745         return CODING_ATTR_BASE_NAME (attrs);
9746       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9747     }
9748   return Fnreverse (val);
9749 }
9750
9751 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9752
9753 static Lisp_Object
9754 make_subsidiaries (Lisp_Object base)
9755 {
9756   Lisp_Object subsidiaries;
9757   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9758   char *buf = alloca (base_name_len + 6);
9759   int i;
9760
9761   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9762   subsidiaries = make_uninit_vector (3);
9763   for (i = 0; i < 3; i++)
9764     {
9765       strcpy (buf + base_name_len, suffixes[i]);
9766       ASET (subsidiaries, i, intern (buf));
9767     }
9768   return subsidiaries;
9769 }
9770
9771
9772 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9773        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9774        doc: /* For internal use only.
9775 usage: (define-coding-system-internal ...)  */)
9776   (ptrdiff_t nargs, Lisp_Object *args)
9777 {
9778   Lisp_Object name;
9779   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9780   Lisp_Object attrs;            /* Vector of attributes.  */
9781   Lisp_Object eol_type;
9782   Lisp_Object aliases;
9783   Lisp_Object coding_type, charset_list, safe_charsets;
9784   enum coding_category category;
9785   Lisp_Object tail, val;
9786   int max_charset_id = 0;
9787   int i;
9788
9789   if (nargs < coding_arg_max)
9790     goto short_args;
9791
9792   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9793
9794   name = args[coding_arg_name];
9795   CHECK_SYMBOL (name);
9796   ASET (attrs, coding_attr_base_name, name);
9797
9798   val = args[coding_arg_mnemonic];
9799   if (! STRINGP (val))
9800     CHECK_CHARACTER (val);
9801   ASET (attrs, coding_attr_mnemonic, val);
9802
9803   coding_type = args[coding_arg_coding_type];
9804   CHECK_SYMBOL (coding_type);
9805   ASET (attrs, coding_attr_type, coding_type);
9806
9807   charset_list = args[coding_arg_charset_list];
9808   if (SYMBOLP (charset_list))
9809     {
9810       if (EQ (charset_list, Qiso_2022))
9811         {
9812           if (! EQ (coding_type, Qiso_2022))
9813             error ("Invalid charset-list");
9814           charset_list = Viso_2022_charset_list;
9815         }
9816       else if (EQ (charset_list, Qemacs_mule))
9817         {
9818           if (! EQ (coding_type, Qemacs_mule))
9819             error ("Invalid charset-list");
9820           charset_list = Vemacs_mule_charset_list;
9821         }
9822       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9823         {
9824           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9825             error ("Invalid charset-list");
9826           if (max_charset_id < XFASTINT (XCAR (tail)))
9827             max_charset_id = XFASTINT (XCAR (tail));
9828         }
9829     }
9830   else
9831     {
9832       charset_list = Fcopy_sequence (charset_list);
9833       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9834         {
9835           struct charset *charset;
9836
9837           val = XCAR (tail);
9838           CHECK_CHARSET_GET_CHARSET (val, charset);
9839           if (EQ (coding_type, Qiso_2022)
9840               ? CHARSET_ISO_FINAL (charset) < 0
9841               : EQ (coding_type, Qemacs_mule)
9842               ? CHARSET_EMACS_MULE_ID (charset) < 0
9843               : 0)
9844             error ("Can't handle charset `%s'",
9845                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9846
9847           XSETCAR (tail, make_number (charset->id));
9848           if (max_charset_id < charset->id)
9849             max_charset_id = charset->id;
9850         }
9851     }
9852   ASET (attrs, coding_attr_charset_list, charset_list);
9853
9854   safe_charsets = make_uninit_string (max_charset_id + 1);
9855   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9856   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9857     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9858   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9859
9860   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9861
9862   val = args[coding_arg_decode_translation_table];
9863   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9864     CHECK_SYMBOL (val);
9865   ASET (attrs, coding_attr_decode_tbl, val);
9866
9867   val = args[coding_arg_encode_translation_table];
9868   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9869     CHECK_SYMBOL (val);
9870   ASET (attrs, coding_attr_encode_tbl, val);
9871
9872   val = args[coding_arg_post_read_conversion];
9873   CHECK_SYMBOL (val);
9874   ASET (attrs, coding_attr_post_read, val);
9875
9876   val = args[coding_arg_pre_write_conversion];
9877   CHECK_SYMBOL (val);
9878   ASET (attrs, coding_attr_pre_write, val);
9879
9880   val = args[coding_arg_default_char];
9881   if (NILP (val))
9882     ASET (attrs, coding_attr_default_char, make_number (' '));
9883   else
9884     {
9885       CHECK_CHARACTER (val);
9886       ASET (attrs, coding_attr_default_char, val);
9887     }
9888
9889   val = args[coding_arg_for_unibyte];
9890   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9891
9892   val = args[coding_arg_plist];
9893   CHECK_LIST (val);
9894   ASET (attrs, coding_attr_plist, val);
9895
9896   if (EQ (coding_type, Qcharset))
9897     {
9898       /* Generate a lisp vector of 256 elements.  Each element is nil,
9899          integer, or a list of charset IDs.
9900
9901          If Nth element is nil, the byte code N is invalid in this
9902          coding system.
9903
9904          If Nth element is a number NUM, N is the first byte of a
9905          charset whose ID is NUM.
9906
9907          If Nth element is a list of charset IDs, N is the first byte
9908          of one of them.  The list is sorted by dimensions of the
9909          charsets.  A charset of smaller dimension comes first. */
9910       val = Fmake_vector (make_number (256), Qnil);
9911
9912       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9913         {
9914           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9915           int dim = CHARSET_DIMENSION (charset);
9916           int idx = (dim - 1) * 4;
9917
9918           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9919             ASET (attrs, coding_attr_ascii_compat, Qt);
9920
9921           for (i = charset->code_space[idx];
9922                i <= charset->code_space[idx + 1]; i++)
9923             {
9924               Lisp_Object tmp, tmp2;
9925               int dim2;
9926
9927               tmp = AREF (val, i);
9928               if (NILP (tmp))
9929                 tmp = XCAR (tail);
9930               else if (NUMBERP (tmp))
9931                 {
9932                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9933                   if (dim < dim2)
9934                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9935                   else
9936                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9937                 }
9938               else
9939                 {
9940                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9941                     {
9942                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9943                       if (dim < dim2)
9944                         break;
9945                     }
9946                   if (NILP (tmp2))
9947                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9948                   else
9949                     {
9950                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9951                       XSETCAR (tmp2, XCAR (tail));
9952                     }
9953                 }
9954               ASET (val, i, tmp);
9955             }
9956         }
9957       ASET (attrs, coding_attr_charset_valids, val);
9958       category = coding_category_charset;
9959     }
9960   else if (EQ (coding_type, Qccl))
9961     {
9962       Lisp_Object valids;
9963
9964       if (nargs < coding_arg_ccl_max)
9965         goto short_args;
9966
9967       val = args[coding_arg_ccl_decoder];
9968       CHECK_CCL_PROGRAM (val);
9969       if (VECTORP (val))
9970         val = Fcopy_sequence (val);
9971       ASET (attrs, coding_attr_ccl_decoder, val);
9972
9973       val = args[coding_arg_ccl_encoder];
9974       CHECK_CCL_PROGRAM (val);
9975       if (VECTORP (val))
9976         val = Fcopy_sequence (val);
9977       ASET (attrs, coding_attr_ccl_encoder, val);
9978
9979       val = args[coding_arg_ccl_valids];
9980       valids = Fmake_string (make_number (256), make_number (0));
9981       for (tail = val; CONSP (tail); tail = XCDR (tail))
9982         {
9983           int from, to;
9984
9985           val = XCAR (tail);
9986           if (INTEGERP (val))
9987             {
9988               if (! (0 <= XINT (val) && XINT (val) <= 255))
9989                 args_out_of_range_3 (val, make_number (0), make_number (255));
9990               from = to = XINT (val);
9991             }
9992           else
9993             {
9994               CHECK_CONS (val);
9995               CHECK_NATNUM_CAR (val);
9996               CHECK_NUMBER_CDR (val);
9997               if (XINT (XCAR (val)) > 255)
9998                 args_out_of_range_3 (XCAR (val),
9999                                      make_number (0), make_number (255));
10000               from = XINT (XCAR (val));
10001               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10002                 args_out_of_range_3 (XCDR (val),
10003                                      XCAR (val), make_number (255));
10004               to = XINT (XCDR (val));
10005             }
10006           for (i = from; i <= to; i++)
10007             SSET (valids, i, 1);
10008         }
10009       ASET (attrs, coding_attr_ccl_valids, valids);
10010
10011       category = coding_category_ccl;
10012     }
10013   else if (EQ (coding_type, Qutf_16))
10014     {
10015       Lisp_Object bom, endian;
10016
10017       ASET (attrs, coding_attr_ascii_compat, Qnil);
10018
10019       if (nargs < coding_arg_utf16_max)
10020         goto short_args;
10021
10022       bom = args[coding_arg_utf16_bom];
10023       if (! NILP (bom) && ! EQ (bom, Qt))
10024         {
10025           CHECK_CONS (bom);
10026           val = XCAR (bom);
10027           CHECK_CODING_SYSTEM (val);
10028           val = XCDR (bom);
10029           CHECK_CODING_SYSTEM (val);
10030         }
10031       ASET (attrs, coding_attr_utf_bom, bom);
10032
10033       endian = args[coding_arg_utf16_endian];
10034       CHECK_SYMBOL (endian);
10035       if (NILP (endian))
10036         endian = Qbig;
10037       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10038         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10039       ASET (attrs, coding_attr_utf_16_endian, endian);
10040
10041       category = (CONSP (bom)
10042                   ? coding_category_utf_16_auto
10043                   : NILP (bom)
10044                   ? (EQ (endian, Qbig)
10045                      ? coding_category_utf_16_be_nosig
10046                      : coding_category_utf_16_le_nosig)
10047                   : (EQ (endian, Qbig)
10048                      ? coding_category_utf_16_be
10049                      : coding_category_utf_16_le));
10050     }
10051   else if (EQ (coding_type, Qiso_2022))
10052     {
10053       Lisp_Object initial, reg_usage, request, flags;
10054
10055       if (nargs < coding_arg_iso2022_max)
10056         goto short_args;
10057
10058       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10059       CHECK_VECTOR (initial);
10060       for (i = 0; i < 4; i++)
10061         {
10062           val = AREF (initial, i);
10063           if (! NILP (val))
10064             {
10065               struct charset *charset;
10066
10067               CHECK_CHARSET_GET_CHARSET (val, charset);
10068               ASET (initial, i, make_number (CHARSET_ID (charset)));
10069               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10070                 ASET (attrs, coding_attr_ascii_compat, Qt);
10071             }
10072           else
10073             ASET (initial, i, make_number (-1));
10074         }
10075
10076       reg_usage = args[coding_arg_iso2022_reg_usage];
10077       CHECK_CONS (reg_usage);
10078       CHECK_NUMBER_CAR (reg_usage);
10079       CHECK_NUMBER_CDR (reg_usage);
10080
10081       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10082       for (tail = request; CONSP (tail); tail = XCDR (tail))
10083         {
10084           int id;
10085           Lisp_Object tmp1;
10086
10087           val = XCAR (tail);
10088           CHECK_CONS (val);
10089           tmp1 = XCAR (val);
10090           CHECK_CHARSET_GET_ID (tmp1, id);
10091           CHECK_NATNUM_CDR (val);
10092           if (XINT (XCDR (val)) >= 4)
10093             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10094           XSETCAR (val, make_number (id));
10095         }
10096
10097       flags = args[coding_arg_iso2022_flags];
10098       CHECK_NATNUM (flags);
10099       i = XINT (flags) & INT_MAX;
10100       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10101         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10102       flags = make_number (i);
10103
10104       ASET (attrs, coding_attr_iso_initial, initial);
10105       ASET (attrs, coding_attr_iso_usage, reg_usage);
10106       ASET (attrs, coding_attr_iso_request, request);
10107       ASET (attrs, coding_attr_iso_flags, flags);
10108       setup_iso_safe_charsets (attrs);
10109
10110       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10111         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10112                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10113                     ? coding_category_iso_7_else
10114                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10115                     ? coding_category_iso_7
10116                     : coding_category_iso_7_tight);
10117       else
10118         {
10119           int id = XINT (AREF (initial, 1));
10120
10121           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10122                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10123                        || id < 0)
10124                       ? coding_category_iso_8_else
10125                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10126                       ? coding_category_iso_8_1
10127                       : coding_category_iso_8_2);
10128         }
10129       if (category != coding_category_iso_8_1
10130           && category != coding_category_iso_8_2)
10131         ASET (attrs, coding_attr_ascii_compat, Qnil);
10132     }
10133   else if (EQ (coding_type, Qemacs_mule))
10134     {
10135       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10136         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10137       ASET (attrs, coding_attr_ascii_compat, Qt);
10138       category = coding_category_emacs_mule;
10139     }
10140   else if (EQ (coding_type, Qshift_jis))
10141     {
10142
10143       struct charset *charset;
10144
10145       if (XINT (Flength (charset_list)) != 3
10146           && XINT (Flength (charset_list)) != 4)
10147         error ("There should be three or four charsets");
10148
10149       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10150       if (CHARSET_DIMENSION (charset) != 1)
10151         error ("Dimension of charset %s is not one",
10152                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10153       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10154         ASET (attrs, coding_attr_ascii_compat, Qt);
10155
10156       charset_list = XCDR (charset_list);
10157       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10158       if (CHARSET_DIMENSION (charset) != 1)
10159         error ("Dimension of charset %s is not one",
10160                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10161
10162       charset_list = XCDR (charset_list);
10163       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10164       if (CHARSET_DIMENSION (charset) != 2)
10165         error ("Dimension of charset %s is not two",
10166                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10167
10168       charset_list = XCDR (charset_list);
10169       if (! NILP (charset_list))
10170         {
10171           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10172           if (CHARSET_DIMENSION (charset) != 2)
10173             error ("Dimension of charset %s is not two",
10174                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10175         }
10176
10177       category = coding_category_sjis;
10178       Vsjis_coding_system = name;
10179     }
10180   else if (EQ (coding_type, Qbig5))
10181     {
10182       struct charset *charset;
10183
10184       if (XINT (Flength (charset_list)) != 2)
10185         error ("There should be just two charsets");
10186
10187       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10188       if (CHARSET_DIMENSION (charset) != 1)
10189         error ("Dimension of charset %s is not one",
10190                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10191       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10192         ASET (attrs, coding_attr_ascii_compat, Qt);
10193
10194       charset_list = XCDR (charset_list);
10195       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10196       if (CHARSET_DIMENSION (charset) != 2)
10197         error ("Dimension of charset %s is not two",
10198                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10199
10200       category = coding_category_big5;
10201       Vbig5_coding_system = name;
10202     }
10203   else if (EQ (coding_type, Qraw_text))
10204     {
10205       category = coding_category_raw_text;
10206       ASET (attrs, coding_attr_ascii_compat, Qt);
10207     }
10208   else if (EQ (coding_type, Qutf_8))
10209     {
10210       Lisp_Object bom;
10211
10212       if (nargs < coding_arg_utf8_max)
10213         goto short_args;
10214
10215       bom = args[coding_arg_utf8_bom];
10216       if (! NILP (bom) && ! EQ (bom, Qt))
10217         {
10218           CHECK_CONS (bom);
10219           val = XCAR (bom);
10220           CHECK_CODING_SYSTEM (val);
10221           val = XCDR (bom);
10222           CHECK_CODING_SYSTEM (val);
10223         }
10224       ASET (attrs, coding_attr_utf_bom, bom);
10225       if (NILP (bom))
10226         ASET (attrs, coding_attr_ascii_compat, Qt);
10227
10228       category = (CONSP (bom) ? coding_category_utf_8_auto
10229                   : NILP (bom) ? coding_category_utf_8_nosig
10230                   : coding_category_utf_8_sig);
10231     }
10232   else if (EQ (coding_type, Qundecided))
10233     category = coding_category_undecided;
10234   else
10235     error ("Invalid coding system type: %s",
10236            SDATA (SYMBOL_NAME (coding_type)));
10237
10238   ASET (attrs, coding_attr_category, make_number (category));
10239   ASET (attrs, coding_attr_plist,
10240         Fcons (QCcategory,
10241                Fcons (AREF (Vcoding_category_table, category),
10242                       CODING_ATTR_PLIST (attrs))));
10243   ASET (attrs, coding_attr_plist,
10244         Fcons (QCascii_compatible_p,
10245                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10246                       CODING_ATTR_PLIST (attrs))));
10247
10248   eol_type = args[coding_arg_eol_type];
10249   if (! NILP (eol_type)
10250       && ! EQ (eol_type, Qunix)
10251       && ! EQ (eol_type, Qdos)
10252       && ! EQ (eol_type, Qmac))
10253     error ("Invalid eol-type");
10254
10255   aliases = Fcons (name, Qnil);
10256
10257   if (NILP (eol_type))
10258     {
10259       eol_type = make_subsidiaries (name);
10260       for (i = 0; i < 3; i++)
10261         {
10262           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10263
10264           this_name = AREF (eol_type, i);
10265           this_aliases = Fcons (this_name, Qnil);
10266           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10267           this_spec = make_uninit_vector (3);
10268           ASET (this_spec, 0, attrs);
10269           ASET (this_spec, 1, this_aliases);
10270           ASET (this_spec, 2, this_eol_type);
10271           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10272           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10273           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10274           if (NILP (val))
10275             Vcoding_system_alist
10276               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10277                        Vcoding_system_alist);
10278         }
10279     }
10280
10281   spec_vec = make_uninit_vector (3);
10282   ASET (spec_vec, 0, attrs);
10283   ASET (spec_vec, 1, aliases);
10284   ASET (spec_vec, 2, eol_type);
10285
10286   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10287   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10288   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10289   if (NILP (val))
10290     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10291                                   Vcoding_system_alist);
10292
10293   {
10294     int id = coding_categories[category].id;
10295
10296     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10297       setup_coding_system (name, &coding_categories[category]);
10298   }
10299
10300   return Qnil;
10301
10302  short_args:
10303   return Fsignal (Qwrong_number_of_arguments,
10304                   Fcons (intern ("define-coding-system-internal"),
10305                          make_number (nargs)));
10306 }
10307
10308
10309 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10310        3, 3, 0,
10311        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10312   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10313 {
10314   Lisp_Object spec, attrs;
10315
10316   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10317   attrs = AREF (spec, 0);
10318   if (EQ (prop, QCmnemonic))
10319     {
10320       if (! STRINGP (val))
10321         CHECK_CHARACTER (val);
10322       ASET (attrs, coding_attr_mnemonic, val);
10323     }
10324   else if (EQ (prop, QCdefault_char))
10325     {
10326       if (NILP (val))
10327         val = make_number (' ');
10328       else
10329         CHECK_CHARACTER (val);
10330       ASET (attrs, coding_attr_default_char, val);
10331     }
10332   else if (EQ (prop, QCdecode_translation_table))
10333     {
10334       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10335         CHECK_SYMBOL (val);
10336       ASET (attrs, coding_attr_decode_tbl, val);
10337     }
10338   else if (EQ (prop, QCencode_translation_table))
10339     {
10340       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10341         CHECK_SYMBOL (val);
10342       ASET (attrs, coding_attr_encode_tbl, val);
10343     }
10344   else if (EQ (prop, QCpost_read_conversion))
10345     {
10346       CHECK_SYMBOL (val);
10347       ASET (attrs, coding_attr_post_read, val);
10348     }
10349   else if (EQ (prop, QCpre_write_conversion))
10350     {
10351       CHECK_SYMBOL (val);
10352       ASET (attrs, coding_attr_pre_write, val);
10353     }
10354   else if (EQ (prop, QCascii_compatible_p))
10355     {
10356       ASET (attrs, coding_attr_ascii_compat, val);
10357     }
10358
10359   ASET (attrs, coding_attr_plist,
10360         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10361   return val;
10362 }
10363
10364
10365 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10366        Sdefine_coding_system_alias, 2, 2, 0,
10367        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10368   (Lisp_Object alias, Lisp_Object coding_system)
10369 {
10370   Lisp_Object spec, aliases, eol_type, val;
10371
10372   CHECK_SYMBOL (alias);
10373   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10374   aliases = AREF (spec, 1);
10375   /* ALIASES should be a list of length more than zero, and the first
10376      element is a base coding system.  Append ALIAS at the tail of the
10377      list.  */
10378   while (!NILP (XCDR (aliases)))
10379     aliases = XCDR (aliases);
10380   XSETCDR (aliases, Fcons (alias, Qnil));
10381
10382   eol_type = AREF (spec, 2);
10383   if (VECTORP (eol_type))
10384     {
10385       Lisp_Object subsidiaries;
10386       int i;
10387
10388       subsidiaries = make_subsidiaries (alias);
10389       for (i = 0; i < 3; i++)
10390         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10391                                      AREF (eol_type, i));
10392     }
10393
10394   Fputhash (alias, spec, Vcoding_system_hash_table);
10395   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10396   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10397   if (NILP (val))
10398     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10399                                   Vcoding_system_alist);
10400
10401   return Qnil;
10402 }
10403
10404 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10405        1, 1, 0,
10406        doc: /* Return the base of CODING-SYSTEM.
10407 Any alias or subsidiary coding system is not a base coding system.  */)
10408   (Lisp_Object coding_system)
10409 {
10410   Lisp_Object spec, attrs;
10411
10412   if (NILP (coding_system))
10413     return (Qno_conversion);
10414   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10415   attrs = AREF (spec, 0);
10416   return CODING_ATTR_BASE_NAME (attrs);
10417 }
10418
10419 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10420        1, 1, 0,
10421        doc: "Return the property list of CODING-SYSTEM.")
10422   (Lisp_Object coding_system)
10423 {
10424   Lisp_Object spec, attrs;
10425
10426   if (NILP (coding_system))
10427     coding_system = Qno_conversion;
10428   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10429   attrs = AREF (spec, 0);
10430   return CODING_ATTR_PLIST (attrs);
10431 }
10432
10433
10434 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10435        1, 1, 0,
10436        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10437   (Lisp_Object coding_system)
10438 {
10439   Lisp_Object spec;
10440
10441   if (NILP (coding_system))
10442     coding_system = Qno_conversion;
10443   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10444   return AREF (spec, 1);
10445 }
10446
10447 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10448        Scoding_system_eol_type, 1, 1, 0,
10449        doc: /* Return eol-type of CODING-SYSTEM.
10450 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10451
10452 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10453 and CR respectively.
10454
10455 A vector value indicates that a format of end-of-line should be
10456 detected automatically.  Nth element of the vector is the subsidiary
10457 coding system whose eol-type is N.  */)
10458   (Lisp_Object coding_system)
10459 {
10460   Lisp_Object spec, eol_type;
10461   int n;
10462
10463   if (NILP (coding_system))
10464     coding_system = Qno_conversion;
10465   if (! CODING_SYSTEM_P (coding_system))
10466     return Qnil;
10467   spec = CODING_SYSTEM_SPEC (coding_system);
10468   eol_type = AREF (spec, 2);
10469   if (VECTORP (eol_type))
10470     return Fcopy_sequence (eol_type);
10471   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10472   return make_number (n);
10473 }
10474
10475 #endif /* emacs */
10476
10477 \f
10478 /*** 12. Postamble ***/
10479
10480 void
10481 init_coding_once (void)
10482 {
10483   int i;
10484
10485   for (i = 0; i < coding_category_max; i++)
10486     {
10487       coding_categories[i].id = -1;
10488       coding_priorities[i] = i;
10489     }
10490
10491   /* ISO2022 specific initialize routine.  */
10492   for (i = 0; i < 0x20; i++)
10493     iso_code_class[i] = ISO_control_0;
10494   for (i = 0x21; i < 0x7F; i++)
10495     iso_code_class[i] = ISO_graphic_plane_0;
10496   for (i = 0x80; i < 0xA0; i++)
10497     iso_code_class[i] = ISO_control_1;
10498   for (i = 0xA1; i < 0xFF; i++)
10499     iso_code_class[i] = ISO_graphic_plane_1;
10500   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10501   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10502   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10503   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10504   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10505   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10506   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10507   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10508   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10509
10510   for (i = 0; i < 256; i++)
10511     {
10512       emacs_mule_bytes[i] = 1;
10513     }
10514   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10515   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10516   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10517   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10518 }
10519
10520 #ifdef emacs
10521
/* Set up the Lisp-visible state of this file: the C objects
   registered with staticpro, the symbols created with DEFSYM, the
   primitives registered with defsubr, and the variables declared with
   DEFVAR_LISP / DEFVAR_BOOL together with their initial values.
   At the end it defines the two coding systems `no-conversion' and
   `undecided', sets up safe_terminal_coding from `no-conversion',
   sets the value of every coding-category symbol to `no-conversion',
   and chooses system_eol_type depending on DOS_NT.  */
10522 void
10523 syms_of_coding (void)
10524 {
10525   staticpro (&Vcoding_system_hash_table);
10526   {
        /* Fmake_hash_table is called with the two arguments QCtest
           and Qeq.  */
10527     Lisp_Object args[2];
10528     args[0] = QCtest;
10529     args[1] = Qeq;
10530     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10531   }
10532
10533   staticpro (&Vsjis_coding_system);
10534   Vsjis_coding_system = Qnil;
10535
10536   staticpro (&Vbig5_coding_system);
10537   Vbig5_coding_system = Qnil;
10538
10539   staticpro (&Vcode_conversion_reused_workbuf);
10540   Vcode_conversion_reused_workbuf = Qnil;
10541
10542   staticpro (&Vcode_conversion_workbuf_name);
10543   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10544
10545   reused_workbuf_in_use = 0;
10546
10547   DEFSYM (Qcharset, "charset");
10548   DEFSYM (Qtarget_idx, "target-idx");
10549   DEFSYM (Qcoding_system_history, "coding-system-history");
      /* The history variable starts out as nil.  */
10550   Fset (Qcoding_system_history, Qnil);
10551
      /* Each `target-idx' property below is the zero-based index of
         the argument named in the accompanying comment.  */
10552   /* Target FILENAME is the first argument.  */
10553   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10554   /* Target FILENAME is the third argument.  */
10555   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10556
10557   DEFSYM (Qcall_process, "call-process");
10558   /* Target PROGRAM is the first argument.  */
10559   Fput (Qcall_process, Qtarget_idx, make_number (0));
10560
10561   DEFSYM (Qcall_process_region, "call-process-region");
10562   /* Target PROGRAM is the third argument.  */
10563   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10564
10565   DEFSYM (Qstart_process, "start-process");
10566   /* Target PROGRAM is the third argument.  */
10567   Fput (Qstart_process, Qtarget_idx, make_number (2));
10568
10569   DEFSYM (Qopen_network_stream, "open-network-stream");
10570   /* Target SERVICE is the fourth argument.  */
10571   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10572
10573   DEFSYM (Qcoding_system, "coding-system");
10574   DEFSYM (Qcoding_aliases, "coding-aliases");
10575
10576   DEFSYM (Qeol_type, "eol-type");
10577   DEFSYM (Qunix, "unix");
10578   DEFSYM (Qdos, "dos");
10579   DEFSYM (Qmac, "mac");
10580
10581   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10582   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10583   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10584   DEFSYM (Qdefault_char, "default-char");
10585   DEFSYM (Qundecided, "undecided");
10586   DEFSYM (Qno_conversion, "no-conversion");
10587   DEFSYM (Qraw_text, "raw-text");
10588
10589   DEFSYM (Qiso_2022, "iso-2022");
10590
10591   DEFSYM (Qutf_8, "utf-8");
10592   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10593
10594 #if defined (WINDOWSNT) || defined (CYGWIN)
10595   /* No, not utf-16-le: that one has a BOM.  */
10596   DEFSYM (Qutf_16le, "utf-16le");
10597 #endif
10598
10599   DEFSYM (Qutf_16, "utf-16");
10600   DEFSYM (Qbig, "big");
10601   DEFSYM (Qlittle, "little");
10602
10603   DEFSYM (Qshift_jis, "shift-jis");
10604   DEFSYM (Qbig5, "big5");
10605
10606   DEFSYM (Qcoding_system_p, "coding-system-p");
10607
      /* `coding-system-error' gets the error conditions
         (coding-system-error error) and its own error message.  */
10608   DEFSYM (Qcoding_system_error, "coding-system-error");
10609   Fput (Qcoding_system_error, Qerror_conditions,
10610         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10611   Fput (Qcoding_system_error, Qerror_message,
10612         build_pure_c_string ("Invalid coding system"));
10613
10614   /* Intern this now in case it isn't already done.
10615      Setting this variable twice is harmless.
10616      But don't staticpro it here--that is done in alloc.c.  */
10617   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10618
10619   DEFSYM (Qtranslation_table, "translation-table");
10620   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10621   DEFSYM (Qtranslation_table_id, "translation-table-id");
10622   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10623   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10624
10625   DEFSYM (Qvalid_codes, "valid-codes");
10626
10627   DEFSYM (Qemacs_mule, "emacs-mule");
10628
10629   DEFSYM (QCcategory, ":category");
10630   DEFSYM (QCmnemonic, ":mnemonic");
10631   DEFSYM (QCdefault_char, ":default-char");
10632   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10633   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10634   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10635   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10636   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10637
      /* Vcoding_category_table is a vector of coding_category_max
         slots; slot N holds the symbol naming coding category N.  */
10638   Vcoding_category_table
10639     = Fmake_vector (make_number (coding_category_max), Qnil);
10640   staticpro (&Vcoding_category_table);
10641   /* The following categories are targets of code detection.  */
10642   ASET (Vcoding_category_table, coding_category_iso_7,
10643         intern_c_string ("coding-category-iso-7"));
10644   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10645         intern_c_string ("coding-category-iso-7-tight"));
10646   ASET (Vcoding_category_table, coding_category_iso_8_1,
10647         intern_c_string ("coding-category-iso-8-1"));
10648   ASET (Vcoding_category_table, coding_category_iso_8_2,
10649         intern_c_string ("coding-category-iso-8-2"));
10650   ASET (Vcoding_category_table, coding_category_iso_7_else,
10651         intern_c_string ("coding-category-iso-7-else"));
10652   ASET (Vcoding_category_table, coding_category_iso_8_else,
10653         intern_c_string ("coding-category-iso-8-else"));
10654   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10655         intern_c_string ("coding-category-utf-8-auto"));
10656   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10657         intern_c_string ("coding-category-utf-8"));
10658   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10659         intern_c_string ("coding-category-utf-8-sig"));
10660   ASET (Vcoding_category_table, coding_category_utf_16_be,
10661         intern_c_string ("coding-category-utf-16-be"));
10662   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10663         intern_c_string ("coding-category-utf-16-auto"));
10664   ASET (Vcoding_category_table, coding_category_utf_16_le,
10665         intern_c_string ("coding-category-utf-16-le"));
10666   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10667         intern_c_string ("coding-category-utf-16-be-nosig"));
10668   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10669         intern_c_string ("coding-category-utf-16-le-nosig"));
10670   ASET (Vcoding_category_table, coding_category_charset,
10671         intern_c_string ("coding-category-charset"));
10672   ASET (Vcoding_category_table, coding_category_sjis,
10673         intern_c_string ("coding-category-sjis"));
10674   ASET (Vcoding_category_table, coding_category_big5,
10675         intern_c_string ("coding-category-big5"));
10676   ASET (Vcoding_category_table, coding_category_ccl,
10677         intern_c_string ("coding-category-ccl"));
10678   ASET (Vcoding_category_table, coding_category_emacs_mule,
10679         intern_c_string ("coding-category-emacs-mule"));
10680   /* The following categories are NOT targets of code detection.  */
10681   ASET (Vcoding_category_table, coding_category_raw_text,
10682         intern_c_string ("coding-category-raw-text"));
10683   ASET (Vcoding_category_table, coding_category_undecided,
10684         intern_c_string ("coding-category-undecided"));
10685
10686   DEFSYM (Qinsufficient_source, "insufficient-source");
10687   DEFSYM (Qinvalid_source, "invalid-source");
10688   DEFSYM (Qinterrupted, "interrupted");
10689   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10690
      /* Register the primitives defined in this file.  */
10691   defsubr (&Scoding_system_p);
10692   defsubr (&Sread_coding_system);
10693   defsubr (&Sread_non_nil_coding_system);
10694   defsubr (&Scheck_coding_system);
10695   defsubr (&Sdetect_coding_region);
10696   defsubr (&Sdetect_coding_string);
10697   defsubr (&Sfind_coding_systems_region_internal);
10698   defsubr (&Sunencodable_char_position);
10699   defsubr (&Scheck_coding_systems_region);
10700   defsubr (&Sdecode_coding_region);
10701   defsubr (&Sencode_coding_region);
10702   defsubr (&Sdecode_coding_string);
10703   defsubr (&Sencode_coding_string);
10704   defsubr (&Sdecode_sjis_char);
10705   defsubr (&Sencode_sjis_char);
10706   defsubr (&Sdecode_big5_char);
10707   defsubr (&Sencode_big5_char);
10708   defsubr (&Sset_terminal_coding_system_internal);
10709   defsubr (&Sset_safe_terminal_coding_system_internal);
10710   defsubr (&Sterminal_coding_system);
10711   defsubr (&Sset_keyboard_coding_system_internal);
10712   defsubr (&Skeyboard_coding_system);
10713   defsubr (&Sfind_operation_coding_system);
10714   defsubr (&Sset_coding_system_priority);
10715   defsubr (&Sdefine_coding_system_internal);
10716   defsubr (&Sdefine_coding_system_alias);
10717   defsubr (&Scoding_system_put);
10718   defsubr (&Scoding_system_base);
10719   defsubr (&Scoding_system_plist);
10720   defsubr (&Scoding_system_aliases);
10721   defsubr (&Scoding_system_eol_type);
10722   defsubr (&Scoding_system_priority_list);
10723
      /* Declare the Lisp variables; each declaration is followed by
         the assignment of the variable's initial value.  */
10724   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10725                doc: /* List of coding systems.
10726
10727 Do not alter the value of this variable manually.  This variable should be
10728 updated by the functions `define-coding-system' and
10729 `define-coding-system-alias'.  */);
10730   Vcoding_system_list = Qnil;
10731
10732   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10733                doc: /* Alist of coding system names.
10734 Each element is one element list of coding system name.
10735 This variable is given to `completing-read' as COLLECTION argument.
10736
10737 Do not alter the value of this variable manually.  This variable should be
10738 updated by the functions `make-coding-system' and
10739 `define-coding-system-alias'.  */);
10740   Vcoding_system_alist = Qnil;
10741
10742   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10743                doc: /* List of coding-categories (symbols) ordered by priority.
10744
10745 On detecting a coding system, Emacs tries code detection algorithms
10746 associated with each coding-category one by one in this order.  When
10747 one algorithm agrees with a byte sequence of source text, the coding
10748 system bound to the corresponding coding-category is selected.
10749
10750 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10751   {
10752     int i;
10753
        /* Cons from the highest index down, so that the finished list
           holds the symbols of Vcoding_category_table in ascending
           index order.  */
10754     Vcoding_category_list = Qnil;
10755     for (i = coding_category_max - 1; i >= 0; i--)
10756       Vcoding_category_list
10757         = Fcons (AREF (Vcoding_category_table, i),
10758                  Vcoding_category_list);
10759   }
10760
10761   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10762                doc: /* Specify the coding system for read operations.
10763 It is useful to bind this variable with `let', but do not set it globally.
10764 If the value is a coding system, it is used for decoding on read operation.
10765 If not, an appropriate element is used from one of the coding system alists.
10766 There are three such tables: `file-coding-system-alist',
10767 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10768   Vcoding_system_for_read = Qnil;
10769
10770   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10771                doc: /* Specify the coding system for write operations.
10772 Programs bind this variable with `let', but you should not set it globally.
10773 If the value is a coding system, it is used for encoding of output,
10774 when writing it to a file and when sending it to a file or subprocess.
10775
10776 If this does not specify a coding system, an appropriate element
10777 is used from one of the coding system alists.
10778 There are three such tables: `file-coding-system-alist',
10779 `process-coding-system-alist', and `network-coding-system-alist'.
10780 For output to files, if the above procedure does not specify a coding system,
10781 the value of `buffer-file-coding-system' is used.  */);
10782   Vcoding_system_for_write = Qnil;
10783
10784   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10785                doc: /*
10786 Coding system used in the latest file or process I/O.  */);
10787   Vlast_coding_system_used = Qnil;
10788
10789   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10790                doc: /*
10791 Error status of the last code conversion.
10792
10793 When an error was detected in the last code conversion, this variable
10794 is set to one of the following symbols.
10795   `insufficient-source'
10796   `inconsistent-eol'
10797   `invalid-source'
10798   `interrupted'
10799   `insufficient-memory'
10800 When no error was detected, the value doesn't change.  So, to check
10801 the error status of a code conversion by this variable, you must
10802 explicitly set this variable to nil before performing code
10803 conversion.  */);
10804   Vlast_code_conversion_error = Qnil;
10805
10806   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10807                doc: /*
10808 *Non-nil means always inhibit code conversion of end-of-line format.
10809 See info node `Coding Systems' and info node `Text and Binary' concerning
10810 such conversion.  */);
10811   inhibit_eol_conversion = 0;
10812
10813   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10814                doc: /*
10815 Non-nil means process buffer inherits coding system of process output.
10816 Bind it to t if the process output is to be treated as if it were a file
10817 read from some filesystem.  */);
10818   inherit_process_coding_system = 0;
10819
10820   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10821                doc: /*
10822 Alist to decide a coding system to use for a file I/O operation.
10823 The format is ((PATTERN . VAL) ...),
10824 where PATTERN is a regular expression matching a file name,
10825 VAL is a coding system, a cons of coding systems, or a function symbol.
10826 If VAL is a coding system, it is used for both decoding and encoding
10827 the file contents.
10828 If VAL is a cons of coding systems, the car part is used for decoding,
10829 and the cdr part is used for encoding.
10830 If VAL is a function symbol, the function must return a coding system
10831 or a cons of coding systems which are used as above.  The function is
10832 called with an argument that is a list of the arguments with which
10833 `find-operation-coding-system' was called.  If the function can't decide
10834 a coding system, it can return `undecided' so that the normal
10835 code-detection is performed.
10836
10837 See also the function `find-operation-coding-system'
10838 and the variable `auto-coding-alist'.  */);
10839   Vfile_coding_system_alist = Qnil;
10840
10841   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10842                doc: /*
10843 Alist to decide a coding system to use for a process I/O operation.
10844 The format is ((PATTERN . VAL) ...),
10845 where PATTERN is a regular expression matching a program name,
10846 VAL is a coding system, a cons of coding systems, or a function symbol.
10847 If VAL is a coding system, it is used for both decoding what received
10848 from the program and encoding what sent to the program.
10849 If VAL is a cons of coding systems, the car part is used for decoding,
10850 and the cdr part is used for encoding.
10851 If VAL is a function symbol, the function must return a coding system
10852 or a cons of coding systems which are used as above.
10853
10854 See also the function `find-operation-coding-system'.  */);
10855   Vprocess_coding_system_alist = Qnil;
10856
10857   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10858                doc: /*
10859 Alist to decide a coding system to use for a network I/O operation.
10860 The format is ((PATTERN . VAL) ...),
10861 where PATTERN is a regular expression matching a network service name
10862 or is a port number to connect to,
10863 VAL is a coding system, a cons of coding systems, or a function symbol.
10864 If VAL is a coding system, it is used for both decoding what received
10865 from the network stream and encoding what sent to the network stream.
10866 If VAL is a cons of coding systems, the car part is used for decoding,
10867 and the cdr part is used for encoding.
10868 If VAL is a function symbol, the function must return a coding system
10869 or a cons of coding systems which are used as above.
10870
10871 See also the function `find-operation-coding-system'.  */);
10872   Vnetwork_coding_system_alist = Qnil;
10873
10874   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10875                doc: /* Coding system to use with system messages.
10876 Also used for decoding keyboard input on X Window system.  */);
10877   Vlocale_coding_system = Qnil;
10878
10879   /* The eol mnemonics are reset in startup.el system-dependently.  */
10880   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10881                doc: /*
10882 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10883   eol_mnemonic_unix = build_pure_c_string (":");
10884
10885   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10886                doc: /*
10887 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10888   eol_mnemonic_dos = build_pure_c_string ("\\");
10889
10890   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10891                doc: /*
10892 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10893   eol_mnemonic_mac = build_pure_c_string ("/");
10894
10895   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10896                doc: /*
10897 *String displayed in mode line when end-of-line format is not yet determined.  */);
10898   eol_mnemonic_undecided = build_pure_c_string (":");
10899
10900   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10901                doc: /*
10902 *Non-nil enables character translation while encoding and decoding.  */);
10903   Venable_character_translation = Qt;
10904
10905   DEFVAR_LISP ("standard-translation-table-for-decode",
10906                Vstandard_translation_table_for_decode,
10907                doc: /* Table for translating characters while decoding.  */);
10908   Vstandard_translation_table_for_decode = Qnil;
10909
10910   DEFVAR_LISP ("standard-translation-table-for-encode",
10911                Vstandard_translation_table_for_encode,
10912                doc: /* Table for translating characters while encoding.  */);
10913   Vstandard_translation_table_for_encode = Qnil;
10914
10915   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10916                doc: /* Alist of charsets vs revision numbers.
10917 While encoding, if a charset (car part of an element) is found,
10918 designate it with the escape sequence identifying revision (cdr part
10919 of the element).  */);
10920   Vcharset_revision_table = Qnil;
10921
10922   DEFVAR_LISP ("default-process-coding-system",
10923                Vdefault_process_coding_system,
10924                doc: /* Cons of coding systems used for process I/O by default.
10925 The car part is used for decoding a process output,
10926 the cdr part is used for encoding a text to be sent to a process.  */);
10927   Vdefault_process_coding_system = Qnil;
10928
10929   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10930                doc: /*
10931 Table of extra Latin codes in the range 128..159 (inclusive).
10932 This is a vector of length 256.
10933 If Nth element is non-nil, the existence of code N in a file
10934 \(or output of subprocess) doesn't prevent it to be detected as
10935 a coding system of ISO 2022 variant which has a flag
10936 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10937 or reading output of a subprocess.
10938 Only 128th through 159th elements have a meaning.  */);
10939   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10940
10941   DEFVAR_LISP ("select-safe-coding-system-function",
10942                Vselect_safe_coding_system_function,
10943                doc: /*
10944 Function to call to select safe coding system for encoding a text.
10945
10946 If set, this function is called to force a user to select a proper
10947 coding system which can encode the text in the case that a default
10948 coding system used in each operation can't encode the text.  The
10949 function should take care that the buffer is not modified while
10950 the coding system is being selected.
10951
10952 The default value is `select-safe-coding-system' (which see).  */);
10953   Vselect_safe_coding_system_function = Qnil;
10954
10955   DEFVAR_BOOL ("coding-system-require-warning",
10956                coding_system_require_warning,
10957                doc: /* Internal use only.
10958 If non-nil, on writing a file, `select-safe-coding-system-function' is
10959 called even if `coding-system-for-write' is non-nil.  The command
10960 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10961   coding_system_require_warning = 0;
10962
10963
10964   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10965                inhibit_iso_escape_detection,
10966                doc: /*
10967 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10968
10969 When Emacs reads text, it tries to detect how the text is encoded.
10970 This code detection is sensitive to escape sequences.  If Emacs sees
10971 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10972 of the ISO2022 encodings, and decodes text by the corresponding coding
10973 system (e.g. `iso-2022-7bit').
10974
10975 However, there may be a case that you want to read escape sequences in
10976 a file as is.  In such a case, you can set this variable to non-nil.
10977 Then the code detection will ignore any escape sequences, and no text is
10978 detected as encoded in some ISO-2022 encoding.  The result is that all
10979 escape sequences become visible in a buffer.
10980
10981 The default value is nil, and it is strongly recommended not to change
10982 it.  That is because many Emacs Lisp source files that contain
10983 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10984 in Emacs's distribution, and they won't be decoded correctly on
10985 reading if you suppress escape sequence detection.
10986
10987 The other way to read escape sequences in a file without decoding is
10988 to explicitly specify some coding system that doesn't use ISO-2022
10989 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
10990   inhibit_iso_escape_detection = 0;
10991
10992   DEFVAR_BOOL ("inhibit-null-byte-detection",
10993                inhibit_null_byte_detection,
10994                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10995 By default, Emacs treats it as binary data, and does not attempt to
10996 decode it.  The effect is as if you specified `no-conversion' for
10997 reading that text.
10998
10999 Set this to non-nil when a regular text happens to include null bytes.
11000 Examples are Index nodes of Info files and null-byte delimited output
11001 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11002 decode text as usual.  */);
11003   inhibit_null_byte_detection = 0;
11004
11005   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11006                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11007 Internal use only.  Removed after the experimental optimizer gets stable. */);
11008   disable_ascii_optimization = 0;
11009
11010   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11011                doc: /* Char table for translating self-inserting characters.
11012 This is applied to the result of input methods, not their input.
11013 See also `keyboard-translate-table'.
11014
11015 Use of this variable for character code unification was rendered
11016 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11017 internal character representation.  */);
11018     Vtranslation_table_for_input = Qnil;
11019
11020   {
11021     Lisp_Object args[coding_arg_max];
11022     Lisp_Object plist[16];
11023     int i;
11024
11025     for (i = 0; i < coding_arg_max; i++)
11026       args[i] = Qnil;
11027
        /* Define `no-conversion'.  ARGS holds the arguments passed to
           Fdefine_coding_system_internal and PLIST the eight
           key/value pairs of the property list; most values are
           stored into both at once.  */
11028     plist[0] = intern_c_string (":name");
11029     plist[1] = args[coding_arg_name] = Qno_conversion;
11030     plist[2] = intern_c_string (":mnemonic");
11031     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11032     plist[4] = intern_c_string (":coding-type");
11033     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11034     plist[6] = intern_c_string (":ascii-compatible-p");
11035     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11036     plist[8] = intern_c_string (":default-char");
11037     plist[9] = args[coding_arg_default_char] = make_number (0);
11038     plist[10] = intern_c_string (":for-unibyte");
11039     plist[11] = args[coding_arg_for_unibyte] = Qt;
11040     plist[12] = intern_c_string (":docstring");
11041     plist[13] = build_pure_c_string ("Do no conversion.\n\
11042 \n\
11043 When you visit a file with this coding, the file is read into a\n\
11044 unibyte buffer as is, thus each byte of a file is treated as a\n\
11045 character.");
11046     plist[14] = intern_c_string (":eol-type");
11047     plist[15] = args[coding_arg_eol_type] = Qunix;
11048     args[coding_arg_plist] = Flist (16, plist);
11049     Fdefine_coding_system_internal (coding_arg_max, args);
11050
        /* Define `undecided' by overwriting only the slots that
           differ.  PLIST slots 8 and 9 switch from :default-char to
           :charset-list; args[coding_arg_default_char] is not reset
           here and keeps the value assigned above.  */
11051     plist[1] = args[coding_arg_name] = Qundecided;
11052     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11053     plist[5] = args[coding_arg_coding_type] = Qundecided;
11054     /* This is already set.
11055        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11056     plist[8] = intern_c_string (":charset-list");
11057     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11058     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11059     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11060     plist[15] = args[coding_arg_eol_type] = Qnil;
11061     args[coding_arg_plist] = Flist (16, plist);
11062     Fdefine_coding_system_internal (coding_arg_max, args);
11063   }
11064
      /* safe_terminal_coding is set up from `no-conversion', which
         was defined just above.  */
11065   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11066
11067   {
11068     int i;
11069
        /* Give every coding-category symbol the initial value
           `no-conversion'.  */
11070     for (i = 0; i < coding_category_max; i++)
11071       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11072   }
      /* system_eol_type is `dos' when DOS_NT is defined and `unix'
         otherwise.  */
11073 #if defined (DOS_NT)
11074   system_eol_type = Qdos;
11075 #else
11076   system_eol_type = Qunix;
11077 #endif
11078   staticpro (&system_eol_type);
11079 }
11080
11081 char *
11082 emacs_strerror (int error_number)
11083 {
11084   char *str;
11085
11086   synchronize_system_messages_locale ();
11087   str = strerror (error_number);
11088
11089   if (! NILP (Vlocale_coding_system))
11090     {
11091       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11092                                                       Vlocale_coding_system,
11093                                                       0);
11094       str = SSDATA (dec);
11095     }
11096
11097   return str;
11098 }
11099
11100 #endif /* emacs */