src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;
 301
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 static Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qutf_8;
 310 static Lisp_Object Qiso_2022;
 311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 312 static Lisp_Object Qbig, Qlittle;
 313 static Lisp_Object Qcoding_system_history;
 314 static Lisp_Object Qvalid_codes;
 315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 318 static Lisp_Object QCascii_compatible_p;
 319
 320 Lisp_Object Qcall_process, Qcall_process_region;
 321 Lisp_Object Qstart_process, Qopen_network_stream;
 322 static Lisp_Object Qtarget_idx;
 323
 324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 326
 327 /* If a symbol has this property, evaluate the value to define the
 328    symbol as a coding system.  */
 329 static Lisp_Object Qcoding_system_define_form;
 330
 331 /* Format of end-of-line decided by system.  This is Qunix on
 332    Unix and Mac, Qdos on DOS/Windows.
 333    This has an effect only for external encoding (i.e. for output to
 334    file and process), not for in-buffer or Lisp string encoding.  */
 335 static Lisp_Object system_eol_type;
 336
 337 #ifdef emacs
 338
 339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 340
 341 /* Coding system emacs-mule and raw-text are for converting only
 342    end-of-line format.  */
 343 Lisp_Object Qemacs_mule, Qraw_text;
 344 Lisp_Object Qutf_8_emacs;
 345
 346 #if defined (WINDOWSNT) || defined (CYGWIN)
 347 static Lisp_Object Qutf_16le;
 348 #endif
 349
 350 /* Coding-systems are handed between Emacs Lisp programs and C internal
 351    routines by the following three variables.  */
 352 /* Coding system to be used to encode text for terminal display when
 353    terminal coding system is nil.  */
 354 struct coding_system safe_terminal_coding;
 355
 356 #endif /* emacs */
 357
 358 Lisp_Object Qtranslation_table;
 359 Lisp_Object Qtranslation_table_id;
 360 static Lisp_Object Qtranslation_table_for_decode;
 361 static Lisp_Object Qtranslation_table_for_encode;
 362
 363 /* Two special coding systems.  */
 364 static Lisp_Object Vsjis_coding_system;
 365 static Lisp_Object Vbig5_coding_system;
 366
 367 /* ISO2022 section */
 368
 369 #define CODING_ISO_INITIAL(coding, reg)                 \
 370   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 371                      coding_attr_iso_initial),          \
 372                reg)))
 373
 374
 375 #define CODING_ISO_REQUEST(coding, charset_id)          \
 376   (((charset_id) <= (coding)->max_charset_id            \
 377     ? ((coding)->safe_charsets[charset_id] != 255       \
 378        ? (coding)->safe_charsets[charset_id]            \
 379        : -1)                                            \
 380     : -1))
 381
 382
 383 #define CODING_ISO_FLAGS(coding)        \
 384   ((coding)->spec.iso_2022.flags)
 385 #define CODING_ISO_DESIGNATION(coding, reg)     \
 386   ((coding)->spec.iso_2022.current_designation[reg])
 387 #define CODING_ISO_INVOCATION(coding, plane)    \
 388   ((coding)->spec.iso_2022.current_invocation[plane])
 389 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 390   ((coding)->spec.iso_2022.single_shifting)
 391 #define CODING_ISO_BOL(coding)  \
 392   ((coding)->spec.iso_2022.bol)
 393 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 394   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 395 #define CODING_ISO_CMP_STATUS(coding)   \
 396   (&(coding)->spec.iso_2022.cmp_status)
 397 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 398   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 399 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 400   ((coding)->spec.iso_2022.embedded_utf_8)
 401
 402 /* Control characters of ISO2022.  */
 403                         /* code */      /* function */
 404 #define ISO_CODE_SO     0x0E            /* shift-out */
 405 #define ISO_CODE_SI     0x0F            /* shift-in */
 406 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 407 #define ISO_CODE_ESC    0x1B            /* escape */
 408 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 409 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 410 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 411
 412 /* All code (1-byte) of ISO2022 is classified into one of the
 413    followings.  */
 414 enum iso_code_class_type
 415   {
 416     ISO_control_0,              /* Control codes in the range
 417                                    0x00..0x1F and 0x7F, except for the
 418                                    following 5 codes.  */
 419     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 420     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 421     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 422     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 423     ISO_control_1,              /* Control codes in the range
 424                                    0x80..0x9F, except for the
 425                                    following 3 codes.  */
 426     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 427     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 428     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 429     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 430     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 431     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 432     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 433   };
 434
 435 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 436     `iso-flags' attribute of an iso2022 coding system.  */
 437
 438 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 439    instead of the correct short-form sequence (e.g. ESC $ A).  */
 440 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 441
 442 /* If set, reset graphic planes and registers at end-of-line to the
 443    initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 445
 446 /* If set, reset graphic planes and registers before any control
 447    characters to the initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 449
 450 /* If set, encode by 7-bit environment.  */
 451 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 452
 453 /* If set, use locking-shift function.  */
 454 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 455
 456 /* If set, use single-shift function.  Overwrite
 457    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 458 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 459
 460 /* If set, use designation escape sequence.  */
 461 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 462
 463 /* If set, produce revision number sequence.  */
 464 #define CODING_ISO_FLAG_REVISION        0x0080
 465
 466 /* If set, produce ISO6429's direction specifying sequence.  */
 467 #define CODING_ISO_FLAG_DIRECTION       0x0100
 468
 469 /* If set, assume designation states are reset at beginning of line on
 470    output.  */
 471 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 472
 473 /* If set, designation sequence should be placed at beginning of line
 474    on output.  */
 475 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 476
 477 /* If set, do not encode unsafe characters on output.  */
 478 #define CODING_ISO_FLAG_SAFE            0x0800
 479
 480 /* If set, extra latin codes (128..159) are accepted as a valid code
 481    on input.  */
 482 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 483
 484 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 485
 486 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 487
 488 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 489
 490 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 491
 492 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 493
 494 /* A character to be produced on output if encoding of the original
 495    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 496 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 497
 498 /* UTF-8 section */
 499 #define CODING_UTF_8_BOM(coding)        \
 500   ((coding)->spec.utf_8_bom)
 501
 502 /* UTF-16 section */
 503 #define CODING_UTF_16_BOM(coding)       \
 504   ((coding)->spec.utf_16.bom)
 505
 506 #define CODING_UTF_16_ENDIAN(coding)    \
 507   ((coding)->spec.utf_16.endian)
 508
 509 #define CODING_UTF_16_SURROGATE(coding) \
 510   ((coding)->spec.utf_16.surrogate)
 511
 512
 513 /* CCL section */
 514 #define CODING_CCL_DECODER(coding)      \
 515   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 516 #define CODING_CCL_ENCODER(coding)      \
 517   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 518 #define CODING_CCL_VALIDS(coding)                                          \
 519   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 520
 521 /* Index for each coding category in `coding_categories' */
 522
 523 enum coding_category
 524   {
 525     coding_category_iso_7,
 526     coding_category_iso_7_tight,
 527     coding_category_iso_8_1,
 528     coding_category_iso_8_2,
 529     coding_category_iso_7_else,
 530     coding_category_iso_8_else,
 531     coding_category_utf_8_auto,
 532     coding_category_utf_8_nosig,
 533     coding_category_utf_8_sig,
 534     coding_category_utf_16_auto,
 535     coding_category_utf_16_be,
 536     coding_category_utf_16_le,
 537     coding_category_utf_16_be_nosig,
 538     coding_category_utf_16_le_nosig,
 539     coding_category_charset,
 540     coding_category_sjis,
 541     coding_category_big5,
 542     coding_category_ccl,
 543     coding_category_emacs_mule,
 544     /* All above are targets of code detection.  */
 545     coding_category_raw_text,
 546     coding_category_undecided,
 547     coding_category_max
 548   };
 549
 550 /* Definitions of flag bits used in detect_coding_XXXX.  */
 551 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 552 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 553 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 554 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 555 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 556 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 557 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 558 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 559 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 560 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 561 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 562 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 563 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 564 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 565 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 566 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 567 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 568 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 569 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 570 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 571
 572 /* This value is returned if detect_coding_mask () find nothing other
 573    than ASCII characters.  */
 574 #define CATEGORY_MASK_ANY               \
 575   (CATEGORY_MASK_ISO_7                  \
 576    | CATEGORY_MASK_ISO_7_TIGHT          \
 577    | CATEGORY_MASK_ISO_8_1              \
 578    | CATEGORY_MASK_ISO_8_2              \
 579    | CATEGORY_MASK_ISO_7_ELSE           \
 580    | CATEGORY_MASK_ISO_8_ELSE           \
 581    | CATEGORY_MASK_UTF_8_AUTO           \
 582    | CATEGORY_MASK_UTF_8_NOSIG          \
 583    | CATEGORY_MASK_UTF_8_SIG            \
 584    | CATEGORY_MASK_UTF_16_AUTO          \
 585    | CATEGORY_MASK_UTF_16_BE            \
 586    | CATEGORY_MASK_UTF_16_LE            \
 587    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 588    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 589    | CATEGORY_MASK_CHARSET              \
 590    | CATEGORY_MASK_SJIS                 \
 591    | CATEGORY_MASK_BIG5                 \
 592    | CATEGORY_MASK_CCL                  \
 593    | CATEGORY_MASK_EMACS_MULE)
 594
 595
 596 #define CATEGORY_MASK_ISO_7BIT \
 597   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 598
 599 #define CATEGORY_MASK_ISO_8BIT \
 600   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 601
 602 #define CATEGORY_MASK_ISO_ELSE \
 603   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 604
 605 #define CATEGORY_MASK_ISO_ESCAPE        \
 606   (CATEGORY_MASK_ISO_7                  \
 607    | CATEGORY_MASK_ISO_7_TIGHT          \
 608    | CATEGORY_MASK_ISO_7_ELSE           \
 609    | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO       \
 612   (  CATEGORY_MASK_ISO_7BIT     \
 613      | CATEGORY_MASK_ISO_8BIT   \
 614      | CATEGORY_MASK_ISO_ELSE)
 615
 616 #define CATEGORY_MASK_UTF_16            \
 617   (CATEGORY_MASK_UTF_16_AUTO            \
 618    | CATEGORY_MASK_UTF_16_BE            \
 619    | CATEGORY_MASK_UTF_16_LE            \
 620    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 621    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 622
 623 #define CATEGORY_MASK_UTF_8     \
 624   (CATEGORY_MASK_UTF_8_AUTO     \
 625    | CATEGORY_MASK_UTF_8_NOSIG  \
 626    | CATEGORY_MASK_UTF_8_SIG)
 627
 628 /* Table of coding categories (Lisp symbols).  This variable is for
 629    internal use only.  */
 630 static Lisp_Object Vcoding_category_table;
 631
 632 /* Table of coding-categories ordered by priority.  */
 633 static enum coding_category coding_priorities[coding_category_max];
 634
 635 /* Nth element is a coding context for the coding system bound to the
 636    Nth coding category.  */
 637 static struct coding_system coding_categories[coding_category_max];
 638
 639 /*** Commonly used macros and functions ***/
 640
 641 #ifndef min
 642 #define min(a, b) ((a) < (b) ? (a) : (b))
 643 #endif
 644 #ifndef max
 645 #define max(a, b) ((a) > (b) ? (a) : (b))
 646 #endif
 647
 648 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 649   do {                                                  \
 650     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 651     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 652   } while (0)
 653
 654
 655 /* Safely get one byte from the source text pointed by SRC which ends
 656    at SRC_END, and set C to that byte.  If there are not enough bytes
 657    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 658    and a multibyte character is found at SRC, set C to the
 659    negative value of the character code.  The caller should declare
 660    and set these variables appropriately in advance:
 661         src, src_end, multibytep */
 662
 663 #define ONE_MORE_BYTE(c)                                \
 664   do {                                                  \
 665     if (src == src_end)                                 \
 666       {                                                 \
 667         if (src_base < src)                             \
 668           record_conversion_result                      \
 669             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 670         goto no_more_source;                            \
 671       }                                                 \
 672     c = *src++;                                         \
 673     if (multibytep && (c & 0x80))                       \
 674       {                                                 \
 675         if ((c & 0xFE) == 0xC0)                         \
 676           c = ((c & 1) << 6) | *src++;                  \
 677         else                                            \
 678           {                                             \
 679             src--;                                      \
 680             c = - string_char (src, &src, NULL);        \
 681             record_conversion_result                    \
 682               (coding, CODING_RESULT_INVALID_SRC);      \
 683           }                                             \
 684       }                                                 \
 685     consumed_chars++;                                   \
 686   } while (0)
 687
 688 /* Safely get two bytes from the source text pointed by SRC which ends
 689    at SRC_END, and set C1 and C2 to those bytes while skipping the
 690    heading multibyte characters.  If there are not enough bytes in the
 691    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 692    a multibyte character is found for C2, set C2 to the negative value
 693    of the character code.  The caller should declare and set these
 694    variables appropriately in advance:
 695         src, src_end, multibytep
 696    It is intended that this macro is used in detect_coding_utf_16.  */
 697
 698 #define TWO_MORE_BYTES(c1, c2)                          \
 699   do {                                                  \
 700     do {                                                \
 701       if (src == src_end)                               \
 702         goto no_more_source;                            \
 703       c1 = *src++;                                      \
 704       if (multibytep && (c1 & 0x80))                    \
 705         {                                               \
 706           if ((c1 & 0xFE) == 0xC0)                      \
 707             c1 = ((c1 & 1) << 6) | *src++;              \
 708           else                                          \
 709             {                                           \
 710               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 711               c1 = -1;                                  \
 712             }                                           \
 713         }                                               \
 714     } while (c1 < 0);                                   \
 715     if (src == src_end)                                 \
 716       goto no_more_source;                              \
 717     c2 = *src++;                                        \
 718     if (multibytep && (c2 & 0x80))                      \
 719       {                                                 \
 720         if ((c2 & 0xFE) == 0xC0)                        \
 721           c2 = ((c2 & 1) << 6) | *src++;                \
 722         else                                            \
 723           c2 = -1;                                      \
 724       }                                                 \
 725   } while (0)
 726
 727
 728 /* Store a byte C in the place pointed by DST and increment DST to the
 729    next free point, and increment PRODUCED_CHARS.  The caller should
 730    assure that C is 0..127, and declare and set the variable `dst'
 731    appropriately in advance.
 732 */
 733
 734
 735 #define EMIT_ONE_ASCII_BYTE(c)  \
 736   do {                          \
 737     produced_chars++;           \
 738     *dst++ = (c);               \
 739   } while (0)
 740
 741
 742 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 743
 744 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 745   do {                                  \
 746     produced_chars += 2;                \
 747     *dst++ = (c1), *dst++ = (c2);       \
 748   } while (0)
 749
 750
 751 /* Store a byte C in the place pointed by DST and increment DST to the
 752    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 753    store in an appropriate multibyte form.  The caller should
 754    declare and set the variables `dst' and `multibytep' appropriately
 755    in advance.  */
 756
 757 #define EMIT_ONE_BYTE(c)                \
 758   do {                                  \
 759     produced_chars++;                   \
 760     if (multibytep)                     \
 761       {                                 \
 762         unsigned ch = (c);              \
 763         if (ch >= 0x80)                 \
 764           ch = BYTE8_TO_CHAR (ch);      \
 765         CHAR_STRING_ADVANCE (ch, dst);  \
 766       }                                 \
 767     else                                \
 768       *dst++ = (c);                     \
 769   } while (0)
 770
 771
 772 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 773
 774 #define EMIT_TWO_BYTES(c1, c2)          \
 775   do {                                  \
 776     produced_chars += 2;                \
 777     if (multibytep)                     \
 778       {                                 \
 779         unsigned ch;                    \
 780                                         \
 781         ch = (c1);                      \
 782         if (ch >= 0x80)                 \
 783           ch = BYTE8_TO_CHAR (ch);      \
 784         CHAR_STRING_ADVANCE (ch, dst);  \
 785         ch = (c2);                      \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789       }                                 \
 790     else                                \
 791       {                                 \
 792         *dst++ = (c1);                  \
 793         *dst++ = (c2);                  \
 794       }                                 \
 795   } while (0)
 796
 797
 798 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 799   do {                                  \
 800     EMIT_ONE_BYTE (c1);                 \
 801     EMIT_TWO_BYTES (c2, c3);            \
 802   } while (0)
 803
 804
 805 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 806   do {                                          \
 807     EMIT_TWO_BYTES (c1, c2);                    \
 808     EMIT_TWO_BYTES (c3, c4);                    \
 809   } while (0)
 810
 811
 812 static void
 813 record_conversion_result (struct coding_system *coding,
 814                           enum coding_result_code result)
 815 {
 816   coding->result = result;
 817   switch (result)
 818     {
 819     case CODING_RESULT_INSUFFICIENT_SRC:
 820       Vlast_code_conversion_error = Qinsufficient_source;
 821       break;
 822     case CODING_RESULT_INCONSISTENT_EOL:
 823       Vlast_code_conversion_error = Qinconsistent_eol;
 824       break;
 825     case CODING_RESULT_INVALID_SRC:
 826       Vlast_code_conversion_error = Qinvalid_source;
 827       break;
 828     case CODING_RESULT_INTERRUPT:
 829       Vlast_code_conversion_error = Qinterrupted;
 830       break;
 831     case CODING_RESULT_INSUFFICIENT_MEM:
 832       Vlast_code_conversion_error = Qinsufficient_memory;
 833       break;
 834     case CODING_RESULT_INSUFFICIENT_DST:
 835       /* Don't record this error in Vlast_code_conversion_error
 836          because it happens just temporarily and is resolved when the
 837          whole conversion is finished.  */
 838       break;
 839     case CODING_RESULT_SUCCESS:
 840       break;
 841     default:
 842       Vlast_code_conversion_error = intern ("Unknown error");
 843     }
 844 }
 845
 846 /* These wrapper macros are used to preserve validity of pointers into
 847    buffer text across calls to decode_char, encode_char, etc, which
 848    could cause relocation of buffers if it loads a charset map,
 849    because loading a charset map allocates large structures.  */
 850
 851 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 852   do {                                                                       \
 853     ptrdiff_t offset;                                                        \
 854                                                                              \
 855     charset_map_loaded = 0;                                                  \
 856     c = DECODE_CHAR (charset, code);                                         \
 857     if (charset_map_loaded                                                   \
 858         && (offset = coding_change_source (coding)))                         \
 859       {                                                                      \
 860         src += offset;                                                       \
 861         src_base += offset;                                                  \
 862         src_end += offset;                                                   \
 863       }                                                                      \
 864   } while (0)
 865
 866 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     code = ENCODE_CHAR (charset, c);                                    \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     charset = char_charset (c, charset_list, code_return);              \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 895   do {                                                                  \
 896     ptrdiff_t offset;                                                   \
 897                                                                         \
 898     charset_map_loaded = 0;                                             \
 899     result = CHAR_CHARSET_P (c, charset);                               \
 900     if (charset_map_loaded                                              \
 901         && (offset = coding_change_destination (coding)))               \
 902       {                                                                 \
 903         dst += offset;                                                  \
 904         dst_end += offset;                                              \
 905       }                                                                 \
 906   } while (0)
 907
 908
 909 /* If there are at least BYTES length of room at dst, allocate memory
 910    for coding->destination and update dst and dst_end.  We don't have
 911    to take care of coding->source which will be relocated.  It is
 912    handled by calling coding_set_source in encode_coding.  */
 913
 914 #define ASSURE_DESTINATION(bytes)                               \
 915   do {                                                          \
 916     if (dst + (bytes) >= dst_end)                               \
 917       {                                                         \
 918         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 919                                                                 \
 920         dst = alloc_destination (coding, more_bytes, dst);      \
 921         dst_end = coding->destination + coding->dst_bytes;      \
 922       }                                                         \
 923   } while (0)
 924
 925
 926 /* Store multibyte form of the character C in P, and advance P to the
 927    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 928    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 929    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 930
 931 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 932
 933 /* Return the character code of character whose multibyte form is at
 934    P, and advance P to the end of the multibyte form.  This used to be
 935    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 936    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 937
 938 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 939
 940 /* Set coding->source from coding->src_object.  */
 941
 942 static void
 943 coding_set_source (struct coding_system *coding)
 944 {
 945   if (BUFFERP (coding->src_object))
 946     {
 947       struct buffer *buf = XBUFFER (coding->src_object);
 948
 949       if (coding->src_pos < 0)
 950         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 951       else
 952         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 953     }
 954   else if (STRINGP (coding->src_object))
 955     {
 956       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 957     }
 958   else
 959     {
 960       /* Otherwise, the source is C string and is never relocated
 961          automatically.  Thus we don't have to update anything.  */
 962     }
 963 }
 964
 965
 966 /* Set coding->source from coding->src_object, and return how many
 967    bytes coding->source was changed.  */
 968
 969 static ptrdiff_t
 970 coding_change_source (struct coding_system *coding)
 971 {
 972   const unsigned char *orig = coding->source;
 973   coding_set_source (coding);
 974   return coding->source - orig;
 975 }
 976
 977
 978 /* Set coding->destination from coding->dst_object.  */
 979
 980 static void
 981 coding_set_destination (struct coding_system *coding)
 982 {
 983   if (BUFFERP (coding->dst_object))
 984     {
 985       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 986         {
 987           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 988           coding->dst_bytes = (GAP_END_ADDR
 989                                - (coding->src_bytes - coding->consumed)
 990                                - coding->destination);
 991         }
 992       else
 993         {
 994           /* We are sure that coding->dst_pos_byte is before the gap
 995              of the buffer. */
 996           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 997                                  + coding->dst_pos_byte - BEG_BYTE);
 998           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 999                                - coding->destination);
1000         }
1001     }
1002   else
1003     {
1004       /* Otherwise, the destination is C string and is never relocated
1005          automatically.  Thus we don't have to update anything.  */
1006     }
1007 }
1008
1009
1010 /* Set coding->destination from coding->dst_object, and return how
1011    many bytes coding->destination was changed.  */
1012
1013 static ptrdiff_t
1014 coding_change_destination (struct coding_system *coding)
1015 {
1016   const unsigned char *orig = coding->destination;
1017   coding_set_destination (coding);
1018   return coding->destination - orig;
1019 }
1020
1021
1022 static void
1023 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1024 {
1025   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1026     string_overflow ();
1027   coding->destination = xrealloc (coding->destination,
1028                                   coding->dst_bytes + bytes);
1029   coding->dst_bytes += bytes;
1030 }
1031
1032 static void
1033 coding_alloc_by_making_gap (struct coding_system *coding,
1034                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1035 {
1036   if (EQ (coding->src_object, coding->dst_object))
1037     {
1038       /* The gap may contain the produced data at the head and not-yet
1039          consumed data at the tail.  To preserve those data, we at
1040          first make the gap size to zero, then increase the gap
1041          size.  */
1042       ptrdiff_t add = GAP_SIZE;
1043
1044       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1045       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1046       make_gap (bytes);
1047       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1048       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1049     }
1050   else
1051     {
1052       Lisp_Object this_buffer;
1053
1054       this_buffer = Fcurrent_buffer ();
1055       set_buffer_internal (XBUFFER (coding->dst_object));
1056       make_gap (bytes);
1057       set_buffer_internal (XBUFFER (this_buffer));
1058     }
1059 }
1060
1061
1062 static unsigned char *
1063 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1064                    unsigned char *dst)
1065 {
1066   ptrdiff_t offset = dst - coding->destination;
1067
1068   if (BUFFERP (coding->dst_object))
1069     {
1070       struct buffer *buf = XBUFFER (coding->dst_object);
1071
1072       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1073     }
1074   else
1075     coding_alloc_by_realloc (coding, nbytes);
1076   coding_set_destination (coding);
1077   dst = coding->destination + offset;
1078   return dst;
1079 }
1080
1081 /** Macros for annotations.  */
1082
1083 /* An annotation data is stored in the array coding->charbuf in this
1084    format:
1085      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1086    LENGTH is the number of elements in the annotation.
1087    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1088    NCHARS is the number of characters in the text annotated.
1089
1090    The format of the following elements depend on ANNOTATION_MASK.
1091
1092    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1093    follows:
1094      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1095
1096    NBYTES is the number of bytes specified in the header part of
1097    old-style emacs-mule encoding, or 0 for the other kind of
1098    composition.
1099
1100    METHOD is one of enum composition_method.
1101
1102    Optional COMPOSITION-COMPONENTS are characters and composition
1103    rules.
1104
1105    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1106    follows.
1107
1108    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1109    recover from an invalid annotation, and should be skipped by
1110    produce_annotation.  */
1111
1112 /* Maximum length of the header of annotation data.  */
1113 #define MAX_ANNOTATION_LENGTH 5
1114
1115 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1116   do {                                                  \
1117     *(buf)++ = -(len);                                  \
1118     *(buf)++ = (mask);                                  \
1119     *(buf)++ = (nchars);                                \
1120     coding->annotated = 1;                              \
1121   } while (0);
1122
1123 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1124   do {                                                                      \
1125     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1126     *buf++ = nbytes;                                                        \
1127     *buf++ = method;                                                        \
1128   } while (0)
1129
1130
1131 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1132   do {                                                                  \
1133     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1134     *buf++ = id;                                                        \
1135   } while (0)
1136
1137 \f
1138 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1139
1140
1141
1142 \f
1143 /*** 3. UTF-8 ***/
1144
1145 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1146    Return true if a text is encoded in UTF-8.  */
1147
1148 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1149 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1150 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1151 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1152 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1153 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1154
1155 #define UTF_8_BOM_1 0xEF
1156 #define UTF_8_BOM_2 0xBB
1157 #define UTF_8_BOM_3 0xBF
1158
1159 static bool
1160 detect_coding_utf_8 (struct coding_system *coding,
1161                      struct coding_detection_info *detect_info)
1162 {
1163   const unsigned char *src = coding->source, *src_base;
1164   const unsigned char *src_end = coding->source + coding->src_bytes;
1165   bool multibytep = coding->src_multibyte;
1166   ptrdiff_t consumed_chars = 0;
1167   bool bom_found = 0;
1168   bool found = 0;
1169
1170   detect_info->checked |= CATEGORY_MASK_UTF_8;
1171   /* A coding system of this category is always ASCII compatible.  */
1172   src += coding->head_ascii;
1173
1174   while (1)
1175     {
1176       int c, c1, c2, c3, c4;
1177
1178       src_base = src;
1179       ONE_MORE_BYTE (c);
1180       if (c < 0 || UTF_8_1_OCTET_P (c))
1181         continue;
1182       ONE_MORE_BYTE (c1);
1183       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1184         break;
1185       if (UTF_8_2_OCTET_LEADING_P (c))
1186         {
1187           found = 1;
1188           continue;
1189         }
1190       ONE_MORE_BYTE (c2);
1191       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1192         break;
1193       if (UTF_8_3_OCTET_LEADING_P (c))
1194         {
1195           found = 1;
1196           if (src_base == coding->source
1197               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1198             bom_found = 1;
1199           continue;
1200         }
1201       ONE_MORE_BYTE (c3);
1202       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1203         break;
1204       if (UTF_8_4_OCTET_LEADING_P (c))
1205         {
1206           found = 1;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c4);
1210       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1211         break;
1212       if (UTF_8_5_OCTET_LEADING_P (c))
1213         {
1214           found = 1;
1215           continue;
1216         }
1217       break;
1218     }
1219   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1220   return 0;
1221
1222  no_more_source:
1223   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1224     {
1225       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1226       return 0;
1227     }
1228   if (bom_found)
1229     {
1230       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1231       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1232     }
1233   else
1234     {
1235       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1236       if (found)
1237         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1238     }
1239   return 1;
1240 }
1241
1242
1243 static void
1244 decode_coding_utf_8 (struct coding_system *coding)
1245 {
1246   const unsigned char *src = coding->source + coding->consumed;
1247   const unsigned char *src_end = coding->source + coding->src_bytes;
1248   const unsigned char *src_base;
1249   int *charbuf = coding->charbuf + coding->charbuf_used;
1250   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1251   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1252   bool multibytep = coding->src_multibyte;
1253   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1254   bool eol_dos
1255     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1256   int byte_after_cr = -1;
1257
1258   if (bom != utf_without_bom)
1259     {
1260       int c1, c2, c3;
1261
1262       src_base = src;
1263       ONE_MORE_BYTE (c1);
1264       if (! UTF_8_3_OCTET_LEADING_P (c1))
1265         src = src_base;
1266       else
1267         {
1268           ONE_MORE_BYTE (c2);
1269           if (! UTF_8_EXTRA_OCTET_P (c2))
1270             src = src_base;
1271           else
1272             {
1273               ONE_MORE_BYTE (c3);
1274               if (! UTF_8_EXTRA_OCTET_P (c3))
1275                 src = src_base;
1276               else
1277                 {
1278                   if ((c1 != UTF_8_BOM_1)
1279                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1280                     src = src_base;
1281                   else
1282                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1283                 }
1284             }
1285         }
1286     }
1287   CODING_UTF_8_BOM (coding) = utf_without_bom;
1288
1289   while (1)
1290     {
1291       int c, c1, c2, c3, c4, c5;
1292
1293       src_base = src;
1294       consumed_chars_base = consumed_chars;
1295
1296       if (charbuf >= charbuf_end)
1297         {
1298           if (byte_after_cr >= 0)
1299             src_base--;
1300           break;
1301         }
1302
1303       if (byte_after_cr >= 0)
1304         c1 = byte_after_cr, byte_after_cr = -1;
1305       else
1306         ONE_MORE_BYTE (c1);
1307       if (c1 < 0)
1308         {
1309           c = - c1;
1310         }
1311       else if (UTF_8_1_OCTET_P (c1))
1312         {
1313           if (eol_dos && c1 == '\r')
1314             ONE_MORE_BYTE (byte_after_cr);
1315           c = c1;
1316         }
1317       else
1318         {
1319           ONE_MORE_BYTE (c2);
1320           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1321             goto invalid_code;
1322           if (UTF_8_2_OCTET_LEADING_P (c1))
1323             {
1324               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1325               /* Reject overlong sequences here and below.  Encoders
1326                  producing them are incorrect, they can be misleading,
1327                  and they mess up read/write invariance.  */
1328               if (c < 128)
1329                 goto invalid_code;
1330             }
1331           else
1332             {
1333               ONE_MORE_BYTE (c3);
1334               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1335                 goto invalid_code;
1336               if (UTF_8_3_OCTET_LEADING_P (c1))
1337                 {
1338                   c = (((c1 & 0xF) << 12)
1339                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1340                   if (c < 0x800
1341                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1342                     goto invalid_code;
1343                 }
1344               else
1345                 {
1346                   ONE_MORE_BYTE (c4);
1347                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1348                     goto invalid_code;
1349                   if (UTF_8_4_OCTET_LEADING_P (c1))
1350                     {
1351                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1352                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1353                     if (c < 0x10000)
1354                       goto invalid_code;
1355                     }
1356                   else
1357                     {
1358                       ONE_MORE_BYTE (c5);
1359                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1360                         goto invalid_code;
1361                       if (UTF_8_5_OCTET_LEADING_P (c1))
1362                         {
1363                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1364                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1365                                | (c5 & 0x3F));
1366                           if ((c > MAX_CHAR) || (c < 0x200000))
1367                             goto invalid_code;
1368                         }
1369                       else
1370                         goto invalid_code;
1371                     }
1372                 }
1373             }
1374         }
1375
1376       *charbuf++ = c;
1377       continue;
1378
1379     invalid_code:
1380       src = src_base;
1381       consumed_chars = consumed_chars_base;
1382       ONE_MORE_BYTE (c);
1383       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1384       coding->errors++;
1385     }
1386
1387  no_more_source:
1388   coding->consumed_char += consumed_chars_base;
1389   coding->consumed = src_base - coding->source;
1390   coding->charbuf_used = charbuf - coding->charbuf;
1391 }
1392
1393
1394 static bool
1395 encode_coding_utf_8 (struct coding_system *coding)
1396 {
1397   bool multibytep = coding->dst_multibyte;
1398   int *charbuf = coding->charbuf;
1399   int *charbuf_end = charbuf + coding->charbuf_used;
1400   unsigned char *dst = coding->destination + coding->produced;
1401   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1402   ptrdiff_t produced_chars = 0;
1403   int c;
1404
1405   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1406     {
1407       ASSURE_DESTINATION (3);
1408       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1409       CODING_UTF_8_BOM (coding) = utf_without_bom;
1410     }
1411
1412   if (multibytep)
1413     {
1414       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1415
1416       while (charbuf < charbuf_end)
1417         {
1418           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1419
1420           ASSURE_DESTINATION (safe_room);
1421           c = *charbuf++;
1422           if (CHAR_BYTE8_P (c))
1423             {
1424               c = CHAR_TO_BYTE8 (c);
1425               EMIT_ONE_BYTE (c);
1426             }
1427           else
1428             {
1429               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1430               for (p = str; p < pend; p++)
1431                 EMIT_ONE_BYTE (*p);
1432             }
1433         }
1434     }
1435   else
1436     {
1437       int safe_room = MAX_MULTIBYTE_LENGTH;
1438
1439       while (charbuf < charbuf_end)
1440         {
1441           ASSURE_DESTINATION (safe_room);
1442           c = *charbuf++;
1443           if (CHAR_BYTE8_P (c))
1444             *dst++ = CHAR_TO_BYTE8 (c);
1445           else
1446             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1447           produced_chars++;
1448         }
1449     }
1450   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1451   coding->produced_char += produced_chars;
1452   coding->produced = dst - coding->destination;
1453   return 0;
1454 }
1455
1456
1457 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1458    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1459
1460 #define UTF_16_HIGH_SURROGATE_P(val) \
1461   (((val) & 0xFC00) == 0xD800)
1462
1463 #define UTF_16_LOW_SURROGATE_P(val) \
1464   (((val) & 0xFC00) == 0xDC00)
1465
1466
1467 static bool
1468 detect_coding_utf_16 (struct coding_system *coding,
1469                       struct coding_detection_info *detect_info)
1470 {
1471   const unsigned char *src = coding->source;
1472   const unsigned char *src_end = coding->source + coding->src_bytes;
1473   bool multibytep = coding->src_multibyte;
1474   int c1, c2;
1475
1476   detect_info->checked |= CATEGORY_MASK_UTF_16;
1477   if (coding->mode & CODING_MODE_LAST_BLOCK
1478       && (coding->src_chars & 1))
1479     {
1480       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1481       return 0;
1482     }
1483
1484   TWO_MORE_BYTES (c1, c2);
1485   if ((c1 == 0xFF) && (c2 == 0xFE))
1486     {
1487       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1488                              | CATEGORY_MASK_UTF_16_AUTO);
1489       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1490                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1491                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1492     }
1493   else if ((c1 == 0xFE) && (c2 == 0xFF))
1494     {
1495       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1496                              | CATEGORY_MASK_UTF_16_AUTO);
1497       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1498                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1499                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1500     }
1501   else if (c2 < 0)
1502     {
1503       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1504       return 0;
1505     }
1506   else
1507     {
1508       /* We check the dispersion of Eth and Oth bytes where E is even and
1509          O is odd.  If both are high, we assume binary data.*/
1510       unsigned char e[256], o[256];
1511       unsigned e_num = 1, o_num = 1;
1512
1513       memset (e, 0, 256);
1514       memset (o, 0, 256);
1515       e[c1] = 1;
1516       o[c2] = 1;
1517
1518       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1519                                 |CATEGORY_MASK_UTF_16_BE
1520                                 | CATEGORY_MASK_UTF_16_LE);
1521
1522       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1523              != CATEGORY_MASK_UTF_16)
1524         {
1525           TWO_MORE_BYTES (c1, c2);
1526           if (c2 < 0)
1527             break;
1528           if (! e[c1])
1529             {
1530               e[c1] = 1;
1531               e_num++;
1532               if (e_num >= 128)
1533                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1534             }
1535           if (! o[c2])
1536             {
1537               o[c2] = 1;
1538               o_num++;
1539               if (o_num >= 128)
1540                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1541             }
1542         }
1543       return 0;
1544     }
1545
1546  no_more_source:
1547   return 1;
1548 }
1549
1550 static void
1551 decode_coding_utf_16 (struct coding_system *coding)
1552 {
1553   const unsigned char *src = coding->source + coding->consumed;
1554   const unsigned char *src_end = coding->source + coding->src_bytes;
1555   const unsigned char *src_base;
1556   int *charbuf = coding->charbuf + coding->charbuf_used;
1557   /* We may produces at most 3 chars in one loop.  */
1558   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1559   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1560   bool multibytep = coding->src_multibyte;
1561   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1562   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1563   int surrogate = CODING_UTF_16_SURROGATE (coding);
1564   bool eol_dos
1565     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1566   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1567
1568   if (bom == utf_with_bom)
1569     {
1570       int c, c1, c2;
1571
1572       src_base = src;
1573       ONE_MORE_BYTE (c1);
1574       ONE_MORE_BYTE (c2);
1575       c = (c1 << 8) | c2;
1576
1577       if (endian == utf_16_big_endian
1578           ? c != 0xFEFF : c != 0xFFFE)
1579         {
1580           /* The first two bytes are not BOM.  Treat them as bytes
1581              for a normal character.  */
1582           src = src_base;
1583           coding->errors++;
1584         }
1585       CODING_UTF_16_BOM (coding) = utf_without_bom;
1586     }
1587   else if (bom == utf_detect_bom)
1588     {
1589       /* We have already tried to detect BOM and failed in
1590          detect_coding.  */
1591       CODING_UTF_16_BOM (coding) = utf_without_bom;
1592     }
1593
1594   while (1)
1595     {
1596       int c, c1, c2;
1597
1598       src_base = src;
1599       consumed_chars_base = consumed_chars;
1600
1601       if (charbuf >= charbuf_end)
1602         {
1603           if (byte_after_cr1 >= 0)
1604             src_base -= 2;
1605           break;
1606         }
1607
1608       if (byte_after_cr1 >= 0)
1609         c1 = byte_after_cr1, byte_after_cr1 = -1;
1610       else
1611         ONE_MORE_BYTE (c1);
1612       if (c1 < 0)
1613         {
1614           *charbuf++ = -c1;
1615           continue;
1616         }
1617       if (byte_after_cr2 >= 0)
1618         c2 = byte_after_cr2, byte_after_cr2 = -1;
1619       else
1620         ONE_MORE_BYTE (c2);
1621       if (c2 < 0)
1622         {
1623           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1624           *charbuf++ = -c2;
1625           continue;
1626         }
1627       c = (endian == utf_16_big_endian
1628            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1629
1630       if (surrogate)
1631         {
1632           if (! UTF_16_LOW_SURROGATE_P (c))
1633             {
1634               if (endian == utf_16_big_endian)
1635                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1636               else
1637                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1638               *charbuf++ = c1;
1639               *charbuf++ = c2;
1640               coding->errors++;
1641               if (UTF_16_HIGH_SURROGATE_P (c))
1642                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1643               else
1644                 *charbuf++ = c;
1645             }
1646           else
1647             {
1648               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1649               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1650               *charbuf++ = 0x10000 + c;
1651             }
1652         }
1653       else
1654         {
1655           if (UTF_16_HIGH_SURROGATE_P (c))
1656             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1657           else
1658             {
1659               if (eol_dos && c == '\r')
1660                 {
1661                   ONE_MORE_BYTE (byte_after_cr1);
1662                   ONE_MORE_BYTE (byte_after_cr2);
1663                 }
1664               *charbuf++ = c;
1665             }
1666         }
1667     }
1668
1669  no_more_source:
1670   coding->consumed_char += consumed_chars_base;
1671   coding->consumed = src_base - coding->source;
1672   coding->charbuf_used = charbuf - coding->charbuf;
1673 }
1674
1675 static bool
1676 encode_coding_utf_16 (struct coding_system *coding)
1677 {
1678   bool multibytep = coding->dst_multibyte;
1679   int *charbuf = coding->charbuf;
1680   int *charbuf_end = charbuf + coding->charbuf_used;
1681   unsigned char *dst = coding->destination + coding->produced;
1682   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1683   int safe_room = 8;
1684   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1685   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1686   ptrdiff_t produced_chars = 0;
1687   int c;
1688
1689   if (bom != utf_without_bom)
1690     {
1691       ASSURE_DESTINATION (safe_room);
1692       if (big_endian)
1693         EMIT_TWO_BYTES (0xFE, 0xFF);
1694       else
1695         EMIT_TWO_BYTES (0xFF, 0xFE);
1696       CODING_UTF_16_BOM (coding) = utf_without_bom;
1697     }
1698
1699   while (charbuf < charbuf_end)
1700     {
1701       ASSURE_DESTINATION (safe_room);
1702       c = *charbuf++;
1703       if (c > MAX_UNICODE_CHAR)
1704         c = coding->default_char;
1705
1706       if (c < 0x10000)
1707         {
1708           if (big_endian)
1709             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1710           else
1711             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1712         }
1713       else
1714         {
1715           int c1, c2;
1716
1717           c -= 0x10000;
1718           c1 = (c >> 10) + 0xD800;
1719           c2 = (c & 0x3FF) + 0xDC00;
1720           if (big_endian)
1721             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1722           else
1723             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1724         }
1725     }
1726   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1727   coding->produced = dst - coding->destination;
1728   coding->produced_char += produced_chars;
1729   return 0;
1730 }
1731
1732 \f
1733 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1734
1735 /* Emacs' internal format for representation of multiple character
1736    sets is a kind of multi-byte encoding, i.e. characters are
1737    represented by variable-length sequences of one-byte codes.
1738
1739    ASCII characters and control characters (e.g. `tab', `newline') are
1740    represented by one-byte sequences which are their ASCII codes, in
1741    the range 0x00 through 0x7F.
1742
1743    8-bit characters of the range 0x80..0x9F are represented by
1744    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1745    code + 0x20).
1746
1747    8-bit characters of the range 0xA0..0xFF are represented by
1748    one-byte sequences which are their 8-bit code.
1749
1750    The other characters are represented by a sequence of `base
1751    leading-code', optional `extended leading-code', and one or two
1752    `position-code's.  The length of the sequence is determined by the
1753    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1754    whereas extended leading-code and position-code take the range 0xA0
1755    through 0xFF.  See `charset.h' for more details about leading-code
1756    and position-code.
1757
1758    --- CODE RANGE of Emacs' internal format ---
1759    character set        range
1760    -------------        -----
1761    ascii                0x00..0x7F
1762    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1763    eight-bit-graphic    0xA0..0xBF
1764    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1765    ---------------------------------------------
1766
1767    As this is the internal character representation, the format is
1768    usually not used externally (i.e. in a file or in a data sent to a
1769    process).  But, it is possible to have a text externally in this
1770    format (i.e. by encoding by the coding system `emacs-mule').
1771
1772    In that case, a sequence of one-byte codes has a slightly different
1773    form.
1774
1775    At first, all characters in eight-bit-control are represented by
1776    one-byte sequences which are their 8-bit code.
1777
1778    Next, character composition data are represented by the byte
1779    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1780    where,
1781         METHOD is 0xF2 plus one of composition method (enum
1782         composition_method),
1783
1784         BYTES is 0xA0 plus a byte length of this composition data,
1785
1786         CHARS is 0xA0 plus a number of characters composed by this
1787         data,
1788
1789         COMPONENTs are characters of multibyte form or composition
1790         rules encoded by two-byte of ASCII codes.
1791
1792    In addition, for backward compatibility, the following formats are
1793    also recognized as composition data on decoding.
1794
1795    0x80 MSEQ ...
1796    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1797
1798    Here,
1799         MSEQ is a multibyte form but in these special format:
1800           ASCII: 0xA0 ASCII_CODE+0x80,
1801           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1802         RULE is a one byte code of the range 0xA0..0xF0 that
1803         represents a composition rule.
1804   */
1805
1806 char emacs_mule_bytes[256];
1807
1808
1809 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1810    Return true if a text is encoded in 'emacs-mule'.  */
1811
1812 static bool
1813 detect_coding_emacs_mule (struct coding_system *coding,
1814                           struct coding_detection_info *detect_info)
1815 {
1816   const unsigned char *src = coding->source, *src_base;
1817   const unsigned char *src_end = coding->source + coding->src_bytes;
1818   bool multibytep = coding->src_multibyte;
1819   ptrdiff_t consumed_chars = 0;
1820   int c;
1821   int found = 0;
1822
1823   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1824   /* A coding system of this category is always ASCII compatible.  */
1825   src += coding->head_ascii;
1826
1827   while (1)
1828     {
1829       src_base = src;
1830       ONE_MORE_BYTE (c);
1831       if (c < 0)
1832         continue;
1833       if (c == 0x80)
1834         {
1835           /* Perhaps the start of composite character.  We simply skip
1836              it because analyzing it is too heavy for detecting.  But,
1837              at least, we check that the composite character
1838              constitutes of more than 4 bytes.  */
1839           const unsigned char *src_start;
1840
1841         repeat:
1842           src_start = src;
1843           do
1844             {
1845               ONE_MORE_BYTE (c);
1846             }
1847           while (c >= 0xA0);
1848
1849           if (src - src_start <= 4)
1850             break;
1851           found = CATEGORY_MASK_EMACS_MULE;
1852           if (c == 0x80)
1853             goto repeat;
1854         }
1855
1856       if (c < 0x80)
1857         {
1858           if (c < 0x20
1859               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1860             break;
1861         }
1862       else
1863         {
1864           int more_bytes = emacs_mule_bytes[c] - 1;
1865
1866           while (more_bytes > 0)
1867             {
1868               ONE_MORE_BYTE (c);
1869               if (c < 0xA0)
1870                 {
1871                   src--;        /* Unread the last byte.  */
1872                   break;
1873                 }
1874               more_bytes--;
1875             }
1876           if (more_bytes != 0)
1877             break;
1878           found = CATEGORY_MASK_EMACS_MULE;
1879         }
1880     }
1881   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882   return 0;
1883
1884  no_more_source:
1885   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1886     {
1887       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1888       return 0;
1889     }
1890   detect_info->found |= found;
1891   return 1;
1892 }
1893
1894
1895 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1896    character.  If CMP_STATUS indicates that we must expect MSEQ or
1897    RULE described above, decode it and return the negative value of
1898    the decoded character or rule.  If an invalid byte is found, return
1899    -1.  If SRC is too short, return -2.  */
1900
1901 static int
1902 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1903                  int *nbytes, int *nchars, int *id,
1904                  struct composition_status *cmp_status)
1905 {
1906   const unsigned char *src_end = coding->source + coding->src_bytes;
1907   const unsigned char *src_base = src;
1908   bool multibytep = coding->src_multibyte;
1909   int charset_ID;
1910   unsigned code;
1911   int c;
1912   int consumed_chars = 0;
1913   bool mseq_found = 0;
1914
1915   ONE_MORE_BYTE (c);
1916   if (c < 0)
1917     {
1918       c = -c;
1919       charset_ID = emacs_mule_charset[0];
1920     }
1921   else
1922     {
1923       if (c >= 0xA0)
1924         {
1925           if (cmp_status->state != COMPOSING_NO
1926               && cmp_status->old_form)
1927             {
1928               if (cmp_status->state == COMPOSING_CHAR)
1929                 {
1930                   if (c == 0xA0)
1931                     {
1932                       ONE_MORE_BYTE (c);
1933                       c -= 0x80;
1934                       if (c < 0)
1935                         goto invalid_code;
1936                     }
1937                   else
1938                     c -= 0x20;
1939                   mseq_found = 1;
1940                 }
1941               else
1942                 {
1943                   *nbytes = src - src_base;
1944                   *nchars = consumed_chars;
1945                   return -c;
1946                 }
1947             }
1948           else
1949             goto invalid_code;
1950         }
1951
1952       switch (emacs_mule_bytes[c])
1953         {
1954         case 2:
1955           if ((charset_ID = emacs_mule_charset[c]) < 0)
1956             goto invalid_code;
1957           ONE_MORE_BYTE (c);
1958           if (c < 0xA0)
1959             goto invalid_code;
1960           code = c & 0x7F;
1961           break;
1962
1963         case 3:
1964           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1965               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1966             {
1967               ONE_MORE_BYTE (c);
1968               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1969                 goto invalid_code;
1970               ONE_MORE_BYTE (c);
1971               if (c < 0xA0)
1972                 goto invalid_code;
1973               code = c & 0x7F;
1974             }
1975           else
1976             {
1977               if ((charset_ID = emacs_mule_charset[c]) < 0)
1978                 goto invalid_code;
1979               ONE_MORE_BYTE (c);
1980               if (c < 0xA0)
1981                 goto invalid_code;
1982               code = (c & 0x7F) << 8;
1983               ONE_MORE_BYTE (c);
1984               if (c < 0xA0)
1985                 goto invalid_code;
1986               code |= c & 0x7F;
1987             }
1988           break;
1989
1990         case 4:
1991           ONE_MORE_BYTE (c);
1992           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1993             goto invalid_code;
1994           ONE_MORE_BYTE (c);
1995           if (c < 0xA0)
1996             goto invalid_code;
1997           code = (c & 0x7F) << 8;
1998           ONE_MORE_BYTE (c);
1999           if (c < 0xA0)
2000             goto invalid_code;
2001           code |= c & 0x7F;
2002           break;
2003
2004         case 1:
2005           code = c;
2006           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2007           break;
2008
2009         default:
2010           emacs_abort ();
2011         }
2012       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2013                           CHARSET_FROM_ID (charset_ID), code, c);
2014       if (c < 0)
2015         goto invalid_code;
2016     }
2017   *nbytes = src - src_base;
2018   *nchars = consumed_chars;
2019   if (id)
2020     *id = charset_ID;
2021   return (mseq_found ? -c : c);
2022
2023  no_more_source:
2024   return -2;
2025
2026  invalid_code:
2027   return -1;
2028 }
2029
2030
2031 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2032
2033 /* Handle these composition sequence ('|': the end of header elements,
2034    BYTES and CHARS >= 0xA0):
2035
2036    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2037    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2038    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2039
2040    and these old form:
2041
2042    (4) relative composition: 0x80 | MSEQ ... MSEQ
2043    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2044
2045    When the starter 0x80 and the following header elements are found,
2046    this annotation header is produced.
2047
2048         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2049
2050    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2051    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2052
2053    Then, upon reading the following elements, these codes are produced
2054    until the composition end is found:
2055
2056    (1) CHAR ... CHAR
2057    (2) ALT ... ALT CHAR ... CHAR
2058    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2059    (4) CHAR ... CHAR
2060    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2061
2062    When the composition end is found, LENGTH and NCHARS in the
2063    annotation header is updated as below:
2064
2065    (1) LENGTH: unchanged, NCHARS: unchanged
2066    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2067    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2068    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2069    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2070
2071    If an error is found while composing, the annotation header is
2072    changed to the original composition header (plus filler -1s) as
2073    below:
2074
2075    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2076    (5)          [ 0x80 0xFF -1 -1- -1 ]
2077
2078    and the sequence [ -2 DECODED-RULE ] is changed to the original
2079    byte sequence as below:
2080         o the original byte sequence is B: [ B -1 ]
2081         o the original byte sequence is B1 B2: [ B1 B2 ]
2082
2083    Most of the routines are implemented by macros because many
2084    variables and labels in the caller decode_coding_emacs_mule must be
2085    accessible, and they are usually called just once (thus doesn't
2086    increase the size of compiled object).  */
2087
2088 /* Decode a composition rule represented by C as a component of
2089    composition sequence of Emacs 20 style.  Set RULE to the decoded
2090    rule. */
2091
2092 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2093   do {                                                  \
2094     int gref, nref;                                     \
2095                                                         \
2096     c -= 0xA0;                                          \
2097     if (c < 0 || c >= 81)                               \
2098       goto invalid_code;                                \
2099     gref = c / 9, nref = c % 9;                         \
2100     if (gref == 4) gref = 10;                           \
2101     if (nref == 4) nref = 10;                           \
2102     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2103   } while (0)
2104
2105
2106 /* Decode a composition rule represented by C and the following byte
2107    at SRC as a component of composition sequence of Emacs 21 style.
2108    Set RULE to the decoded rule.  */
2109
2110 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2111   do {                                                  \
2112     int gref, nref;                                     \
2113                                                         \
2114     gref = c - 0x20;                                    \
2115     if (gref < 0 || gref >= 81)                         \
2116       goto invalid_code;                                \
2117     ONE_MORE_BYTE (c);                                  \
2118     nref = c - 0x20;                                    \
2119     if (nref < 0 || nref >= 81)                         \
2120       goto invalid_code;                                \
2121     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2122   } while (0)
2123
2124
2125 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2126    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2127    byte length of this composition information, CHARS is the number of
2128    characters composed by this composition.  */
2129
2130 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2131   do {                                                                  \
2132     enum composition_method method = c - 0xF2;                          \
2133     int nbytes, nchars;                                                 \
2134                                                                         \
2135     ONE_MORE_BYTE (c);                                                  \
2136     if (c < 0)                                                          \
2137       goto invalid_code;                                                \
2138     nbytes = c - 0xA0;                                                  \
2139     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2140       goto invalid_code;                                                \
2141     ONE_MORE_BYTE (c);                                                  \
2142     nchars = c - 0xA0;                                                  \
2143     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2144       goto invalid_code;                                                \
2145     cmp_status->old_form = 0;                                           \
2146     cmp_status->method = method;                                        \
2147     if (method == COMPOSITION_RELATIVE)                                 \
2148       cmp_status->state = COMPOSING_CHAR;                               \
2149     else                                                                \
2150       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2151     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2152     cmp_status->nchars = nchars;                                        \
2153     cmp_status->ncomps = nbytes - 4;                                    \
2154     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2155   } while (0)
2156
2157
2158 /* Start of Emacs 20 style format for relative composition.  */
2159
2160 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2161   do {                                                          \
2162     cmp_status->old_form = 1;                                   \
2163     cmp_status->method = COMPOSITION_RELATIVE;                  \
2164     cmp_status->state = COMPOSING_CHAR;                         \
2165     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2166     cmp_status->nchars = cmp_status->ncomps = 0;                \
2167     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2168   } while (0)
2169
2170
2171 /* Start of Emacs 20 style format for rule-base composition.  */
2172
2173 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2174   do {                                                          \
2175     cmp_status->old_form = 1;                                   \
2176     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2177     cmp_status->state = COMPOSING_CHAR;                         \
2178     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2179     cmp_status->nchars = cmp_status->ncomps = 0;                \
2180     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2181   } while (0)
2182
2183
2184 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2185   do {                                                  \
2186     const unsigned char *current_src = src;             \
2187                                                         \
2188     ONE_MORE_BYTE (c);                                  \
2189     if (c < 0)                                          \
2190       goto invalid_code;                                \
2191     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2192         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2193       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2194     else if (c < 0xA0)                                  \
2195       goto invalid_code;                                \
2196     else if (c < 0xC0)                                  \
2197       {                                                 \
2198         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2199         /* Re-read C as a composition component.  */    \
2200         src = current_src;                              \
2201       }                                                 \
2202     else if (c == 0xFF)                                 \
2203       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2204     else                                                \
2205       goto invalid_code;                                \
2206   } while (0)
2207
2208 #define EMACS_MULE_COMPOSITION_END()                            \
2209   do {                                                          \
2210     int idx = - cmp_status->length;                             \
2211                                                                 \
2212     if (cmp_status->old_form)                                   \
2213       charbuf[idx + 2] = cmp_status->nchars;                    \
2214     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2215       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2216     cmp_status->state = COMPOSING_NO;                           \
2217   } while (0)
2218
2219
2220 static int
2221 emacs_mule_finish_composition (int *charbuf,
2222                                struct composition_status *cmp_status)
2223 {
2224   int idx = - cmp_status->length;
2225   int new_chars;
2226
2227   if (cmp_status->old_form && cmp_status->nchars > 0)
2228     {
2229       charbuf[idx + 2] = cmp_status->nchars;
2230       new_chars = 0;
2231       if (cmp_status->method == COMPOSITION_WITH_RULE
2232           && cmp_status->state == COMPOSING_CHAR)
2233         {
2234           /* The last rule was invalid.  */
2235           int rule = charbuf[-1] + 0xA0;
2236
2237           charbuf[-2] = BYTE8_TO_CHAR (rule);
2238           charbuf[-1] = -1;
2239           new_chars = 1;
2240         }
2241     }
2242   else
2243     {
2244       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2245
2246       if (cmp_status->method == COMPOSITION_WITH_RULE)
2247         {
2248           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2249           charbuf[idx++] = -3;
2250           charbuf[idx++] = 0;
2251           new_chars = 1;
2252         }
2253       else
2254         {
2255           int nchars = charbuf[idx + 1] + 0xA0;
2256           int nbytes = charbuf[idx + 2] + 0xA0;
2257
2258           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2259           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2260           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2261           charbuf[idx++] = -1;
2262           new_chars = 4;
2263         }
2264     }
2265   cmp_status->state = COMPOSING_NO;
2266   return new_chars;
2267 }
2268
2269 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2270   do {                                                                    \
2271     if (cmp_status->state != COMPOSING_NO)                                \
2272       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2273   } while (0)
2274
2275
2276 static void
2277 decode_coding_emacs_mule (struct coding_system *coding)
2278 {
2279   const unsigned char *src = coding->source + coding->consumed;
2280   const unsigned char *src_end = coding->source + coding->src_bytes;
2281   const unsigned char *src_base;
2282   int *charbuf = coding->charbuf + coding->charbuf_used;
2283   /* We may produce two annotations (charset and composition) in one
2284      loop and one more charset annotation at the end.  */
2285   int *charbuf_end
2286     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2287       /* We can produce up to 2 characters in a loop.  */
2288       - 1;
2289   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2290   bool multibytep = coding->src_multibyte;
2291   ptrdiff_t char_offset = coding->produced_char;
2292   ptrdiff_t last_offset = char_offset;
2293   int last_id = charset_ascii;
2294   bool eol_dos
2295     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2296   int byte_after_cr = -1;
2297   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2298
2299   if (cmp_status->state != COMPOSING_NO)
2300     {
2301       int i;
2302
2303       if (charbuf_end - charbuf < cmp_status->length)
2304         emacs_abort ();
2305       for (i = 0; i < cmp_status->length; i++)
2306         *charbuf++ = cmp_status->carryover[i];
2307       coding->annotated = 1;
2308     }
2309
2310   while (1)
2311     {
2312       int c, id IF_LINT (= 0);
2313
2314       src_base = src;
2315       consumed_chars_base = consumed_chars;
2316
2317       if (charbuf >= charbuf_end)
2318         {
2319           if (byte_after_cr >= 0)
2320             src_base--;
2321           break;
2322         }
2323
2324       if (byte_after_cr >= 0)
2325         c = byte_after_cr, byte_after_cr = -1;
2326       else
2327         ONE_MORE_BYTE (c);
2328
2329       if (c < 0 || c == 0x80)
2330         {
2331           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2332           if (c < 0)
2333             {
2334               *charbuf++ = -c;
2335               char_offset++;
2336             }
2337           else
2338             DECODE_EMACS_MULE_COMPOSITION_START ();
2339           continue;
2340         }
2341
2342       if (c < 0x80)
2343         {
2344           if (eol_dos && c == '\r')
2345             ONE_MORE_BYTE (byte_after_cr);
2346           id = charset_ascii;
2347           if (cmp_status->state != COMPOSING_NO)
2348             {
2349               if (cmp_status->old_form)
2350                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2351               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2352                 cmp_status->ncomps--;
2353             }
2354         }
2355       else
2356         {
2357           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2358           /* emacs_mule_char can load a charset map from a file, which
2359              allocates a large structure and might cause buffer text
2360              to be relocated as result.  Thus, we need to remember the
2361              original pointer to buffer text, and fix up all related
2362              pointers after the call.  */
2363           const unsigned char *orig = coding->source;
2364           ptrdiff_t offset;
2365
2366           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2367                                cmp_status);
2368           offset = coding->source - orig;
2369           if (offset)
2370             {
2371               src += offset;
2372               src_base += offset;
2373               src_end += offset;
2374             }
2375           if (c < 0)
2376             {
2377               if (c == -1)
2378                 goto invalid_code;
2379               if (c == -2)
2380                 break;
2381             }
2382           src = src_base + nbytes;
2383           consumed_chars = consumed_chars_base + nchars;
2384           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2385             cmp_status->ncomps -= nchars;
2386         }
2387
2388       /* Now if C >= 0, we found a normally encoded character, if C <
2389          0, we found an old-style composition component character or
2390          rule.  */
2391
2392       if (cmp_status->state == COMPOSING_NO)
2393         {
2394           if (last_id != id)
2395             {
2396               if (last_id != charset_ascii)
2397                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2398                                   last_id);
2399               last_id = id;
2400               last_offset = char_offset;
2401             }
2402           *charbuf++ = c;
2403           char_offset++;
2404         }
2405       else if (cmp_status->state == COMPOSING_CHAR)
2406         {
2407           if (cmp_status->old_form)
2408             {
2409               if (c >= 0)
2410                 {
2411                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2412                   *charbuf++ = c;
2413                   char_offset++;
2414                 }
2415               else
2416                 {
2417                   *charbuf++ = -c;
2418                   cmp_status->nchars++;
2419                   cmp_status->length++;
2420                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2421                     EMACS_MULE_COMPOSITION_END ();
2422                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2423                     cmp_status->state = COMPOSING_RULE;
2424                 }
2425             }
2426           else
2427             {
2428               *charbuf++ = c;
2429               cmp_status->length++;
2430               cmp_status->nchars--;
2431               if (cmp_status->nchars == 0)
2432                 EMACS_MULE_COMPOSITION_END ();
2433             }
2434         }
2435       else if (cmp_status->state == COMPOSING_RULE)
2436         {
2437           int rule;
2438
2439           if (c >= 0)
2440             {
2441               EMACS_MULE_COMPOSITION_END ();
2442               *charbuf++ = c;
2443               char_offset++;
2444             }
2445           else
2446             {
2447               c = -c;
2448               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2449               if (rule < 0)
2450                 goto invalid_code;
2451               *charbuf++ = -2;
2452               *charbuf++ = rule;
2453               cmp_status->length += 2;
2454               cmp_status->state = COMPOSING_CHAR;
2455             }
2456         }
2457       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2458         {
2459           *charbuf++ = c;
2460           cmp_status->length++;
2461           if (cmp_status->ncomps == 0)
2462             cmp_status->state = COMPOSING_CHAR;
2463           else if (cmp_status->ncomps > 0)
2464             {
2465               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2466                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2467             }
2468           else
2469             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2470         }
2471       else                      /* COMPOSING_COMPONENT_RULE */
2472         {
2473           int rule;
2474
2475           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2476           if (rule < 0)
2477             goto invalid_code;
2478           *charbuf++ = -2;
2479           *charbuf++ = rule;
2480           cmp_status->length += 2;
2481           cmp_status->ncomps--;
2482           if (cmp_status->ncomps > 0)
2483             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2484           else
2485             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2486         }
2487       continue;
2488
2489     invalid_code:
2490       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2491       src = src_base;
2492       consumed_chars = consumed_chars_base;
2493       ONE_MORE_BYTE (c);
2494       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2495       char_offset++;
2496       coding->errors++;
2497     }
2498
2499  no_more_source:
2500   if (cmp_status->state != COMPOSING_NO)
2501     {
2502       if (coding->mode & CODING_MODE_LAST_BLOCK)
2503         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2504       else
2505         {
2506           int i;
2507
2508           charbuf -= cmp_status->length;
2509           for (i = 0; i < cmp_status->length; i++)
2510             cmp_status->carryover[i] = charbuf[i];
2511         }
2512     }
2513   if (last_id != charset_ascii)
2514     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2515   coding->consumed_char += consumed_chars_base;
2516   coding->consumed = src_base - coding->source;
2517   coding->charbuf_used = charbuf - coding->charbuf;
2518 }
2519
2520
2521 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2522   do {                                          \
2523     if (id < 0xA0)                              \
2524       codes[0] = id, codes[1] = 0;              \
2525     else if (id < 0xE0)                         \
2526       codes[0] = 0x9A, codes[1] = id;           \
2527     else if (id < 0xF0)                         \
2528       codes[0] = 0x9B, codes[1] = id;           \
2529     else if (id < 0xF5)                         \
2530       codes[0] = 0x9C, codes[1] = id;           \
2531     else                                        \
2532       codes[0] = 0x9D, codes[1] = id;           \
2533   } while (0);
2534
2535
2536 static bool
2537 encode_coding_emacs_mule (struct coding_system *coding)
2538 {
2539   bool multibytep = coding->dst_multibyte;
2540   int *charbuf = coding->charbuf;
2541   int *charbuf_end = charbuf + coding->charbuf_used;
2542   unsigned char *dst = coding->destination + coding->produced;
2543   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2544   int safe_room = 8;
2545   ptrdiff_t produced_chars = 0;
2546   Lisp_Object attrs, charset_list;
2547   int c;
2548   int preferred_charset_id = -1;
2549
2550   CODING_GET_INFO (coding, attrs, charset_list);
2551   if (! EQ (charset_list, Vemacs_mule_charset_list))
2552     {
2553       charset_list = Vemacs_mule_charset_list;
2554       ASET (attrs, coding_attr_charset_list, charset_list);
2555     }
2556
2557   while (charbuf < charbuf_end)
2558     {
2559       ASSURE_DESTINATION (safe_room);
2560       c = *charbuf++;
2561
2562       if (c < 0)
2563         {
2564           /* Handle an annotation.  */
2565           switch (*charbuf)
2566             {
2567             case CODING_ANNOTATE_COMPOSITION_MASK:
2568               /* Not yet implemented.  */
2569               break;
2570             case CODING_ANNOTATE_CHARSET_MASK:
2571               preferred_charset_id = charbuf[3];
2572               if (preferred_charset_id >= 0
2573                   && NILP (Fmemq (make_number (preferred_charset_id),
2574                                   charset_list)))
2575                 preferred_charset_id = -1;
2576               break;
2577             default:
2578               emacs_abort ();
2579             }
2580           charbuf += -c - 1;
2581           continue;
2582         }
2583
2584       if (ASCII_CHAR_P (c))
2585         EMIT_ONE_ASCII_BYTE (c);
2586       else if (CHAR_BYTE8_P (c))
2587         {
2588           c = CHAR_TO_BYTE8 (c);
2589           EMIT_ONE_BYTE (c);
2590         }
2591       else
2592         {
2593           struct charset *charset;
2594           unsigned code;
2595           int dimension;
2596           int emacs_mule_id;
2597           unsigned char leading_codes[2];
2598
2599           if (preferred_charset_id >= 0)
2600             {
2601               bool result;
2602
2603               charset = CHARSET_FROM_ID (preferred_charset_id);
2604               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2605               if (result)
2606                 code = ENCODE_CHAR (charset, c);
2607               else
2608                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2609                                      &code, charset);
2610             }
2611           else
2612             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2613                                  &code, charset);
2614           if (! charset)
2615             {
2616               c = coding->default_char;
2617               if (ASCII_CHAR_P (c))
2618                 {
2619                   EMIT_ONE_ASCII_BYTE (c);
2620                   continue;
2621                 }
2622               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2623                                    &code, charset);
2624             }
2625           dimension = CHARSET_DIMENSION (charset);
2626           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2627           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2628           EMIT_ONE_BYTE (leading_codes[0]);
2629           if (leading_codes[1])
2630             EMIT_ONE_BYTE (leading_codes[1]);
2631           if (dimension == 1)
2632             EMIT_ONE_BYTE (code | 0x80);
2633           else
2634             {
2635               code |= 0x8080;
2636               EMIT_ONE_BYTE (code >> 8);
2637               EMIT_ONE_BYTE (code & 0xFF);
2638             }
2639         }
2640     }
2641   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2642   coding->produced_char += produced_chars;
2643   coding->produced = dst - coding->destination;
2644   return 0;
2645 }
2646
2647 \f
2648 /*** 7. ISO2022 handlers ***/
2649
2650 /* The following note describes the coding system ISO2022 briefly.
2651    Since the intention of this note is to help understand the
2652    functions in this file, some parts are NOT ACCURATE or are OVERLY
2653    SIMPLIFIED.  For thorough understanding, please refer to the
2654    original document of ISO2022.  This is equivalent to the standard
2655    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2656
2657    ISO2022 provides many mechanisms to encode several character sets
2658    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2659    is encoded using bytes less than 128.  This may make the encoded
2660    text a little bit longer, but the text passes more easily through
2661    several types of gateway, some of which strip off the MSB (Most
2662    Significant Bit).
2663
2664    There are two kinds of character sets: control character sets and
2665    graphic character sets.  The former contain control characters such
2666    as `newline' and `escape' to provide control functions (control
2667    functions are also provided by escape sequences).  The latter
2668    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2669    two control character sets and many graphic character sets.
2670
2671    Graphic character sets are classified into one of the following
2672    four classes, according to the number of bytes (DIMENSION) and
2673    number of characters in one dimension (CHARS) of the set:
2674    - DIMENSION1_CHARS94
2675    - DIMENSION1_CHARS96
2676    - DIMENSION2_CHARS94
2677    - DIMENSION2_CHARS96
2678
2679    In addition, each character set is assigned an identification tag,
2680    unique for each set, called the "final character" (denoted as <F>
2681    hereafter).  The <F> of each character set is decided by ECMA(*)
2682    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2683    (0x30..0x3F are for private use only).
2684
2685    Note (*): ECMA = European Computer Manufacturers Association
2686
2687    Here are examples of graphic character sets [NAME(<F>)]:
2688         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2689         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2690         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2691         o DIMENSION2_CHARS96 -- none for the moment
2692
2693    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2694         C0 [0x00..0x1F] -- control character plane 0
2695         GL [0x20..0x7F] -- graphic character plane 0
2696         C1 [0x80..0x9F] -- control character plane 1
2697         GR [0xA0..0xFF] -- graphic character plane 1
2698
2699    A control character set is directly designated and invoked to C0 or
2700    C1 by an escape sequence.  The most common case is that:
2701    - ISO646's  control character set is designated/invoked to C0, and
2702    - ISO6429's control character set is designated/invoked to C1,
2703    and usually these designations/invocations are omitted in encoded
2704    text.  In a 7-bit environment, only C0 can be used, and a control
2705    character for C1 is encoded by an appropriate escape sequence to
2706    fit into the environment.  All control characters for C1 are
2707    defined to have corresponding escape sequences.
2708
2709    A graphic character set is at first designated to one of four
2710    graphic registers (G0 through G3), then these graphic registers are
2711    invoked to GL or GR.  These designations and invocations can be
2712    done independently.  The most common case is that G0 is invoked to
2713    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2714    these invocations and designations are omitted in encoded text.
2715    In a 7-bit environment, only GL can be used.
2716
2717    When a graphic character set of CHARS94 is invoked to GL, codes
2718    0x20 and 0x7F of the GL area work as control characters SPACE and
2719    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2720    be used.
2721
2722    There are two ways of invocation: locking-shift and single-shift.
2723    With locking-shift, the invocation lasts until the next different
2724    invocation, whereas with single-shift, the invocation affects the
2725    following character only and doesn't affect the locking-shift
2726    state.  Invocations are done by the following control characters or
2727    escape sequences:
2728
2729    ----------------------------------------------------------------------
2730    abbrev  function                  cntrl escape seq   description
2731    ----------------------------------------------------------------------
2732    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2733    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2734    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2735    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2736    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2737    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2738    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2739    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2740    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2741    ----------------------------------------------------------------------
2742    (*) These are not used by any known coding system.
2743
2744    Control characters for these functions are defined by macros
2745    ISO_CODE_XXX in `coding.h'.
2746
2747    Designations are done by the following escape sequences:
2748    ----------------------------------------------------------------------
2749    escape sequence      description
2750    ----------------------------------------------------------------------
2751    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2752    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2753    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2754    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2755    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2756    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2757    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2758    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2759    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2760    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2761    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2762    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2763    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2764    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2765    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2766    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2767    ----------------------------------------------------------------------
2768
2769    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2770    of dimension 1, chars 94, and final character <F>, etc...
2771
2772    Note (*): Although these designations are not allowed in ISO2022,
2773    Emacs accepts them on decoding, and produces them on encoding
2774    CHARS96 character sets in a coding system which is characterized as
2775    7-bit environment, non-locking-shift, and non-single-shift.
2776
2777    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2778    '(' must be omitted.  We refer to this as "short-form" hereafter.
2779
2780    Now you may notice that there are a lot of ways of encoding the
2781    same multilingual text in ISO2022.  Actually, there exist many
2782    coding systems such as Compound Text (used in X11's inter client
2783    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2784    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2785    localized platforms), and all of these are variants of ISO2022.
2786
2787    In addition to the above, Emacs handles two more kinds of escape
2788    sequences: ISO6429's direction specification and Emacs' private
2789    sequence for specifying character composition.
2790
2791    ISO6429's direction specification takes the following form:
2792         o CSI ']'      -- end of the current direction
2793         o CSI '0' ']'  -- end of the current direction
2794         o CSI '1' ']'  -- start of left-to-right text
2795         o CSI '2' ']'  -- start of right-to-left text
2796    The control character CSI (0x9B: control sequence introducer) is
2797    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2798
2799    Character composition specification takes the following form:
2800         o ESC '0' -- start relative composition
2801         o ESC '1' -- end composition
2802         o ESC '2' -- start rule-base composition (*)
2803         o ESC '3' -- start relative composition with alternate chars  (**)
2804         o ESC '4' -- start rule-base composition with alternate chars  (**)
2805   Since these are not standard escape sequences of any ISO standard,
2806   the use of them with these meanings is restricted to Emacs only.
2807
2808   (*) This form is used only in Emacs 20.7 and older versions,
2809   but newer versions can safely decode it.
2810   (**) This form is used only in Emacs 21.1 and newer versions,
2811   and older versions can't decode it.
2812
2813   Here's a list of example usages of these composition escape
2814   sequences (categorized by `enum composition_method').
2815
2816   COMPOSITION_RELATIVE:
2817         ESC 0 CHAR [ CHAR ] ESC 1
2818   COMPOSITION_WITH_RULE:
2819         ESC 2 CHAR [ RULE CHAR ] ESC 1
2820   COMPOSITION_WITH_ALTCHARS:
2821         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2822   COMPOSITION_WITH_RULE_ALTCHARS:
2823         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2824
2825 static enum iso_code_class_type iso_code_class[256];
2826
2827 #define SAFE_CHARSET_P(coding, id)      \
2828   ((id) <= (coding)->max_charset_id     \
2829    && (coding)->safe_charsets[id] != 255)
2830
2831 static void
2832 setup_iso_safe_charsets (Lisp_Object attrs)
2833 {
2834   Lisp_Object charset_list, safe_charsets;
2835   Lisp_Object request;
2836   Lisp_Object reg_usage;
2837   Lisp_Object tail;
2838   EMACS_INT reg94, reg96;
2839   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2840   int max_charset_id;
2841
2842   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2843   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2844       && ! EQ (charset_list, Viso_2022_charset_list))
2845     {
2846       charset_list = Viso_2022_charset_list;
2847       ASET (attrs, coding_attr_charset_list, charset_list);
2848       ASET (attrs, coding_attr_safe_charsets, Qnil);
2849     }
2850
2851   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2852     return;
2853
2854   max_charset_id = 0;
2855   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2856     {
2857       int id = XINT (XCAR (tail));
2858       if (max_charset_id < id)
2859         max_charset_id = id;
2860     }
2861
2862   safe_charsets = make_uninit_string (max_charset_id + 1);
2863   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2864   request = AREF (attrs, coding_attr_iso_request);
2865   reg_usage = AREF (attrs, coding_attr_iso_usage);
2866   reg94 = XINT (XCAR (reg_usage));
2867   reg96 = XINT (XCDR (reg_usage));
2868
2869   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2870     {
2871       Lisp_Object id;
2872       Lisp_Object reg;
2873       struct charset *charset;
2874
2875       id = XCAR (tail);
2876       charset = CHARSET_FROM_ID (XINT (id));
2877       reg = Fcdr (Fassq (id, request));
2878       if (! NILP (reg))
2879         SSET (safe_charsets, XINT (id), XINT (reg));
2880       else if (charset->iso_chars_96)
2881         {
2882           if (reg96 < 4)
2883             SSET (safe_charsets, XINT (id), reg96);
2884         }
2885       else
2886         {
2887           if (reg94 < 4)
2888             SSET (safe_charsets, XINT (id), reg94);
2889         }
2890     }
2891   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2892 }
2893
2894
2895 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2896    Return true if a text is encoded in one of ISO-2022 based coding
2897    systems.  */
2898
2899 static bool
2900 detect_coding_iso_2022 (struct coding_system *coding,
2901                         struct coding_detection_info *detect_info)
2902 {
2903   const unsigned char *src = coding->source, *src_base = src;
2904   const unsigned char *src_end = coding->source + coding->src_bytes;
2905   bool multibytep = coding->src_multibyte;
2906   bool single_shifting = 0;
2907   int id;
2908   int c, c1;
2909   ptrdiff_t consumed_chars = 0;
2910   int i;
2911   int rejected = 0;
2912   int found = 0;
2913   int composition_count = -1;
2914
2915   detect_info->checked |= CATEGORY_MASK_ISO;
2916
2917   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2918     {
2919       struct coding_system *this = &(coding_categories[i]);
2920       Lisp_Object attrs, val;
2921
2922       if (this->id < 0)
2923         continue;
2924       attrs = CODING_ID_ATTRS (this->id);
2925       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2926           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2927         setup_iso_safe_charsets (attrs);
2928       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2929       this->max_charset_id = SCHARS (val) - 1;
2930       this->safe_charsets = SDATA (val);
2931     }
2932
2933   /* A coding system of this category is always ASCII compatible.  */
2934   src += coding->head_ascii;
2935
2936   while (rejected != CATEGORY_MASK_ISO)
2937     {
2938       src_base = src;
2939       ONE_MORE_BYTE (c);
2940       switch (c)
2941         {
2942         case ISO_CODE_ESC:
2943           if (inhibit_iso_escape_detection)
2944             break;
2945           single_shifting = 0;
2946           ONE_MORE_BYTE (c);
2947           if (c == 'N' || c == 'O')
2948             {
2949               /* ESC <Fe> for SS2 or SS3.  */
2950               single_shifting = 1;
2951               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2952             }
2953           else if (c == '1')
2954             {
2955               /* End of composition.  */
2956               if (composition_count < 0
2957                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2958                 /* Invalid */
2959                 break;
2960               composition_count = -1;
2961               found |= CATEGORY_MASK_ISO;
2962             }
2963           else if (c >= '0' && c <= '4')
2964             {
2965               /* ESC <Fp> for start/end composition.  */
2966               composition_count = 0;
2967             }
2968           else
2969             {
2970               if (c >= '(' && c <= '/')
2971                 {
2972                   /* Designation sequence for a charset of dimension 1.  */
2973                   ONE_MORE_BYTE (c1);
2974                   if (c1 < ' ' || c1 >= 0x80
2975                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2976                     /* Invalid designation sequence.  Just ignore.  */
2977                     break;
2978                 }
2979               else if (c == '$')
2980                 {
2981                   /* Designation sequence for a charset of dimension 2.  */
2982                   ONE_MORE_BYTE (c);
2983                   if (c >= '@' && c <= 'B')
2984                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2985                     id = iso_charset_table[1][0][c];
2986                   else if (c >= '(' && c <= '/')
2987                     {
2988                       ONE_MORE_BYTE (c1);
2989                       if (c1 < ' ' || c1 >= 0x80
2990                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2991                         /* Invalid designation sequence.  Just ignore.  */
2992                         break;
2993                     }
2994                   else
2995                     /* Invalid designation sequence.  Just ignore it.  */
2996                     break;
2997                 }
2998               else
2999                 {
3000                   /* Invalid escape sequence.  Just ignore it.  */
3001                   break;
3002                 }
3003
3004               /* We found a valid designation sequence for CHARSET.  */
3005               rejected |= CATEGORY_MASK_ISO_8BIT;
3006               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3007                                   id))
3008                 found |= CATEGORY_MASK_ISO_7;
3009               else
3010                 rejected |= CATEGORY_MASK_ISO_7;
3011               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3012                                   id))
3013                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3014               else
3015                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3016               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3017                                   id))
3018                 found |= CATEGORY_MASK_ISO_7_ELSE;
3019               else
3020                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3021               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3022                                   id))
3023                 found |= CATEGORY_MASK_ISO_8_ELSE;
3024               else
3025                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3026             }
3027           break;
3028
3029         case ISO_CODE_SO:
3030         case ISO_CODE_SI:
3031           /* Locking shift out/in.  */
3032           if (inhibit_iso_escape_detection)
3033             break;
3034           single_shifting = 0;
3035           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3036           break;
3037
3038         case ISO_CODE_CSI:
3039           /* Control sequence introducer.  */
3040           single_shifting = 0;
3041           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3042           found |= CATEGORY_MASK_ISO_8_ELSE;
3043           goto check_extra_latin;
3044
3045         case ISO_CODE_SS2:
3046         case ISO_CODE_SS3:
3047           /* Single shift.   */
3048           if (inhibit_iso_escape_detection)
3049             break;
3050           single_shifting = 0;
3051           rejected |= CATEGORY_MASK_ISO_7BIT;
3052           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3053               & CODING_ISO_FLAG_SINGLE_SHIFT)
3054             {
3055               found |= CATEGORY_MASK_ISO_8_1;
3056               single_shifting = 1;
3057             }
3058           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3059               & CODING_ISO_FLAG_SINGLE_SHIFT)
3060             {
3061               found |= CATEGORY_MASK_ISO_8_2;
3062               single_shifting = 1;
3063             }
3064           if (single_shifting)
3065             break;
3066           goto check_extra_latin;
3067
3068         default:
3069           if (c < 0)
3070             continue;
3071           if (c < 0x80)
3072             {
3073               if (composition_count >= 0)
3074                 composition_count++;
3075               single_shifting = 0;
3076               break;
3077             }
3078           if (c >= 0xA0)
3079             {
3080               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3081               found |= CATEGORY_MASK_ISO_8_1;
3082               /* Check the length of succeeding codes of the range
3083                  0xA0..0FF.  If the byte length is even, we include
3084                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3085                  only when we are not single shifting.  */
3086               if (! single_shifting
3087                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3088                 {
3089                   int len = 1;
3090                   while (src < src_end)
3091                     {
3092                       src_base = src;
3093                       ONE_MORE_BYTE (c);
3094                       if (c < 0xA0)
3095                         {
3096                           src = src_base;
3097                           break;
3098                         }
3099                       len++;
3100                     }
3101
3102                   if (len & 1 && src < src_end)
3103                     {
3104                       rejected |= CATEGORY_MASK_ISO_8_2;
3105                       if (composition_count >= 0)
3106                         composition_count += len;
3107                     }
3108                   else
3109                     {
3110                       found |= CATEGORY_MASK_ISO_8_2;
3111                       if (composition_count >= 0)
3112                         composition_count += len / 2;
3113                     }
3114                 }
3115               break;
3116             }
3117         check_extra_latin:
3118           if (! VECTORP (Vlatin_extra_code_table)
3119               || NILP (AREF (Vlatin_extra_code_table, c)))
3120             {
3121               rejected = CATEGORY_MASK_ISO;
3122               break;
3123             }
3124           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3125               & CODING_ISO_FLAG_LATIN_EXTRA)
3126             found |= CATEGORY_MASK_ISO_8_1;
3127           else
3128             rejected |= CATEGORY_MASK_ISO_8_1;
3129           rejected |= CATEGORY_MASK_ISO_8_2;
3130           break;
3131         }
3132     }
3133   detect_info->rejected |= CATEGORY_MASK_ISO;
3134   return 0;
3135
3136  no_more_source:
3137   detect_info->rejected |= rejected;
3138   detect_info->found |= (found & ~rejected);
3139   return 1;
3140 }
3141
3142
3143 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3144    escape sequence should be kept.  */
3145 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3146   do {                                                                  \
3147     int id, prev;                                                       \
3148                                                                         \
3149     if (final < '0' || final >= 128                                     \
3150         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3151         || !SAFE_CHARSET_P (coding, id))                                \
3152       {                                                                 \
3153         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3154         chars_96 = -1;                                                  \
3155         break;                                                          \
3156       }                                                                 \
3157     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3158     if (id == charset_jisx0201_roman)                                   \
3159       {                                                                 \
3160         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3161           id = charset_ascii;                                           \
3162       }                                                                 \
3163     else if (id == charset_jisx0208_1978)                               \
3164       {                                                                 \
3165         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3166           id = charset_jisx0208;                                        \
3167       }                                                                 \
3168     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3169     /* If there was an invalid designation to REG previously, and this  \
3170        designation is ASCII to REG, we should keep this designation     \
3171        sequence.  */                                                    \
3172     if (prev == -2 && id == charset_ascii)                              \
3173       chars_96 = -1;                                                    \
3174   } while (0)
3175
3176
3177 /* Handle these composition sequence (ALT: alternate char):
3178
3179    (1) relative composition: ESC 0 CHAR ... ESC 1
3180    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3181    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3182    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3183
3184    When the start sequence (ESC 0/2/3/4) is found, this annotation
3185    header is produced.
3186
3187         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3188
3189    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3190    produced until the end sequence (ESC 1) is found:
3191
3192    (1) CHAR ... CHAR
3193    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3194    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3195    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3196
3197    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3198    annotation header is updated as below:
3199
3200    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3201    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3202    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3203    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3204
3205    If an error is found while composing, the annotation header is
3206    changed to:
3207
3208         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3209
3210    and the sequence [ -2 DECODED-RULE ] is changed to the original
3211    byte sequence as below:
3212         o the original byte sequence is B: [ B -1 ]
3213         o the original byte sequence is B1 B2: [ B1 B2 ]
3214    and the sequence [ -1 -1 ] is changed to the original byte
3215    sequence:
3216         [ ESC '0' ]
3217 */
3218
3219 /* Decode a composition rule C1 and maybe one more byte from the
3220    source, and set RULE to the encoded composition rule.  If the rule
3221    is invalid, goto invalid_code.  */
3222
3223 #define DECODE_COMPOSITION_RULE(rule)                                   \
3224   do {                                                                  \
3225     rule = c1 - 32;                                                     \
3226     if (rule < 0)                                                       \
3227       goto invalid_code;                                                \
3228     if (rule < 81)              /* old format (before ver.21) */        \
3229       {                                                                 \
3230         int gref = (rule) / 9;                                          \
3231         int nref = (rule) % 9;                                          \
3232         if (gref == 4) gref = 10;                                       \
3233         if (nref == 4) nref = 10;                                       \
3234         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3235       }                                                                 \
3236     else                        /* new format (after ver.21) */         \
3237       {                                                                 \
3238         int b;                                                          \
3239                                                                         \
3240         ONE_MORE_BYTE (b);                                              \
3241         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3242           goto invalid_code;                                            \
3243         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3244         rule += 0x100;   /* Distinguish it from the old format.  */     \
3245       }                                                                 \
3246   } while (0)
3247
3248 #define ENCODE_COMPOSITION_RULE(rule)                           \
3249   do {                                                          \
3250     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3251                                                                 \
3252     if (rule < 0x100)           /* old format */                \
3253       {                                                         \
3254         if (gref == 10) gref = 4;                               \
3255         if (nref == 10) nref = 4;                               \
3256         charbuf[idx] = 32 + gref * 9 + nref;                    \
3257         charbuf[idx + 1] = -1;                                  \
3258         new_chars++;                                            \
3259       }                                                         \
3260     else                                /* new format */        \
3261       {                                                         \
3262         charbuf[idx] = 32 + 81 + gref;                          \
3263         charbuf[idx + 1] = 32 + nref;                           \
3264         new_chars += 2;                                         \
3265       }                                                         \
3266   } while (0)
3267
3268 /* Finish the current composition as invalid.  */
3269
3270 static int
3271 finish_composition (int *charbuf, struct composition_status *cmp_status)
3272 {
3273   int idx = - cmp_status->length;
3274   int new_chars;
3275
3276   /* Recover the original ESC sequence */
3277   charbuf[idx++] = ISO_CODE_ESC;
3278   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3279                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3280                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3281                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3282                     : '4');
3283   charbuf[idx++] = -2;
3284   charbuf[idx++] = 0;
3285   charbuf[idx++] = -1;
3286   new_chars = cmp_status->nchars;
3287   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3288     for (; idx < 0; idx++)
3289       {
3290         int elt = charbuf[idx];
3291
3292         if (elt == -2)
3293           {
3294             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3295             idx++;
3296           }
3297         else if (elt == -1)
3298           {
3299             charbuf[idx++] = ISO_CODE_ESC;
3300             charbuf[idx] = '0';
3301             new_chars += 2;
3302           }
3303       }
3304   cmp_status->state = COMPOSING_NO;
3305   return new_chars;
3306 }
3307
3308 /* If characters are under composition, finish the composition.  */
3309 #define MAYBE_FINISH_COMPOSITION()                              \
3310   do {                                                          \
3311     if (cmp_status->state != COMPOSING_NO)                      \
3312       char_offset += finish_composition (charbuf, cmp_status);  \
3313   } while (0)
3314
3315 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3316
3317    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3318    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3319    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3320    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3321
3322    Produce this annotation sequence now:
3323
3324    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3325 */
3326
3327 #define DECODE_COMPOSITION_START(c1)                                       \
3328   do {                                                                     \
3329     if (c1 == '0'                                                          \
3330         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3331              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3332             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3333                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3334       {                                                                    \
3335         *charbuf++ = -1;                                                   \
3336         *charbuf++= -1;                                                    \
3337         cmp_status->state = COMPOSING_CHAR;                                \
3338         cmp_status->length += 2;                                           \
3339       }                                                                    \
3340     else                                                                   \
3341       {                                                                    \
3342         MAYBE_FINISH_COMPOSITION ();                                       \
3343         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3344                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3345                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3346                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3347         cmp_status->state                                                  \
3348           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3349         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3350         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3351         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3352         coding->annotated = 1;                                             \
3353       }                                                                    \
3354   } while (0)
3355
3356
3357 /* Handle composition end sequence ESC 1.  */
3358
3359 #define DECODE_COMPOSITION_END()                                        \
3360   do {                                                                  \
3361     if (cmp_status->nchars == 0                                         \
3362         || ((cmp_status->state == COMPOSING_CHAR)                       \
3363             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3364       {                                                                 \
3365         MAYBE_FINISH_COMPOSITION ();                                    \
3366         goto invalid_code;                                              \
3367       }                                                                 \
3368     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3369       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3370     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3371       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3372     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3373     char_offset += cmp_status->nchars;                                  \
3374     cmp_status->state = COMPOSING_NO;                                   \
3375   } while (0)
3376
3377 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3378
3379 #define STORE_COMPOSITION_RULE(rule)    \
3380   do {                                  \
3381     *charbuf++ = -2;                    \
3382     *charbuf++ = rule;                  \
3383     cmp_status->length += 2;            \
3384     cmp_status->state--;                \
3385   } while (0)
3386
3387 /* Store a composed char or a component char C in charbuf, and update
3388    cmp_status.  */
3389
3390 #define STORE_COMPOSITION_CHAR(c)                                       \
3391   do {                                                                  \
3392     *charbuf++ = (c);                                                   \
3393     cmp_status->length++;                                               \
3394     if (cmp_status->state == COMPOSING_CHAR)                            \
3395       cmp_status->nchars++;                                             \
3396     else                                                                \
3397       cmp_status->ncomps++;                                             \
3398     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3399         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3400             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3401       cmp_status->state++;                                              \
3402   } while (0)
3403
3404
3405 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3406
3407 static void
3408 decode_coding_iso_2022 (struct coding_system *coding)
3409 {
3410   const unsigned char *src = coding->source + coding->consumed;
3411   const unsigned char *src_end = coding->source + coding->src_bytes;
3412   const unsigned char *src_base;
3413   int *charbuf = coding->charbuf + coding->charbuf_used;
3414   /* We may produce two annotations (charset and composition) in one
3415      loop and one more charset annotation at the end.  */
3416   int *charbuf_end
3417     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3418   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3419   bool multibytep = coding->src_multibyte;
3420   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3421   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3422   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3423   int charset_id_2, charset_id_3;
3424   struct charset *charset;
3425   int c;
3426   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3427   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3428   ptrdiff_t char_offset = coding->produced_char;
3429   ptrdiff_t last_offset = char_offset;
3430   int last_id = charset_ascii;
3431   bool eol_dos
3432     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3433   int byte_after_cr = -1;
3434   int i;
3435
3436   setup_iso_safe_charsets (attrs);
3437   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3438
3439   if (cmp_status->state != COMPOSING_NO)
3440     {
3441       if (charbuf_end - charbuf < cmp_status->length)
3442         emacs_abort ();
3443       for (i = 0; i < cmp_status->length; i++)
3444         *charbuf++ = cmp_status->carryover[i];
3445       coding->annotated = 1;
3446     }
3447
3448   while (1)
3449     {
3450       int c1, c2, c3;
3451
3452       src_base = src;
3453       consumed_chars_base = consumed_chars;
3454
3455       if (charbuf >= charbuf_end)
3456         {
3457           if (byte_after_cr >= 0)
3458             src_base--;
3459           break;
3460         }
3461
3462       if (byte_after_cr >= 0)
3463         c1 = byte_after_cr, byte_after_cr = -1;
3464       else
3465         ONE_MORE_BYTE (c1);
3466       if (c1 < 0)
3467         goto invalid_code;
3468
3469       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3470         {
3471           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3472           char_offset++;
3473           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3474           continue;
3475         }
3476
3477       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3478         {
3479           if (c1 == ISO_CODE_ESC)
3480             {
3481               if (src + 1 >= src_end)
3482                 goto no_more_source;
3483               *charbuf++ = ISO_CODE_ESC;
3484               char_offset++;
3485               if (src[0] == '%' && src[1] == '@')
3486                 {
3487                   src += 2;
3488                   consumed_chars += 2;
3489                   char_offset += 2;
3490                   /* We are sure charbuf can contain two more chars. */
3491                   *charbuf++ = '%';
3492                   *charbuf++ = '@';
3493                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3494                 }
3495             }
3496           else
3497             {
3498               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3499               char_offset++;
3500             }
3501           continue;
3502         }
3503
3504       if ((cmp_status->state == COMPOSING_RULE
3505            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3506           && c1 != ISO_CODE_ESC)
3507         {
3508           int rule;
3509
3510           DECODE_COMPOSITION_RULE (rule);
3511           STORE_COMPOSITION_RULE (rule);
3512           continue;
3513         }
3514
3515       /* We produce at most one character.  */
3516       switch (iso_code_class [c1])
3517         {
3518         case ISO_0x20_or_0x7F:
3519           if (charset_id_0 < 0
3520               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3521             /* This is SPACE or DEL.  */
3522             charset = CHARSET_FROM_ID (charset_ascii);
3523           else
3524             charset = CHARSET_FROM_ID (charset_id_0);
3525           break;
3526
3527         case ISO_graphic_plane_0:
3528           if (charset_id_0 < 0)
3529             charset = CHARSET_FROM_ID (charset_ascii);
3530           else
3531             charset = CHARSET_FROM_ID (charset_id_0);
3532           break;
3533
3534         case ISO_0xA0_or_0xFF:
3535           if (charset_id_1 < 0
3536               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3537               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3538             goto invalid_code;
3539           /* This is a graphic character, we fall down ... */
3540
3541         case ISO_graphic_plane_1:
3542           if (charset_id_1 < 0)
3543             goto invalid_code;
3544           charset = CHARSET_FROM_ID (charset_id_1);
3545           break;
3546
3547         case ISO_control_0:
3548           if (eol_dos && c1 == '\r')
3549             ONE_MORE_BYTE (byte_after_cr);
3550           MAYBE_FINISH_COMPOSITION ();
3551           charset = CHARSET_FROM_ID (charset_ascii);
3552           break;
3553
3554         case ISO_control_1:
3555           goto invalid_code;
3556
3557         case ISO_shift_out:
3558           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3559               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3560             goto invalid_code;
3561           CODING_ISO_INVOCATION (coding, 0) = 1;
3562           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3563           continue;
3564
3565         case ISO_shift_in:
3566           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3567             goto invalid_code;
3568           CODING_ISO_INVOCATION (coding, 0) = 0;
3569           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3570           continue;
3571
3572         case ISO_single_shift_2_7:
3573           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3574             goto invalid_code;
3575         case ISO_single_shift_2:
3576           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3577             goto invalid_code;
3578           /* SS2 is handled as an escape sequence of ESC 'N' */
3579           c1 = 'N';
3580           goto label_escape_sequence;
3581
3582         case ISO_single_shift_3:
3583           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3584             goto invalid_code;
3585           /* SS2 is handled as an escape sequence of ESC 'O' */
3586           c1 = 'O';
3587           goto label_escape_sequence;
3588
3589         case ISO_control_sequence_introducer:
3590           /* CSI is handled as an escape sequence of ESC '[' ...  */
3591           c1 = '[';
3592           goto label_escape_sequence;
3593
3594         case ISO_escape:
3595           ONE_MORE_BYTE (c1);
3596         label_escape_sequence:
3597           /* Escape sequences handled here are invocation,
3598              designation, direction specification, and character
3599              composition specification.  */
3600           switch (c1)
3601             {
3602             case '&':           /* revision of following character set */
3603               ONE_MORE_BYTE (c1);
3604               if (!(c1 >= '@' && c1 <= '~'))
3605                 goto invalid_code;
3606               ONE_MORE_BYTE (c1);
3607               if (c1 != ISO_CODE_ESC)
3608                 goto invalid_code;
3609               ONE_MORE_BYTE (c1);
3610               goto label_escape_sequence;
3611
3612             case '$':           /* designation of 2-byte character set */
3613               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3614                 goto invalid_code;
3615               {
3616                 int reg, chars96;
3617
3618                 ONE_MORE_BYTE (c1);
3619                 if (c1 >= '@' && c1 <= 'B')
3620                   {     /* designation of JISX0208.1978, GB2312.1980,
3621                            or JISX0208.1980 */
3622                     reg = 0, chars96 = 0;
3623                   }
3624                 else if (c1 >= 0x28 && c1 <= 0x2B)
3625                   { /* designation of DIMENSION2_CHARS94 character set */
3626                     reg = c1 - 0x28, chars96 = 0;
3627                     ONE_MORE_BYTE (c1);
3628                   }
3629                 else if (c1 >= 0x2C && c1 <= 0x2F)
3630                   { /* designation of DIMENSION2_CHARS96 character set */
3631                     reg = c1 - 0x2C, chars96 = 1;
3632                     ONE_MORE_BYTE (c1);
3633                   }
3634                 else
3635                   goto invalid_code;
3636                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3637                 /* We must update these variables now.  */
3638                 if (reg == 0)
3639                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3640                 else if (reg == 1)
3641                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3642                 if (chars96 < 0)
3643                   goto invalid_code;
3644               }
3645               continue;
3646
3647             case 'n':           /* invocation of locking-shift-2 */
3648               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3649                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3650                 goto invalid_code;
3651               CODING_ISO_INVOCATION (coding, 0) = 2;
3652               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3653               continue;
3654
3655             case 'o':           /* invocation of locking-shift-3 */
3656               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3657                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3658                 goto invalid_code;
3659               CODING_ISO_INVOCATION (coding, 0) = 3;
3660               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3661               continue;
3662
3663             case 'N':           /* invocation of single-shift-2 */
3664               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3665                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3666                 goto invalid_code;
3667               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3668               if (charset_id_2 < 0)
3669                 charset = CHARSET_FROM_ID (charset_ascii);
3670               else
3671                 charset = CHARSET_FROM_ID (charset_id_2);
3672               ONE_MORE_BYTE (c1);
3673               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3674                 goto invalid_code;
3675               break;
3676
3677             case 'O':           /* invocation of single-shift-3 */
3678               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3679                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3680                 goto invalid_code;
3681               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3682               if (charset_id_3 < 0)
3683                 charset = CHARSET_FROM_ID (charset_ascii);
3684               else
3685                 charset = CHARSET_FROM_ID (charset_id_3);
3686               ONE_MORE_BYTE (c1);
3687               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3688                 goto invalid_code;
3689               break;
3690
3691             case '0': case '2': case '3': case '4': /* start composition */
3692               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3693                 goto invalid_code;
3694               if (last_id != charset_ascii)
3695                 {
3696                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3697                   last_id = charset_ascii;
3698                   last_offset = char_offset;
3699                 }
3700               DECODE_COMPOSITION_START (c1);
3701               continue;
3702
3703             case '1':           /* end composition */
3704               if (cmp_status->state == COMPOSING_NO)
3705                 goto invalid_code;
3706               DECODE_COMPOSITION_END ();
3707               continue;
3708
3709             case '[':           /* specification of direction */
3710               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3711                 goto invalid_code;
3712               /* For the moment, nested direction is not supported.
3713                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3714                  left-to-right, and nonzero means right-to-left.  */
3715               ONE_MORE_BYTE (c1);
3716               switch (c1)
3717                 {
3718                 case ']':       /* end of the current direction */
3719                   coding->mode &= ~CODING_MODE_DIRECTION;
3720
3721                 case '0':       /* end of the current direction */
3722                 case '1':       /* start of left-to-right direction */
3723                   ONE_MORE_BYTE (c1);
3724                   if (c1 == ']')
3725                     coding->mode &= ~CODING_MODE_DIRECTION;
3726                   else
3727                     goto invalid_code;
3728                   break;
3729
3730                 case '2':       /* start of right-to-left direction */
3731                   ONE_MORE_BYTE (c1);
3732                   if (c1 == ']')
3733                     coding->mode |= CODING_MODE_DIRECTION;
3734                   else
3735                     goto invalid_code;
3736                   break;
3737
3738                 default:
3739                   goto invalid_code;
3740                 }
3741               continue;
3742
3743             case '%':
3744               ONE_MORE_BYTE (c1);
3745               if (c1 == '/')
3746                 {
3747                   /* CTEXT extended segment:
3748                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3749                      We keep these bytes as is for the moment.
3750                      They may be decoded by post-read-conversion.  */
3751                   int dim, M, L;
3752                   int size;
3753
3754                   ONE_MORE_BYTE (dim);
3755                   if (dim < '0' || dim > '4')
3756                     goto invalid_code;
3757                   ONE_MORE_BYTE (M);
3758                   if (M < 128)
3759                     goto invalid_code;
3760                   ONE_MORE_BYTE (L);
3761                   if (L < 128)
3762                     goto invalid_code;
3763                   size = ((M - 128) * 128) + (L - 128);
3764                   if (charbuf + 6 > charbuf_end)
3765                     goto break_loop;
3766                   *charbuf++ = ISO_CODE_ESC;
3767                   *charbuf++ = '%';
3768                   *charbuf++ = '/';
3769                   *charbuf++ = dim;
3770                   *charbuf++ = BYTE8_TO_CHAR (M);
3771                   *charbuf++ = BYTE8_TO_CHAR (L);
3772                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3773                 }
3774               else if (c1 == 'G')
3775                 {
3776                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3777                      ESC % G --UTF-8-BYTES-- ESC % @
3778                      We keep these bytes as is for the moment.
3779                      They may be decoded by post-read-conversion.  */
3780                   if (charbuf + 3 > charbuf_end)
3781                     goto break_loop;
3782                   *charbuf++ = ISO_CODE_ESC;
3783                   *charbuf++ = '%';
3784                   *charbuf++ = 'G';
3785                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3786                 }
3787               else
3788                 goto invalid_code;
3789               continue;
3790               break;
3791
3792             default:
3793               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3794                 goto invalid_code;
3795               {
3796                 int reg, chars96;
3797
3798                 if (c1 >= 0x28 && c1 <= 0x2B)
3799                   { /* designation of DIMENSION1_CHARS94 character set */
3800                     reg = c1 - 0x28, chars96 = 0;
3801                     ONE_MORE_BYTE (c1);
3802                   }
3803                 else if (c1 >= 0x2C && c1 <= 0x2F)
3804                   { /* designation of DIMENSION1_CHARS96 character set */
3805                     reg = c1 - 0x2C, chars96 = 1;
3806                     ONE_MORE_BYTE (c1);
3807                   }
3808                 else
3809                   goto invalid_code;
3810                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3811                 /* We must update these variables now.  */
3812                 if (reg == 0)
3813                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3814                 else if (reg == 1)
3815                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3816                 if (chars96 < 0)
3817                   goto invalid_code;
3818               }
3819               continue;
3820             }
3821           break;
3822
3823         default:
3824           emacs_abort ();
3825         }
3826
3827       if (cmp_status->state == COMPOSING_NO
3828           && charset->id != charset_ascii
3829           && last_id != charset->id)
3830         {
3831           if (last_id != charset_ascii)
3832             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3833           last_id = charset->id;
3834           last_offset = char_offset;
3835         }
3836
3837       /* Now we know CHARSET and 1st position code C1 of a character.
3838          Produce a decoded character while getting 2nd and 3rd
3839          position codes C2, C3 if necessary.  */
3840       if (CHARSET_DIMENSION (charset) > 1)
3841         {
3842           ONE_MORE_BYTE (c2);
3843           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3844               || ((c1 & 0x80) != (c2 & 0x80)))
3845             /* C2 is not in a valid range.  */
3846             goto invalid_code;
3847           if (CHARSET_DIMENSION (charset) == 2)
3848             c1 = (c1 << 8) | c2;
3849           else
3850             {
3851               ONE_MORE_BYTE (c3);
3852               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3853                   || ((c1 & 0x80) != (c3 & 0x80)))
3854                 /* C3 is not in a valid range.  */
3855                 goto invalid_code;
3856               c1 = (c1 << 16) | (c2 << 8) | c2;
3857             }
3858         }
3859       c1 &= 0x7F7F7F;
3860       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3861       if (c < 0)
3862         {
3863           MAYBE_FINISH_COMPOSITION ();
3864           for (; src_base < src; src_base++, char_offset++)
3865             {
3866               if (ASCII_BYTE_P (*src_base))
3867                 *charbuf++ = *src_base;
3868               else
3869                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3870             }
3871         }
3872       else if (cmp_status->state == COMPOSING_NO)
3873         {
3874           *charbuf++ = c;
3875           char_offset++;
3876         }
3877       else if ((cmp_status->state == COMPOSING_CHAR
3878                 ? cmp_status->nchars
3879                 : cmp_status->ncomps)
3880                >= MAX_COMPOSITION_COMPONENTS)
3881         {
3882           /* Too long composition.  */
3883           MAYBE_FINISH_COMPOSITION ();
3884           *charbuf++ = c;
3885           char_offset++;
3886         }
3887       else
3888         STORE_COMPOSITION_CHAR (c);
3889       continue;
3890
3891     invalid_code:
3892       MAYBE_FINISH_COMPOSITION ();
3893       src = src_base;
3894       consumed_chars = consumed_chars_base;
3895       ONE_MORE_BYTE (c);
3896       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3897       char_offset++;
3898       coding->errors++;
3899       continue;
3900
3901     break_loop:
3902       break;
3903     }
3904
3905  no_more_source:
3906   if (cmp_status->state != COMPOSING_NO)
3907     {
3908       if (coding->mode & CODING_MODE_LAST_BLOCK)
3909         MAYBE_FINISH_COMPOSITION ();
3910       else
3911         {
3912           charbuf -= cmp_status->length;
3913           for (i = 0; i < cmp_status->length; i++)
3914             cmp_status->carryover[i] = charbuf[i];
3915         }
3916     }
3917   else if (last_id != charset_ascii)
3918     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3919   coding->consumed_char += consumed_chars_base;
3920   coding->consumed = src_base - coding->source;
3921   coding->charbuf_used = charbuf - coding->charbuf;
3922 }
3923
3924
3925 /* ISO2022 encoding stuff.  */
3926
3927 /*
3928    It is not enough to say just "ISO2022" on encoding, we have to
3929    specify more details.  In Emacs, each coding system of ISO2022
3930    variant has the following specifications:
3931         1. Initial designation to G0 thru G3.
3932         2. Allows short-form designation?
3933         3. ASCII should be designated to G0 before control characters?
3934         4. ASCII should be designated to G0 at end of line?
3935         5. 7-bit environment or 8-bit environment?
3936         6. Use locking-shift?
3937         7. Use Single-shift?
3938    And the following two are only for Japanese:
3939         8. Use ASCII in place of JIS0201-1976-Roman?
3940         9. Use JISX0208-1983 in place of JISX0208-1978?
3941    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3942    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3943    details.
3944 */
3945
3946 /* Produce codes (escape sequence) for designating CHARSET to graphic
3947    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3948    '@', 'A', or 'B' and the coding system CODING allows, produce
3949    designation sequence of short-form.  */
3950
3951 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3952   do {                                                                  \
3953     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3954     const char *intermediate_char_94 = "()*+";                          \
3955     const char *intermediate_char_96 = ",-./";                          \
3956     int revision = -1;                                                  \
3957                                                                         \
3958     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3959       revision = CHARSET_ISO_REVISION (charset);                        \
3960                                                                         \
3961     if (revision >= 0)                                                  \
3962       {                                                                 \
3963         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3964         EMIT_ONE_BYTE ('@' + revision);                                 \
3965       }                                                                 \
3966     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3967     if (CHARSET_DIMENSION (charset) == 1)                               \
3968       {                                                                 \
3969         int b;                                                          \
3970         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3971           b = intermediate_char_94[reg];                                \
3972         else                                                            \
3973           b = intermediate_char_96[reg];                                \
3974         EMIT_ONE_ASCII_BYTE (b);                                        \
3975       }                                                                 \
3976     else                                                                \
3977       {                                                                 \
3978         EMIT_ONE_ASCII_BYTE ('$');                                      \
3979         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3980           {                                                             \
3981             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3982                 || reg != 0                                             \
3983                 || final_char < '@' || final_char > 'B')                \
3984               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3985           }                                                             \
3986         else                                                            \
3987           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3988       }                                                                 \
3989     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3990                                                                         \
3991     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3992   } while (0)
3993
3994
3995 /* The following two macros produce codes (control character or escape
3996    sequence) for ISO2022 single-shift functions (single-shift-2 and
3997    single-shift-3).  */
3998
3999 #define ENCODE_SINGLE_SHIFT_2                                           \
4000   do {                                                                  \
4001     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4002       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4003     else                                                                \
4004       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4005     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4006   } while (0)
4007
4008
4009 #define ENCODE_SINGLE_SHIFT_3                                           \
4010   do {                                                                  \
4011     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4012       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4013     else                                                                \
4014       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4015     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4016   } while (0)
4017
4018
4019 /* The following four macros produce codes (control character or
4020    escape sequence) for ISO2022 locking-shift functions (shift-in,
4021    shift-out, locking-shift-2, and locking-shift-3).  */
4022
4023 #define ENCODE_SHIFT_IN                                 \
4024   do {                                                  \
4025     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4026     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4027   } while (0)
4028
4029
4030 #define ENCODE_SHIFT_OUT                                \
4031   do {                                                  \
4032     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4033     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4034   } while (0)
4035
4036
4037 #define ENCODE_LOCKING_SHIFT_2                          \
4038   do {                                                  \
4039     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4040     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4041   } while (0)
4042
4043
4044 #define ENCODE_LOCKING_SHIFT_3                          \
4045   do {                                                  \
4046     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4047     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4048   } while (0)
4049
4050
4051 /* Produce codes for a DIMENSION1 character whose character set is
4052    CHARSET and whose position-code is C1.  Designation and invocation
4053    sequences are also produced in advance if necessary.  */
4054
4055 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4056   do {                                                                  \
4057     int id = CHARSET_ID (charset);                                      \
4058                                                                         \
4059     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4060         && id == charset_ascii)                                         \
4061       {                                                                 \
4062         id = charset_jisx0201_roman;                                    \
4063         charset = CHARSET_FROM_ID (id);                                 \
4064       }                                                                 \
4065                                                                         \
4066     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4067       {                                                                 \
4068         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4069           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4070         else                                                            \
4071           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4072         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4073         break;                                                          \
4074       }                                                                 \
4075     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4076       {                                                                 \
4077         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4078         break;                                                          \
4079       }                                                                 \
4080     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4081       {                                                                 \
4082         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4083         break;                                                          \
4084       }                                                                 \
4085     else                                                                \
4086       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4087          must invoke it, or, at first, designate it to some graphic     \
4088          register.  Then repeat the loop to actually produce the        \
4089          character.  */                                                 \
4090       dst = encode_invocation_designation (charset, coding, dst,        \
4091                                            &produced_chars);            \
4092   } while (1)
4093
4094
4095 /* Produce codes for a DIMENSION2 character whose character set is
4096    CHARSET and whose position-codes are C1 and C2.  Designation and
4097    invocation codes are also produced in advance if necessary.  */
4098
4099 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4100   do {                                                                  \
4101     int id = CHARSET_ID (charset);                                      \
4102                                                                         \
4103     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4104         && id == charset_jisx0208)                                      \
4105       {                                                                 \
4106         id = charset_jisx0208_1978;                                     \
4107         charset = CHARSET_FROM_ID (id);                                 \
4108       }                                                                 \
4109                                                                         \
4110     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4111       {                                                                 \
4112         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4113           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4114         else                                                            \
4115           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4116         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4117         break;                                                          \
4118       }                                                                 \
4119     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4120       {                                                                 \
4121         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4122         break;                                                          \
4123       }                                                                 \
4124     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4125       {                                                                 \
4126         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4127         break;                                                          \
4128       }                                                                 \
4129     else                                                                \
4130       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4131          must invoke it, or, at first, designate it to some graphic     \
4132          register.  Then repeat the loop to actually produce the        \
4133          character.  */                                                 \
4134       dst = encode_invocation_designation (charset, coding, dst,        \
4135                                            &produced_chars);            \
4136   } while (1)
4137
4138
4139 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4140   do {                                                                     \
4141     unsigned code;                                                         \
4142     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4143                                                                            \
4144     if (CHARSET_DIMENSION (charset) == 1)                                  \
4145       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4146     else                                                                   \
4147       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4148   } while (0)
4149
4150
4151 /* Produce designation and invocation codes at a place pointed by DST
4152    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4153    Return new DST.  */
4154
4155 static unsigned char *
4156 encode_invocation_designation (struct charset *charset,
4157                                struct coding_system *coding,
4158                                unsigned char *dst, ptrdiff_t *p_nchars)
4159 {
4160   bool multibytep = coding->dst_multibyte;
4161   ptrdiff_t produced_chars = *p_nchars;
4162   int reg;                      /* graphic register number */
4163   int id = CHARSET_ID (charset);
4164
4165   /* At first, check designations.  */
4166   for (reg = 0; reg < 4; reg++)
4167     if (id == CODING_ISO_DESIGNATION (coding, reg))
4168       break;
4169
4170   if (reg >= 4)
4171     {
4172       /* CHARSET is not yet designated to any graphic registers.  */
4173       /* At first check the requested designation.  */
4174       reg = CODING_ISO_REQUEST (coding, id);
4175       if (reg < 0)
4176         /* Since CHARSET requests no special designation, designate it
4177            to graphic register 0.  */
4178         reg = 0;
4179
4180       ENCODE_DESIGNATION (charset, reg, coding);
4181     }
4182
4183   if (CODING_ISO_INVOCATION (coding, 0) != reg
4184       && CODING_ISO_INVOCATION (coding, 1) != reg)
4185     {
4186       /* Since the graphic register REG is not invoked to any graphic
4187          planes, invoke it to graphic plane 0.  */
4188       switch (reg)
4189         {
4190         case 0:                 /* graphic register 0 */
4191           ENCODE_SHIFT_IN;
4192           break;
4193
4194         case 1:                 /* graphic register 1 */
4195           ENCODE_SHIFT_OUT;
4196           break;
4197
4198         case 2:                 /* graphic register 2 */
4199           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4200             ENCODE_SINGLE_SHIFT_2;
4201           else
4202             ENCODE_LOCKING_SHIFT_2;
4203           break;
4204
4205         case 3:                 /* graphic register 3 */
4206           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4207             ENCODE_SINGLE_SHIFT_3;
4208           else
4209             ENCODE_LOCKING_SHIFT_3;
4210           break;
4211         }
4212     }
4213
4214   *p_nchars = produced_chars;
4215   return dst;
4216 }
4217
4218
4219 /* Produce codes for designation and invocation to reset the graphic
4220    planes and registers to initial state.  */
4221 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4222   do {                                                                  \
4223     int reg;                                                            \
4224     struct charset *charset;                                            \
4225                                                                         \
4226     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4227       ENCODE_SHIFT_IN;                                                  \
4228     for (reg = 0; reg < 4; reg++)                                       \
4229       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4230           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4231               != CODING_ISO_INITIAL (coding, reg)))                     \
4232         {                                                               \
4233           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4234           ENCODE_DESIGNATION (charset, reg, coding);                    \
4235         }                                                               \
4236   } while (0)
4237
4238
4239 /* Produce designation sequences of charsets in the line started from
4240    CHARBUF to a place pointed by DST, and return the number of
4241    produced bytes.  DST should not directly point a buffer text area
4242    which may be relocated by char_charset call.
4243
4244    If the current block ends before any end-of-line, we may fail to
4245    find all the necessary designations.  */
4246
4247 static ptrdiff_t
4248 encode_designation_at_bol (struct coding_system *coding,
4249                            int *charbuf, int *charbuf_end,
4250                            unsigned char *dst)
4251 {
4252   unsigned char *orig = dst;
4253   struct charset *charset;
4254   /* Table of charsets to be designated to each graphic register.  */
4255   int r[4];
4256   int c, found = 0, reg;
4257   ptrdiff_t produced_chars = 0;
4258   bool multibytep = coding->dst_multibyte;
4259   Lisp_Object attrs;
4260   Lisp_Object charset_list;
4261
4262   attrs = CODING_ID_ATTRS (coding->id);
4263   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4264   if (EQ (charset_list, Qiso_2022))
4265     charset_list = Viso_2022_charset_list;
4266
4267   for (reg = 0; reg < 4; reg++)
4268     r[reg] = -1;
4269
4270   while (charbuf < charbuf_end && found < 4)
4271     {
4272       int id;
4273
4274       c = *charbuf++;
4275       if (c == '\n')
4276         break;
4277       charset = char_charset (c, charset_list, NULL);
4278       id = CHARSET_ID (charset);
4279       reg = CODING_ISO_REQUEST (coding, id);
4280       if (reg >= 0 && r[reg] < 0)
4281         {
4282           found++;
4283           r[reg] = id;
4284         }
4285     }
4286
4287   if (found)
4288     {
4289       for (reg = 0; reg < 4; reg++)
4290         if (r[reg] >= 0
4291             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4292           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4293     }
4294
4295   return dst - orig;
4296 }
4297
4298 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4299
4300 static bool
4301 encode_coding_iso_2022 (struct coding_system *coding)
4302 {
4303   bool multibytep = coding->dst_multibyte;
4304   int *charbuf = coding->charbuf;
4305   int *charbuf_end = charbuf + coding->charbuf_used;
4306   unsigned char *dst = coding->destination + coding->produced;
4307   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4308   int safe_room = 16;
4309   bool bol_designation
4310     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4311        && CODING_ISO_BOL (coding));
4312   ptrdiff_t produced_chars = 0;
4313   Lisp_Object attrs, eol_type, charset_list;
4314   bool ascii_compatible;
4315   int c;
4316   int preferred_charset_id = -1;
4317
4318   CODING_GET_INFO (coding, attrs, charset_list);
4319   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4320   if (VECTORP (eol_type))
4321     eol_type = Qunix;
4322
4323   setup_iso_safe_charsets (attrs);
4324   /* Charset list may have been changed.  */
4325   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4326   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4327
4328   ascii_compatible
4329     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4330        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4331                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4332
4333   while (charbuf < charbuf_end)
4334     {
4335       ASSURE_DESTINATION (safe_room);
4336
4337       if (bol_designation)
4338         {
4339           /* We have to produce designation sequences if any now.  */
4340           unsigned char desig_buf[16];
4341           int nbytes;
4342           ptrdiff_t offset;
4343
4344           charset_map_loaded = 0;
4345           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4346                                               desig_buf);
4347           if (charset_map_loaded
4348               && (offset = coding_change_destination (coding)))
4349             {
4350               dst += offset;
4351               dst_end += offset;
4352             }
4353           memcpy (dst, desig_buf, nbytes);
4354           dst += nbytes;
4355           /* We are sure that designation sequences are all ASCII bytes.  */
4356           produced_chars += nbytes;
4357           bol_designation = 0;
4358           ASSURE_DESTINATION (safe_room);
4359         }
4360
4361       c = *charbuf++;
4362
4363       if (c < 0)
4364         {
4365           /* Handle an annotation.  */
4366           switch (*charbuf)
4367             {
4368             case CODING_ANNOTATE_COMPOSITION_MASK:
4369               /* Not yet implemented.  */
4370               break;
4371             case CODING_ANNOTATE_CHARSET_MASK:
4372               preferred_charset_id = charbuf[2];
4373               if (preferred_charset_id >= 0
4374                   && NILP (Fmemq (make_number (preferred_charset_id),
4375                                   charset_list)))
4376                 preferred_charset_id = -1;
4377               break;
4378             default:
4379               emacs_abort ();
4380             }
4381           charbuf += -c - 1;
4382           continue;
4383         }
4384
4385       /* Now encode the character C.  */
4386       if (c < 0x20 || c == 0x7F)
4387         {
4388           if (c == '\n'
4389               || (c == '\r' && EQ (eol_type, Qmac)))
4390             {
4391               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4392                 ENCODE_RESET_PLANE_AND_REGISTER ();
4393               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4394                 {
4395                   int i;
4396
4397                   for (i = 0; i < 4; i++)
4398                     CODING_ISO_DESIGNATION (coding, i)
4399                       = CODING_ISO_INITIAL (coding, i);
4400                 }
4401               bol_designation = ((CODING_ISO_FLAGS (coding)
4402                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4403                                  != 0);
4404             }
4405           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4406             ENCODE_RESET_PLANE_AND_REGISTER ();
4407           EMIT_ONE_ASCII_BYTE (c);
4408         }
4409       else if (ASCII_CHAR_P (c))
4410         {
4411           if (ascii_compatible)
4412             EMIT_ONE_ASCII_BYTE (c);
4413           else
4414             {
4415               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4416               ENCODE_ISO_CHARACTER (charset, c);
4417             }
4418         }
4419       else if (CHAR_BYTE8_P (c))
4420         {
4421           c = CHAR_TO_BYTE8 (c);
4422           EMIT_ONE_BYTE (c);
4423         }
4424       else
4425         {
4426           struct charset *charset;
4427
4428           if (preferred_charset_id >= 0)
4429             {
4430               bool result;
4431
4432               charset = CHARSET_FROM_ID (preferred_charset_id);
4433               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4434               if (! result)
4435                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4436                                      NULL, charset);
4437             }
4438           else
4439             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4440                                  NULL, charset);
4441           if (!charset)
4442             {
4443               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4444                 {
4445                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4446                   charset = CHARSET_FROM_ID (charset_ascii);
4447                 }
4448               else
4449                 {
4450                   c = coding->default_char;
4451                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4452                                        charset_list, NULL, charset);
4453                 }
4454             }
4455           ENCODE_ISO_CHARACTER (charset, c);
4456         }
4457     }
4458
4459   if (coding->mode & CODING_MODE_LAST_BLOCK
4460       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4461     {
4462       ASSURE_DESTINATION (safe_room);
4463       ENCODE_RESET_PLANE_AND_REGISTER ();
4464     }
4465   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4466   CODING_ISO_BOL (coding) = bol_designation;
4467   coding->produced_char += produced_chars;
4468   coding->produced = dst - coding->destination;
4469   return 0;
4470 }
4471
4472 \f
4473 /*** 8,9. SJIS and BIG5 handlers ***/
4474
4475 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4476    quite widely.  So, for the moment, Emacs supports them in the bare
4477    C code.  But, in the future, they may be supported only by CCL.  */
4478
4479 /* SJIS is a coding system encoding three character sets: ASCII, right
4480    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4481    as is.  A character of charset katakana-jisx0201 is encoded by
4482    "position-code + 0x80".  A character of charset japanese-jisx0208
4483    is encoded in 2-byte but two position-codes are divided and shifted
4484    so that it fit in the range below.
4485
4486    --- CODE RANGE of SJIS ---
4487    (character set)      (range)
4488    ASCII                0x00 .. 0x7F
4489    KATAKANA-JISX0201    0xA0 .. 0xDF
4490    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4491             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4492    -------------------------------
4493
4494 */
4495
4496 /* BIG5 is a coding system encoding two character sets: ASCII and
4497    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4498    character set and is encoded in two-byte.
4499
4500    --- CODE RANGE of BIG5 ---
4501    (character set)      (range)
4502    ASCII                0x00 .. 0x7F
4503    Big5 (1st byte)      0xA1 .. 0xFE
4504         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4505    --------------------------
4506
4507   */
4508
4509 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4510    Return true if a text is encoded in SJIS.  */
4511
4512 static bool
4513 detect_coding_sjis (struct coding_system *coding,
4514                     struct coding_detection_info *detect_info)
4515 {
4516   const unsigned char *src = coding->source, *src_base;
4517   const unsigned char *src_end = coding->source + coding->src_bytes;
4518   bool multibytep = coding->src_multibyte;
4519   ptrdiff_t consumed_chars = 0;
4520   int found = 0;
4521   int c;
4522   Lisp_Object attrs, charset_list;
4523   int max_first_byte_of_2_byte_code;
4524
4525   CODING_GET_INFO (coding, attrs, charset_list);
4526   max_first_byte_of_2_byte_code
4527     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4528
4529   detect_info->checked |= CATEGORY_MASK_SJIS;
4530   /* A coding system of this category is always ASCII compatible.  */
4531   src += coding->head_ascii;
4532
4533   while (1)
4534     {
4535       src_base = src;
4536       ONE_MORE_BYTE (c);
4537       if (c < 0x80)
4538         continue;
4539       if ((c >= 0x81 && c <= 0x9F)
4540           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4541         {
4542           ONE_MORE_BYTE (c);
4543           if (c < 0x40 || c == 0x7F || c > 0xFC)
4544             break;
4545           found = CATEGORY_MASK_SJIS;
4546         }
4547       else if (c >= 0xA0 && c < 0xE0)
4548         found = CATEGORY_MASK_SJIS;
4549       else
4550         break;
4551     }
4552   detect_info->rejected |= CATEGORY_MASK_SJIS;
4553   return 0;
4554
4555  no_more_source:
4556   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4557     {
4558       detect_info->rejected |= CATEGORY_MASK_SJIS;
4559       return 0;
4560     }
4561   detect_info->found |= found;
4562   return 1;
4563 }
4564
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566    Return true if a text is encoded in BIG5.  */
4567
4568 static bool
4569 detect_coding_big5 (struct coding_system *coding,
4570                     struct coding_detection_info *detect_info)
4571 {
4572   const unsigned char *src = coding->source, *src_base;
4573   const unsigned char *src_end = coding->source + coding->src_bytes;
4574   bool multibytep = coding->src_multibyte;
4575   ptrdiff_t consumed_chars = 0;
4576   int found = 0;
4577   int c;
4578
4579   detect_info->checked |= CATEGORY_MASK_BIG5;
4580   /* A coding system of this category is always ASCII compatible.  */
4581   src += coding->head_ascii;
4582
4583   while (1)
4584     {
4585       src_base = src;
4586       ONE_MORE_BYTE (c);
4587       if (c < 0x80)
4588         continue;
4589       if (c >= 0xA1)
4590         {
4591           ONE_MORE_BYTE (c);
4592           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4593             return 0;
4594           found = CATEGORY_MASK_BIG5;
4595         }
4596       else
4597         break;
4598     }
4599   detect_info->rejected |= CATEGORY_MASK_BIG5;
4600   return 0;
4601
4602  no_more_source:
4603   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4604     {
4605       detect_info->rejected |= CATEGORY_MASK_BIG5;
4606       return 0;
4607     }
4608   detect_info->found |= found;
4609   return 1;
4610 }
4611
4612 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4613
4614 static void
4615 decode_coding_sjis (struct coding_system *coding)
4616 {
4617   const unsigned char *src = coding->source + coding->consumed;
4618   const unsigned char *src_end = coding->source + coding->src_bytes;
4619   const unsigned char *src_base;
4620   int *charbuf = coding->charbuf + coding->charbuf_used;
4621   /* We may produce one charset annotation in one loop and one more at
4622      the end.  */
4623   int *charbuf_end
4624     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4625   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4626   bool multibytep = coding->src_multibyte;
4627   struct charset *charset_roman, *charset_kanji, *charset_kana;
4628   struct charset *charset_kanji2;
4629   Lisp_Object attrs, charset_list, val;
4630   ptrdiff_t char_offset = coding->produced_char;
4631   ptrdiff_t last_offset = char_offset;
4632   int last_id = charset_ascii;
4633   bool eol_dos
4634     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4635   int byte_after_cr = -1;
4636
4637   CODING_GET_INFO (coding, attrs, charset_list);
4638
4639   val = charset_list;
4640   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4641   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4642   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4643   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4644
4645   while (1)
4646     {
4647       int c, c1;
4648       struct charset *charset;
4649
4650       src_base = src;
4651       consumed_chars_base = consumed_chars;
4652
4653       if (charbuf >= charbuf_end)
4654         {
4655           if (byte_after_cr >= 0)
4656             src_base--;
4657           break;
4658         }
4659
4660       if (byte_after_cr >= 0)
4661         c = byte_after_cr, byte_after_cr = -1;
4662       else
4663         ONE_MORE_BYTE (c);
4664       if (c < 0)
4665         goto invalid_code;
4666       if (c < 0x80)
4667         {
4668           if (eol_dos && c == '\r')
4669             ONE_MORE_BYTE (byte_after_cr);
4670           charset = charset_roman;
4671         }
4672       else if (c == 0x80 || c == 0xA0)
4673         goto invalid_code;
4674       else if (c >= 0xA1 && c <= 0xDF)
4675         {
4676           /* SJIS -> JISX0201-Kana */
4677           c &= 0x7F;
4678           charset = charset_kana;
4679         }
4680       else if (c <= 0xEF)
4681         {
4682           /* SJIS -> JISX0208 */
4683           ONE_MORE_BYTE (c1);
4684           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4685             goto invalid_code;
4686           c = (c << 8) | c1;
4687           SJIS_TO_JIS (c);
4688           charset = charset_kanji;
4689         }
4690       else if (c <= 0xFC && charset_kanji2)
4691         {
4692           /* SJIS -> JISX0213-2 */
4693           ONE_MORE_BYTE (c1);
4694           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4695             goto invalid_code;
4696           c = (c << 8) | c1;
4697           SJIS_TO_JIS2 (c);
4698           charset = charset_kanji2;
4699         }
4700       else
4701         goto invalid_code;
4702       if (charset->id != charset_ascii
4703           && last_id != charset->id)
4704         {
4705           if (last_id != charset_ascii)
4706             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4707           last_id = charset->id;
4708           last_offset = char_offset;
4709         }
4710       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4711       *charbuf++ = c;
4712       char_offset++;
4713       continue;
4714
4715     invalid_code:
4716       src = src_base;
4717       consumed_chars = consumed_chars_base;
4718       ONE_MORE_BYTE (c);
4719       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4720       char_offset++;
4721       coding->errors++;
4722     }
4723
4724  no_more_source:
4725   if (last_id != charset_ascii)
4726     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4727   coding->consumed_char += consumed_chars_base;
4728   coding->consumed = src_base - coding->source;
4729   coding->charbuf_used = charbuf - coding->charbuf;
4730 }
4731
4732 static void
4733 decode_coding_big5 (struct coding_system *coding)
4734 {
4735   const unsigned char *src = coding->source + coding->consumed;
4736   const unsigned char *src_end = coding->source + coding->src_bytes;
4737   const unsigned char *src_base;
4738   int *charbuf = coding->charbuf + coding->charbuf_used;
4739   /* We may produce one charset annotation in one loop and one more at
4740      the end.  */
4741   int *charbuf_end
4742     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4743   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4744   bool multibytep = coding->src_multibyte;
4745   struct charset *charset_roman, *charset_big5;
4746   Lisp_Object attrs, charset_list, val;
4747   ptrdiff_t char_offset = coding->produced_char;
4748   ptrdiff_t last_offset = char_offset;
4749   int last_id = charset_ascii;
4750   bool eol_dos
4751     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4752   int byte_after_cr = -1;
4753
4754   CODING_GET_INFO (coding, attrs, charset_list);
4755   val = charset_list;
4756   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4757   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4758
4759   while (1)
4760     {
4761       int c, c1;
4762       struct charset *charset;
4763
4764       src_base = src;
4765       consumed_chars_base = consumed_chars;
4766
4767       if (charbuf >= charbuf_end)
4768         {
4769           if (byte_after_cr >= 0)
4770             src_base--;
4771           break;
4772         }
4773
4774       if (byte_after_cr >= 0)
4775         c = byte_after_cr, byte_after_cr = -1;
4776       else
4777         ONE_MORE_BYTE (c);
4778
4779       if (c < 0)
4780         goto invalid_code;
4781       if (c < 0x80)
4782         {
4783           if (eol_dos && c == '\r')
4784             ONE_MORE_BYTE (byte_after_cr);
4785           charset = charset_roman;
4786         }
4787       else
4788         {
4789           /* BIG5 -> Big5 */
4790           if (c < 0xA1 || c > 0xFE)
4791             goto invalid_code;
4792           ONE_MORE_BYTE (c1);
4793           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4794             goto invalid_code;
4795           c = c << 8 | c1;
4796           charset = charset_big5;
4797         }
4798       if (charset->id != charset_ascii
4799           && last_id != charset->id)
4800         {
4801           if (last_id != charset_ascii)
4802             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4803           last_id = charset->id;
4804           last_offset = char_offset;
4805         }
4806       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4807       *charbuf++ = c;
4808       char_offset++;
4809       continue;
4810
4811     invalid_code:
4812       src = src_base;
4813       consumed_chars = consumed_chars_base;
4814       ONE_MORE_BYTE (c);
4815       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4816       char_offset++;
4817       coding->errors++;
4818     }
4819
4820  no_more_source:
4821   if (last_id != charset_ascii)
4822     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4823   coding->consumed_char += consumed_chars_base;
4824   coding->consumed = src_base - coding->source;
4825   coding->charbuf_used = charbuf - coding->charbuf;
4826 }
4827
4828 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4829    This function can encode charsets `ascii', `katakana-jisx0201',
4830    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4831    are sure that all these charsets are registered as official charset
4832    (i.e. do not have extended leading-codes).  Characters of other
4833    charsets are produced without any encoding.  */
4834
4835 static bool
4836 encode_coding_sjis (struct coding_system *coding)
4837 {
4838   bool multibytep = coding->dst_multibyte;
4839   int *charbuf = coding->charbuf;
4840   int *charbuf_end = charbuf + coding->charbuf_used;
4841   unsigned char *dst = coding->destination + coding->produced;
4842   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4843   int safe_room = 4;
4844   ptrdiff_t produced_chars = 0;
4845   Lisp_Object attrs, charset_list, val;
4846   bool ascii_compatible;
4847   struct charset *charset_kanji, *charset_kana;
4848   struct charset *charset_kanji2;
4849   int c;
4850
4851   CODING_GET_INFO (coding, attrs, charset_list);
4852   val = XCDR (charset_list);
4853   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4854   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4855   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4856
4857   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4858
4859   while (charbuf < charbuf_end)
4860     {
4861       ASSURE_DESTINATION (safe_room);
4862       c = *charbuf++;
4863       /* Now encode the character C.  */
4864       if (ASCII_CHAR_P (c) && ascii_compatible)
4865         EMIT_ONE_ASCII_BYTE (c);
4866       else if (CHAR_BYTE8_P (c))
4867         {
4868           c = CHAR_TO_BYTE8 (c);
4869           EMIT_ONE_BYTE (c);
4870         }
4871       else
4872         {
4873           unsigned code;
4874           struct charset *charset;
4875           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4876                                &code, charset);
4877
4878           if (!charset)
4879             {
4880               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4881                 {
4882                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4883                   charset = CHARSET_FROM_ID (charset_ascii);
4884                 }
4885               else
4886                 {
4887                   c = coding->default_char;
4888                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4889                                        charset_list, &code, charset);
4890                 }
4891             }
4892           if (code == CHARSET_INVALID_CODE (charset))
4893             emacs_abort ();
4894           if (charset == charset_kanji)
4895             {
4896               int c1, c2;
4897               JIS_TO_SJIS (code);
4898               c1 = code >> 8, c2 = code & 0xFF;
4899               EMIT_TWO_BYTES (c1, c2);
4900             }
4901           else if (charset == charset_kana)
4902             EMIT_ONE_BYTE (code | 0x80);
4903           else if (charset_kanji2 && charset == charset_kanji2)
4904             {
4905               int c1, c2;
4906
4907               c1 = code >> 8;
4908               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4909                   || c1 == 0x28
4910                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4911                 {
4912                   JIS_TO_SJIS2 (code);
4913                   c1 = code >> 8, c2 = code & 0xFF;
4914                   EMIT_TWO_BYTES (c1, c2);
4915                 }
4916               else
4917                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4918             }
4919           else
4920             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4921         }
4922     }
4923   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4924   coding->produced_char += produced_chars;
4925   coding->produced = dst - coding->destination;
4926   return 0;
4927 }
4928
4929 static bool
4930 encode_coding_big5 (struct coding_system *coding)
4931 {
4932   bool multibytep = coding->dst_multibyte;
4933   int *charbuf = coding->charbuf;
4934   int *charbuf_end = charbuf + coding->charbuf_used;
4935   unsigned char *dst = coding->destination + coding->produced;
4936   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4937   int safe_room = 4;
4938   ptrdiff_t produced_chars = 0;
4939   Lisp_Object attrs, charset_list, val;
4940   bool ascii_compatible;
4941   struct charset *charset_big5;
4942   int c;
4943
4944   CODING_GET_INFO (coding, attrs, charset_list);
4945   val = XCDR (charset_list);
4946   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4947   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4948
4949   while (charbuf < charbuf_end)
4950     {
4951       ASSURE_DESTINATION (safe_room);
4952       c = *charbuf++;
4953       /* Now encode the character C.  */
4954       if (ASCII_CHAR_P (c) && ascii_compatible)
4955         EMIT_ONE_ASCII_BYTE (c);
4956       else if (CHAR_BYTE8_P (c))
4957         {
4958           c = CHAR_TO_BYTE8 (c);
4959           EMIT_ONE_BYTE (c);
4960         }
4961       else
4962         {
4963           unsigned code;
4964           struct charset *charset;
4965           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4966                                &code, charset);
4967
4968           if (! charset)
4969             {
4970               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4971                 {
4972                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4973                   charset = CHARSET_FROM_ID (charset_ascii);
4974                 }
4975               else
4976                 {
4977                   c = coding->default_char;
4978                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4979                                        charset_list, &code, charset);
4980                 }
4981             }
4982           if (code == CHARSET_INVALID_CODE (charset))
4983             emacs_abort ();
4984           if (charset == charset_big5)
4985             {
4986               int c1, c2;
4987
4988               c1 = code >> 8, c2 = code & 0xFF;
4989               EMIT_TWO_BYTES (c1, c2);
4990             }
4991           else
4992             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4993         }
4994     }
4995   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4996   coding->produced_char += produced_chars;
4997   coding->produced = dst - coding->destination;
4998   return 0;
4999 }
5000
5001 \f
5002 /*** 10. CCL handlers ***/
5003
5004 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5005    Return true if a text is encoded in a coding system of which
5006    encoder/decoder are written in CCL program.  */
5007
5008 static bool
5009 detect_coding_ccl (struct coding_system *coding,
5010                    struct coding_detection_info *detect_info)
5011 {
5012   const unsigned char *src = coding->source, *src_base;
5013   const unsigned char *src_end = coding->source + coding->src_bytes;
5014   bool multibytep = coding->src_multibyte;
5015   ptrdiff_t consumed_chars = 0;
5016   int found = 0;
5017   unsigned char *valids;
5018   ptrdiff_t head_ascii = coding->head_ascii;
5019   Lisp_Object attrs;
5020
5021   detect_info->checked |= CATEGORY_MASK_CCL;
5022
5023   coding = &coding_categories[coding_category_ccl];
5024   valids = CODING_CCL_VALIDS (coding);
5025   attrs = CODING_ID_ATTRS (coding->id);
5026   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5027     src += head_ascii;
5028
5029   while (1)
5030     {
5031       int c;
5032
5033       src_base = src;
5034       ONE_MORE_BYTE (c);
5035       if (c < 0 || ! valids[c])
5036         break;
5037       if ((valids[c] > 1))
5038         found = CATEGORY_MASK_CCL;
5039     }
5040   detect_info->rejected |= CATEGORY_MASK_CCL;
5041   return 0;
5042
5043  no_more_source:
5044   detect_info->found |= found;
5045   return 1;
5046 }
5047
5048 static void
5049 decode_coding_ccl (struct coding_system *coding)
5050 {
5051   const unsigned char *src = coding->source + coding->consumed;
5052   const unsigned char *src_end = coding->source + coding->src_bytes;
5053   int *charbuf = coding->charbuf + coding->charbuf_used;
5054   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5055   ptrdiff_t consumed_chars = 0;
5056   bool multibytep = coding->src_multibyte;
5057   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5058   int source_charbuf[1024];
5059   int source_byteidx[1025];
5060   Lisp_Object attrs, charset_list;
5061
5062   CODING_GET_INFO (coding, attrs, charset_list);
5063
5064   while (1)
5065     {
5066       const unsigned char *p = src;
5067       ptrdiff_t offset;
5068       int i = 0;
5069
5070       if (multibytep)
5071         {
5072           while (i < 1024 && p < src_end)
5073             {
5074               source_byteidx[i] = p - src;
5075               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5076             }
5077           source_byteidx[i] = p - src;
5078         }
5079       else
5080         while (i < 1024 && p < src_end)
5081           source_charbuf[i++] = *p++;
5082
5083       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5084         ccl->last_block = 1;
5085       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5086       charset_map_loaded = 0;
5087       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5088                   charset_list);
5089       if (charset_map_loaded
5090           && (offset = coding_change_source (coding)))
5091         {
5092           p += offset;
5093           src += offset;
5094           src_end += offset;
5095         }
5096       charbuf += ccl->produced;
5097       if (multibytep)
5098         src += source_byteidx[ccl->consumed];
5099       else
5100         src += ccl->consumed;
5101       consumed_chars += ccl->consumed;
5102       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5103         break;
5104     }
5105
5106   switch (ccl->status)
5107     {
5108     case CCL_STAT_SUSPEND_BY_SRC:
5109       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5110       break;
5111     case CCL_STAT_SUSPEND_BY_DST:
5112       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5113       break;
5114     case CCL_STAT_QUIT:
5115     case CCL_STAT_INVALID_CMD:
5116       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5117       break;
5118     default:
5119       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5120       break;
5121     }
5122   coding->consumed_char += consumed_chars;
5123   coding->consumed = src - coding->source;
5124   coding->charbuf_used = charbuf - coding->charbuf;
5125 }
5126
5127 static bool
5128 encode_coding_ccl (struct coding_system *coding)
5129 {
5130   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5131   bool multibytep = coding->dst_multibyte;
5132   int *charbuf = coding->charbuf;
5133   int *charbuf_end = charbuf + coding->charbuf_used;
5134   unsigned char *dst = coding->destination + coding->produced;
5135   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5136   int destination_charbuf[1024];
5137   ptrdiff_t produced_chars = 0;
5138   int i;
5139   Lisp_Object attrs, charset_list;
5140
5141   CODING_GET_INFO (coding, attrs, charset_list);
5142   if (coding->consumed_char == coding->src_chars
5143       && coding->mode & CODING_MODE_LAST_BLOCK)
5144     ccl->last_block = 1;
5145
5146   do
5147     {
5148       ptrdiff_t offset;
5149
5150       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5151       charset_map_loaded = 0;
5152       ccl_driver (ccl, charbuf, destination_charbuf,
5153                   charbuf_end - charbuf, 1024, charset_list);
5154       if (charset_map_loaded
5155           && (offset = coding_change_destination (coding)))
5156         dst += offset;
5157       if (multibytep)
5158         {
5159           ASSURE_DESTINATION (ccl->produced * 2);
5160           for (i = 0; i < ccl->produced; i++)
5161             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5162         }
5163       else
5164         {
5165           ASSURE_DESTINATION (ccl->produced);
5166           for (i = 0; i < ccl->produced; i++)
5167             *dst++ = destination_charbuf[i] & 0xFF;
5168           produced_chars += ccl->produced;
5169         }
5170       charbuf += ccl->consumed;
5171       if (ccl->status == CCL_STAT_QUIT
5172           || ccl->status == CCL_STAT_INVALID_CMD)
5173         break;
5174     }
5175   while (charbuf < charbuf_end);
5176
5177   switch (ccl->status)
5178     {
5179     case CCL_STAT_SUSPEND_BY_SRC:
5180       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5181       break;
5182     case CCL_STAT_SUSPEND_BY_DST:
5183       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5184       break;
5185     case CCL_STAT_QUIT:
5186     case CCL_STAT_INVALID_CMD:
5187       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5188       break;
5189     default:
5190       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5191       break;
5192     }
5193
5194   coding->produced_char += produced_chars;
5195   coding->produced = dst - coding->destination;
5196   return 0;
5197 }
5198
5199 \f
5200 /*** 10, 11. no-conversion handlers ***/
5201
5202 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5203
5204 static void
5205 decode_coding_raw_text (struct coding_system *coding)
5206 {
5207   bool eol_dos
5208     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5209
5210   coding->chars_at_source = 1;
5211   coding->consumed_char = coding->src_chars;
5212   coding->consumed = coding->src_bytes;
5213   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5214     {
5215       coding->consumed_char--;
5216       coding->consumed--;
5217       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5218     }
5219   else
5220     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5221 }
5222
5223 static bool
5224 encode_coding_raw_text (struct coding_system *coding)
5225 {
5226   bool multibytep = coding->dst_multibyte;
5227   int *charbuf = coding->charbuf;
5228   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5229   unsigned char *dst = coding->destination + coding->produced;
5230   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5231   ptrdiff_t produced_chars = 0;
5232   int c;
5233
5234   if (multibytep)
5235     {
5236       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5237
5238       if (coding->src_multibyte)
5239         while (charbuf < charbuf_end)
5240           {
5241             ASSURE_DESTINATION (safe_room);
5242             c = *charbuf++;
5243             if (ASCII_CHAR_P (c))
5244               EMIT_ONE_ASCII_BYTE (c);
5245             else if (CHAR_BYTE8_P (c))
5246               {
5247                 c = CHAR_TO_BYTE8 (c);
5248                 EMIT_ONE_BYTE (c);
5249               }
5250             else
5251               {
5252                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5253
5254                 CHAR_STRING_ADVANCE (c, p1);
5255                 do
5256                   {
5257                     EMIT_ONE_BYTE (*p0);
5258                     p0++;
5259                   }
5260                 while (p0 < p1);
5261               }
5262           }
5263       else
5264         while (charbuf < charbuf_end)
5265           {
5266             ASSURE_DESTINATION (safe_room);
5267             c = *charbuf++;
5268             EMIT_ONE_BYTE (c);
5269           }
5270     }
5271   else
5272     {
5273       if (coding->src_multibyte)
5274         {
5275           int safe_room = MAX_MULTIBYTE_LENGTH;
5276
5277           while (charbuf < charbuf_end)
5278             {
5279               ASSURE_DESTINATION (safe_room);
5280               c = *charbuf++;
5281               if (ASCII_CHAR_P (c))
5282                 *dst++ = c;
5283               else if (CHAR_BYTE8_P (c))
5284                 *dst++ = CHAR_TO_BYTE8 (c);
5285               else
5286                 CHAR_STRING_ADVANCE (c, dst);
5287             }
5288         }
5289       else
5290         {
5291           ASSURE_DESTINATION (charbuf_end - charbuf);
5292           while (charbuf < charbuf_end && dst < dst_end)
5293             *dst++ = *charbuf++;
5294         }
5295       produced_chars = dst - (coding->destination + coding->produced);
5296     }
5297   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5298   coding->produced_char += produced_chars;
5299   coding->produced = dst - coding->destination;
5300   return 0;
5301 }
5302
5303 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5304    Return true if a text is encoded in a charset-based coding system.  */
5305
5306 static bool
5307 detect_coding_charset (struct coding_system *coding,
5308                        struct coding_detection_info *detect_info)
5309 {
5310   const unsigned char *src = coding->source, *src_base;
5311   const unsigned char *src_end = coding->source + coding->src_bytes;
5312   bool multibytep = coding->src_multibyte;
5313   ptrdiff_t consumed_chars = 0;
5314   Lisp_Object attrs, valids, name;
5315   int found = 0;
5316   ptrdiff_t head_ascii = coding->head_ascii;
5317   bool check_latin_extra = 0;
5318
5319   detect_info->checked |= CATEGORY_MASK_CHARSET;
5320
5321   coding = &coding_categories[coding_category_charset];
5322   attrs = CODING_ID_ATTRS (coding->id);
5323   valids = AREF (attrs, coding_attr_charset_valids);
5324   name = CODING_ID_NAME (coding->id);
5325   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5326                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5327       || strncmp (SSDATA (SYMBOL_NAME (name)),
5328                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5329     check_latin_extra = 1;
5330
5331   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5332     src += head_ascii;
5333
5334   while (1)
5335     {
5336       int c;
5337       Lisp_Object val;
5338       struct charset *charset;
5339       int dim, idx;
5340
5341       src_base = src;
5342       ONE_MORE_BYTE (c);
5343       if (c < 0)
5344         continue;
5345       val = AREF (valids, c);
5346       if (NILP (val))
5347         break;
5348       if (c >= 0x80)
5349         {
5350           if (c < 0xA0
5351               && check_latin_extra
5352               && (!VECTORP (Vlatin_extra_code_table)
5353                   || NILP (AREF (Vlatin_extra_code_table, c))))
5354             break;
5355           found = CATEGORY_MASK_CHARSET;
5356         }
5357       if (INTEGERP (val))
5358         {
5359           charset = CHARSET_FROM_ID (XFASTINT (val));
5360           dim = CHARSET_DIMENSION (charset);
5361           for (idx = 1; idx < dim; idx++)
5362             {
5363               if (src == src_end)
5364                 goto too_short;
5365               ONE_MORE_BYTE (c);
5366               if (c < charset->code_space[(dim - 1 - idx) * 4]
5367                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5368                 break;
5369             }
5370           if (idx < dim)
5371             break;
5372         }
5373       else
5374         {
5375           idx = 1;
5376           for (; CONSP (val); val = XCDR (val))
5377             {
5378               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5379               dim = CHARSET_DIMENSION (charset);
5380               while (idx < dim)
5381                 {
5382                   if (src == src_end)
5383                     goto too_short;
5384                   ONE_MORE_BYTE (c);
5385                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5386                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5387                     break;
5388                   idx++;
5389                 }
5390               if (idx == dim)
5391                 {
5392                   val = Qnil;
5393                   break;
5394                 }
5395             }
5396           if (CONSP (val))
5397             break;
5398         }
5399     }
5400  too_short:
5401   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5402   return 0;
5403
5404  no_more_source:
5405   detect_info->found |= found;
5406   return 1;
5407 }
5408
5409 static void
5410 decode_coding_charset (struct coding_system *coding)
5411 {
5412   const unsigned char *src = coding->source + coding->consumed;
5413   const unsigned char *src_end = coding->source + coding->src_bytes;
5414   const unsigned char *src_base;
5415   int *charbuf = coding->charbuf + coding->charbuf_used;
5416   /* We may produce one charset annotation in one loop and one more at
5417      the end.  */
5418   int *charbuf_end
5419     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5420   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5421   bool multibytep = coding->src_multibyte;
5422   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5423   Lisp_Object valids;
5424   ptrdiff_t char_offset = coding->produced_char;
5425   ptrdiff_t last_offset = char_offset;
5426   int last_id = charset_ascii;
5427   bool eol_dos
5428     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5429   int byte_after_cr = -1;
5430
5431   valids = AREF (attrs, coding_attr_charset_valids);
5432
5433   while (1)
5434     {
5435       int c;
5436       Lisp_Object val;
5437       struct charset *charset;
5438       int dim;
5439       int len = 1;
5440       unsigned code;
5441
5442       src_base = src;
5443       consumed_chars_base = consumed_chars;
5444
5445       if (charbuf >= charbuf_end)
5446         {
5447           if (byte_after_cr >= 0)
5448             src_base--;
5449           break;
5450         }
5451
5452       if (byte_after_cr >= 0)
5453         {
5454           c = byte_after_cr;
5455           byte_after_cr = -1;
5456         }
5457       else
5458         {
5459           ONE_MORE_BYTE (c);
5460           if (eol_dos && c == '\r')
5461             ONE_MORE_BYTE (byte_after_cr);
5462         }
5463       if (c < 0)
5464         goto invalid_code;
5465       code = c;
5466
5467       val = AREF (valids, c);
5468       if (! INTEGERP (val) && ! CONSP (val))
5469         goto invalid_code;
5470       if (INTEGERP (val))
5471         {
5472           charset = CHARSET_FROM_ID (XFASTINT (val));
5473           dim = CHARSET_DIMENSION (charset);
5474           while (len < dim)
5475             {
5476               ONE_MORE_BYTE (c);
5477               code = (code << 8) | c;
5478               len++;
5479             }
5480           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5481                               charset, code, c);
5482         }
5483       else
5484         {
5485           /* VAL is a list of charset IDs.  It is assured that the
5486              list is sorted by charset dimensions (smaller one
5487              comes first).  */
5488           while (CONSP (val))
5489             {
5490               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5491               dim = CHARSET_DIMENSION (charset);
5492               while (len < dim)
5493                 {
5494                   ONE_MORE_BYTE (c);
5495                   code = (code << 8) | c;
5496                   len++;
5497                 }
5498               CODING_DECODE_CHAR (coding, src, src_base,
5499                                   src_end, charset, code, c);
5500               if (c >= 0)
5501                 break;
5502               val = XCDR (val);
5503             }
5504         }
5505       if (c < 0)
5506         goto invalid_code;
5507       if (charset->id != charset_ascii
5508           && last_id != charset->id)
5509         {
5510           if (last_id != charset_ascii)
5511             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5512           last_id = charset->id;
5513           last_offset = char_offset;
5514         }
5515
5516       *charbuf++ = c;
5517       char_offset++;
5518       continue;
5519
5520     invalid_code:
5521       src = src_base;
5522       consumed_chars = consumed_chars_base;
5523       ONE_MORE_BYTE (c);
5524       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5525       char_offset++;
5526       coding->errors++;
5527     }
5528
5529  no_more_source:
5530   if (last_id != charset_ascii)
5531     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5532   coding->consumed_char += consumed_chars_base;
5533   coding->consumed = src_base - coding->source;
5534   coding->charbuf_used = charbuf - coding->charbuf;
5535 }
5536
5537 static bool
5538 encode_coding_charset (struct coding_system *coding)
5539 {
5540   bool multibytep = coding->dst_multibyte;
5541   int *charbuf = coding->charbuf;
5542   int *charbuf_end = charbuf + coding->charbuf_used;
5543   unsigned char *dst = coding->destination + coding->produced;
5544   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5545   int safe_room = MAX_MULTIBYTE_LENGTH;
5546   ptrdiff_t produced_chars = 0;
5547   Lisp_Object attrs, charset_list;
5548   bool ascii_compatible;
5549   int c;
5550
5551   CODING_GET_INFO (coding, attrs, charset_list);
5552   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5553
5554   while (charbuf < charbuf_end)
5555     {
5556       struct charset *charset;
5557       unsigned code;
5558
5559       ASSURE_DESTINATION (safe_room);
5560       c = *charbuf++;
5561       if (ascii_compatible && ASCII_CHAR_P (c))
5562         EMIT_ONE_ASCII_BYTE (c);
5563       else if (CHAR_BYTE8_P (c))
5564         {
5565           c = CHAR_TO_BYTE8 (c);
5566           EMIT_ONE_BYTE (c);
5567         }
5568       else
5569         {
5570           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5571                                &code, charset);
5572
5573           if (charset)
5574             {
5575               if (CHARSET_DIMENSION (charset) == 1)
5576                 EMIT_ONE_BYTE (code);
5577               else if (CHARSET_DIMENSION (charset) == 2)
5578                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5579               else if (CHARSET_DIMENSION (charset) == 3)
5580                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5581               else
5582                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5583                                  (code >> 8) & 0xFF, code & 0xFF);
5584             }
5585           else
5586             {
5587               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5588                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5589               else
5590                 c = coding->default_char;
5591               EMIT_ONE_BYTE (c);
5592             }
5593         }
5594     }
5595
5596   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5597   coding->produced_char += produced_chars;
5598   coding->produced = dst - coding->destination;
5599   return 0;
5600 }
5601
5602 \f
5603 /*** 7. C library functions ***/
5604
5605 /* Setup coding context CODING from information about CODING_SYSTEM.
5606    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5607    CODING_SYSTEM is invalid, signal an error.  */
5608
5609 void
5610 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5611 {
5612   Lisp_Object attrs;
5613   Lisp_Object eol_type;
5614   Lisp_Object coding_type;
5615   Lisp_Object val;
5616
5617   if (NILP (coding_system))
5618     coding_system = Qundecided;
5619
5620   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5621
5622   attrs = CODING_ID_ATTRS (coding->id);
5623   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5624
5625   coding->mode = 0;
5626   coding->head_ascii = -1;
5627   if (VECTORP (eol_type))
5628     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5629                             | CODING_REQUIRE_DETECTION_MASK);
5630   else if (! EQ (eol_type, Qunix))
5631     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5632                             | CODING_REQUIRE_ENCODING_MASK);
5633   else
5634     coding->common_flags = 0;
5635   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5636     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5637   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5638     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5639   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5640     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5641
5642   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5643   coding->max_charset_id = SCHARS (val) - 1;
5644   coding->safe_charsets = SDATA (val);
5645   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5646   coding->carryover_bytes = 0;
5647
5648   coding_type = CODING_ATTR_TYPE (attrs);
5649   if (EQ (coding_type, Qundecided))
5650     {
5651       coding->detector = NULL;
5652       coding->decoder = decode_coding_raw_text;
5653       coding->encoder = encode_coding_raw_text;
5654       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5655     }
5656   else if (EQ (coding_type, Qiso_2022))
5657     {
5658       int i;
5659       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5660
5661       /* Invoke graphic register 0 to plane 0.  */
5662       CODING_ISO_INVOCATION (coding, 0) = 0;
5663       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5664       CODING_ISO_INVOCATION (coding, 1)
5665         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5666       /* Setup the initial status of designation.  */
5667       for (i = 0; i < 4; i++)
5668         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5669       /* Not single shifting initially.  */
5670       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5671       /* Beginning of buffer should also be regarded as bol. */
5672       CODING_ISO_BOL (coding) = 1;
5673       coding->detector = detect_coding_iso_2022;
5674       coding->decoder = decode_coding_iso_2022;
5675       coding->encoder = encode_coding_iso_2022;
5676       if (flags & CODING_ISO_FLAG_SAFE)
5677         coding->mode |= CODING_MODE_SAFE_ENCODING;
5678       coding->common_flags
5679         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5680             | CODING_REQUIRE_FLUSHING_MASK);
5681       if (flags & CODING_ISO_FLAG_COMPOSITION)
5682         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5683       if (flags & CODING_ISO_FLAG_DESIGNATION)
5684         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5685       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5686         {
5687           setup_iso_safe_charsets (attrs);
5688           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5689           coding->max_charset_id = SCHARS (val) - 1;
5690           coding->safe_charsets = SDATA (val);
5691         }
5692       CODING_ISO_FLAGS (coding) = flags;
5693       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5694       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5695       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5696       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5697     }
5698   else if (EQ (coding_type, Qcharset))
5699     {
5700       coding->detector = detect_coding_charset;
5701       coding->decoder = decode_coding_charset;
5702       coding->encoder = encode_coding_charset;
5703       coding->common_flags
5704         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5705     }
5706   else if (EQ (coding_type, Qutf_8))
5707     {
5708       val = AREF (attrs, coding_attr_utf_bom);
5709       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5710                                    : EQ (val, Qt) ? utf_with_bom
5711                                    : utf_without_bom);
5712       coding->detector = detect_coding_utf_8;
5713       coding->decoder = decode_coding_utf_8;
5714       coding->encoder = encode_coding_utf_8;
5715       coding->common_flags
5716         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5717       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5718         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5719     }
5720   else if (EQ (coding_type, Qutf_16))
5721     {
5722       val = AREF (attrs, coding_attr_utf_bom);
5723       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5724                                     : EQ (val, Qt) ? utf_with_bom
5725                                     : utf_without_bom);
5726       val = AREF (attrs, coding_attr_utf_16_endian);
5727       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5728                                        : utf_16_little_endian);
5729       CODING_UTF_16_SURROGATE (coding) = 0;
5730       coding->detector = detect_coding_utf_16;
5731       coding->decoder = decode_coding_utf_16;
5732       coding->encoder = encode_coding_utf_16;
5733       coding->common_flags
5734         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5735       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5736         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5737     }
5738   else if (EQ (coding_type, Qccl))
5739     {
5740       coding->detector = detect_coding_ccl;
5741       coding->decoder = decode_coding_ccl;
5742       coding->encoder = encode_coding_ccl;
5743       coding->common_flags
5744         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5745             | CODING_REQUIRE_FLUSHING_MASK);
5746     }
5747   else if (EQ (coding_type, Qemacs_mule))
5748     {
5749       coding->detector = detect_coding_emacs_mule;
5750       coding->decoder = decode_coding_emacs_mule;
5751       coding->encoder = encode_coding_emacs_mule;
5752       coding->common_flags
5753         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5754       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5755           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5756         {
5757           Lisp_Object tail, safe_charsets;
5758           int max_charset_id = 0;
5759
5760           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5761                tail = XCDR (tail))
5762             if (max_charset_id < XFASTINT (XCAR (tail)))
5763               max_charset_id = XFASTINT (XCAR (tail));
5764           safe_charsets = make_uninit_string (max_charset_id + 1);
5765           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5766           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5767                tail = XCDR (tail))
5768             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5769           coding->max_charset_id = max_charset_id;
5770           coding->safe_charsets = SDATA (safe_charsets);
5771         }
5772       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5773       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5774     }
5775   else if (EQ (coding_type, Qshift_jis))
5776     {
5777       coding->detector = detect_coding_sjis;
5778       coding->decoder = decode_coding_sjis;
5779       coding->encoder = encode_coding_sjis;
5780       coding->common_flags
5781         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5782     }
5783   else if (EQ (coding_type, Qbig5))
5784     {
5785       coding->detector = detect_coding_big5;
5786       coding->decoder = decode_coding_big5;
5787       coding->encoder = encode_coding_big5;
5788       coding->common_flags
5789         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5790     }
5791   else                          /* EQ (coding_type, Qraw_text) */
5792     {
5793       coding->detector = NULL;
5794       coding->decoder = decode_coding_raw_text;
5795       coding->encoder = encode_coding_raw_text;
5796       if (! EQ (eol_type, Qunix))
5797         {
5798           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5799           if (! VECTORP (eol_type))
5800             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5801         }
5802
5803     }
5804
5805   return;
5806 }
5807
5808 /* Return a list of charsets supported by CODING.  */
5809
5810 Lisp_Object
5811 coding_charset_list (struct coding_system *coding)
5812 {
5813   Lisp_Object attrs, charset_list;
5814
5815   CODING_GET_INFO (coding, attrs, charset_list);
5816   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5817     {
5818       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5819
5820       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5821         charset_list = Viso_2022_charset_list;
5822     }
5823   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5824     {
5825       charset_list = Vemacs_mule_charset_list;
5826     }
5827   return charset_list;
5828 }
5829
5830
5831 /* Return a list of charsets supported by CODING-SYSTEM.  */
5832
5833 Lisp_Object
5834 coding_system_charset_list (Lisp_Object coding_system)
5835 {
5836   ptrdiff_t id;
5837   Lisp_Object attrs, charset_list;
5838
5839   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5840   attrs = CODING_ID_ATTRS (id);
5841
5842   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5843     {
5844       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5845
5846       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5847         charset_list = Viso_2022_charset_list;
5848       else
5849         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5850     }
5851   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5852     {
5853       charset_list = Vemacs_mule_charset_list;
5854     }
5855   else
5856     {
5857       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5858     }
5859   return charset_list;
5860 }
5861
5862
5863 /* Return raw-text or one of its subsidiaries that has the same
5864    eol_type as CODING-SYSTEM.  */
5865
5866 Lisp_Object
5867 raw_text_coding_system (Lisp_Object coding_system)
5868 {
5869   Lisp_Object spec, attrs;
5870   Lisp_Object eol_type, raw_text_eol_type;
5871
5872   if (NILP (coding_system))
5873     return Qraw_text;
5874   spec = CODING_SYSTEM_SPEC (coding_system);
5875   attrs = AREF (spec, 0);
5876
5877   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5878     return coding_system;
5879
5880   eol_type = AREF (spec, 2);
5881   if (VECTORP (eol_type))
5882     return Qraw_text;
5883   spec = CODING_SYSTEM_SPEC (Qraw_text);
5884   raw_text_eol_type = AREF (spec, 2);
5885   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5886           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5887           : AREF (raw_text_eol_type, 2));
5888 }
5889
5890
5891 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5892    the subsidiary that has the same eol-spec as PARENT (if it is not
5893    nil and specifies end-of-line format) or the system's setting
5894    (system_eol_type).  */
5895
5896 Lisp_Object
5897 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5898 {
5899   Lisp_Object spec, eol_type;
5900
5901   if (NILP (coding_system))
5902     coding_system = Qraw_text;
5903   spec = CODING_SYSTEM_SPEC (coding_system);
5904   eol_type = AREF (spec, 2);
5905   if (VECTORP (eol_type))
5906     {
5907       Lisp_Object parent_eol_type;
5908
5909       if (! NILP (parent))
5910         {
5911           Lisp_Object parent_spec;
5912
5913           parent_spec = CODING_SYSTEM_SPEC (parent);
5914           parent_eol_type = AREF (parent_spec, 2);
5915           if (VECTORP (parent_eol_type))
5916             parent_eol_type = system_eol_type;
5917         }
5918       else
5919         parent_eol_type = system_eol_type;
5920       if (EQ (parent_eol_type, Qunix))
5921         coding_system = AREF (eol_type, 0);
5922       else if (EQ (parent_eol_type, Qdos))
5923         coding_system = AREF (eol_type, 1);
5924       else if (EQ (parent_eol_type, Qmac))
5925         coding_system = AREF (eol_type, 2);
5926     }
5927   return coding_system;
5928 }
5929
5930
5931 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5932    decided for writing to a process.  If not, complement them, and
5933    return a new coding system.  */
5934
5935 Lisp_Object
5936 complement_process_encoding_system (Lisp_Object coding_system)
5937 {
5938   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5939   Lisp_Object spec, attrs;
5940   int i;
5941
5942   for (i = 0; i < 3; i++)
5943     {
5944       if (i == 1)
5945         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5946       else if (i == 2)
5947         coding_system = preferred_coding_system ();
5948       spec = CODING_SYSTEM_SPEC (coding_system);
5949       if (NILP (spec))
5950         continue;
5951       attrs = AREF (spec, 0);
5952       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5953         coding_base = CODING_ATTR_BASE_NAME (attrs);
5954       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5955         eol_base = coding_system;
5956       if (! NILP (coding_base) && ! NILP (eol_base))
5957         break;
5958     }
5959
5960   if (i > 0)
5961     /* The original CODING_SYSTEM didn't specify text-conversion or
5962        eol-conversion.  Be sure that we return a fully complemented
5963        coding system.  */
5964     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5965   return coding_system;
5966 }
5967
5968
5969 /* Emacs has a mechanism to automatically detect a coding system if it
5970    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5971    it's impossible to distinguish some coding systems accurately
5972    because they use the same range of codes.  So, at first, coding
5973    systems are categorized into 7, those are:
5974
5975    o coding-category-emacs-mule
5976
5977         The category for a coding system which has the same code range
5978         as Emacs' internal format.  Assigned the coding-system (Lisp
5979         symbol) `emacs-mule' by default.
5980
5981    o coding-category-sjis
5982
5983         The category for a coding system which has the same code range
5984         as SJIS.  Assigned the coding-system (Lisp
5985         symbol) `japanese-shift-jis' by default.
5986
5987    o coding-category-iso-7
5988
5989         The category for a coding system which has the same code range
5990         as ISO2022 of 7-bit environment.  This doesn't use any locking
5991         shift and single shift functions.  This can encode/decode all
5992         charsets.  Assigned the coding-system (Lisp symbol)
5993         `iso-2022-7bit' by default.
5994
5995    o coding-category-iso-7-tight
5996
5997         Same as coding-category-iso-7 except that this can
5998         encode/decode only the specified charsets.
5999
6000    o coding-category-iso-8-1
6001
6002         The category for a coding system which has the same code range
6003         as ISO2022 of 8-bit environment and graphic plane 1 used only
6004         for DIMENSION1 charset.  This doesn't use any locking shift
6005         and single shift functions.  Assigned the coding-system (Lisp
6006         symbol) `iso-latin-1' by default.
6007
6008    o coding-category-iso-8-2
6009
6010         The category for a coding system which has the same code range
6011         as ISO2022 of 8-bit environment and graphic plane 1 used only
6012         for DIMENSION2 charset.  This doesn't use any locking shift
6013         and single shift functions.  Assigned the coding-system (Lisp
6014         symbol) `japanese-iso-8bit' by default.
6015
6016    o coding-category-iso-7-else
6017
6018         The category for a coding system which has the same code range
6019         as ISO2022 of 7-bit environment but uses locking shift or
6020         single shift functions.  Assigned the coding-system (Lisp
6021         symbol) `iso-2022-7bit-lock' by default.
6022
6023    o coding-category-iso-8-else
6024
6025         The category for a coding system which has the same code range
6026         as ISO2022 of 8-bit environment but uses locking shift or
6027         single shift functions.  Assigned the coding-system (Lisp
6028         symbol) `iso-2022-8bit-ss2' by default.
6029
6030    o coding-category-big5
6031
6032         The category for a coding system which has the same code range
6033         as BIG5.  Assigned the coding-system (Lisp symbol)
6034         `cn-big5' by default.
6035
6036    o coding-category-utf-8
6037
6038         The category for a coding system which has the same code range
6039         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6040         symbol) `utf-8' by default.
6041
6042    o coding-category-utf-16-be
6043
6044         The category for a coding system in which a text has an
6045         Unicode signature (cf. Unicode Standard) in the order of BIG
6046         endian at the head.  Assigned the coding-system (Lisp symbol)
6047         `utf-16-be' by default.
6048
6049    o coding-category-utf-16-le
6050
6051         The category for a coding system in which a text has an
6052         Unicode signature (cf. Unicode Standard) in the order of
6053         LITTLE endian at the head.  Assigned the coding-system (Lisp
6054         symbol) `utf-16-le' by default.
6055
6056    o coding-category-ccl
6057
6058         The category for a coding system of which encoder/decoder is
6059         written in CCL programs.  The default value is nil, i.e., no
6060         coding system is assigned.
6061
6062    o coding-category-binary
6063
6064         The category for a coding system not categorized in any of the
6065         above.  Assigned the coding-system (Lisp symbol)
6066         `no-conversion' by default.
6067
6068    Each of them is a Lisp symbol and the value is an actual
6069    `coding-system's (this is also a Lisp symbol) assigned by a user.
6070    What Emacs does actually is to detect a category of coding system.
6071    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6072    decide only one possible category, it selects a category of the
6073    highest priority.  Priorities of categories are also specified by a
6074    user in a Lisp variable `coding-category-list'.
6075
6076 */
6077
6078 #define EOL_SEEN_NONE   0
6079 #define EOL_SEEN_LF     1
6080 #define EOL_SEEN_CR     2
6081 #define EOL_SEEN_CRLF   4
6082
6083 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6084    SOURCE is encoded.  If CATEGORY is one of
6085    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6086    two-byte, else they are encoded by one-byte.
6087
6088    Return one of EOL_SEEN_XXX.  */
6089
6090 #define MAX_EOL_CHECK_COUNT 3
6091
6092 static int
6093 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6094             enum coding_category category)
6095 {
6096   const unsigned char *src = source, *src_end = src + src_bytes;
6097   unsigned char c;
6098   int total  = 0;
6099   int eol_seen = EOL_SEEN_NONE;
6100
6101   if ((1 << category) & CATEGORY_MASK_UTF_16)
6102     {
6103       bool msb = category == (coding_category_utf_16_le
6104                               | coding_category_utf_16_le_nosig);
6105       bool lsb = !msb;
6106
6107       while (src + 1 < src_end)
6108         {
6109           c = src[lsb];
6110           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6111             {
6112               int this_eol;
6113
6114               if (c == '\n')
6115                 this_eol = EOL_SEEN_LF;
6116               else if (src + 3 >= src_end
6117                        || src[msb + 2] != 0
6118                        || src[lsb + 2] != '\n')
6119                 this_eol = EOL_SEEN_CR;
6120               else
6121                 {
6122                   this_eol = EOL_SEEN_CRLF;
6123                   src += 2;
6124                 }
6125
6126               if (eol_seen == EOL_SEEN_NONE)
6127                 /* This is the first end-of-line.  */
6128                 eol_seen = this_eol;
6129               else if (eol_seen != this_eol)
6130                 {
6131                   /* The found type is different from what found before.
6132                      Allow for stray ^M characters in DOS EOL files.  */
6133                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6134                       || (eol_seen == EOL_SEEN_CRLF
6135                           && this_eol == EOL_SEEN_CR))
6136                     eol_seen = EOL_SEEN_CRLF;
6137                   else
6138                     {
6139                       eol_seen = EOL_SEEN_LF;
6140                       break;
6141                     }
6142                 }
6143               if (++total == MAX_EOL_CHECK_COUNT)
6144                 break;
6145             }
6146           src += 2;
6147         }
6148     }
6149   else
6150     while (src < src_end)
6151       {
6152         c = *src++;
6153         if (c == '\n' || c == '\r')
6154           {
6155             int this_eol;
6156
6157             if (c == '\n')
6158               this_eol = EOL_SEEN_LF;
6159             else if (src >= src_end || *src != '\n')
6160               this_eol = EOL_SEEN_CR;
6161             else
6162               this_eol = EOL_SEEN_CRLF, src++;
6163
6164             if (eol_seen == EOL_SEEN_NONE)
6165               /* This is the first end-of-line.  */
6166               eol_seen = this_eol;
6167             else if (eol_seen != this_eol)
6168               {
6169                 /* The found type is different from what found before.
6170                    Allow for stray ^M characters in DOS EOL files.  */
6171                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6172                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6173                   eol_seen = EOL_SEEN_CRLF;
6174                 else
6175                   {
6176                     eol_seen = EOL_SEEN_LF;
6177                     break;
6178                   }
6179               }
6180             if (++total == MAX_EOL_CHECK_COUNT)
6181               break;
6182           }
6183       }
6184   return eol_seen;
6185 }
6186
6187
6188 static Lisp_Object
6189 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6190 {
6191   Lisp_Object eol_type;
6192
6193   eol_type = CODING_ID_EOL_TYPE (coding->id);
6194   if (eol_seen & EOL_SEEN_LF)
6195     {
6196       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6197       eol_type = Qunix;
6198     }
6199   else if (eol_seen & EOL_SEEN_CRLF)
6200     {
6201       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6202       eol_type = Qdos;
6203     }
6204   else if (eol_seen & EOL_SEEN_CR)
6205     {
6206       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6207       eol_type = Qmac;
6208     }
6209   return eol_type;
6210 }
6211
6212 /* Detect how a text specified in CODING is encoded.  If a coding
6213    system is detected, update fields of CODING by the detected coding
6214    system.  */
6215
6216 static void
6217 detect_coding (struct coding_system *coding)
6218 {
6219   const unsigned char *src, *src_end;
6220   unsigned int saved_mode = coding->mode;
6221
6222   coding->consumed = coding->consumed_char = 0;
6223   coding->produced = coding->produced_char = 0;
6224   coding_set_source (coding);
6225
6226   src_end = coding->source + coding->src_bytes;
6227   coding->head_ascii = 0;
6228
6229   /* If we have not yet decided the text encoding type, detect it
6230      now.  */
6231   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6232     {
6233       int c, i;
6234       struct coding_detection_info detect_info;
6235       bool null_byte_found = 0, eight_bit_found = 0;
6236
6237       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6238       for (src = coding->source; src < src_end; src++)
6239         {
6240           c = *src;
6241           if (c & 0x80)
6242             {
6243               eight_bit_found = 1;
6244               if (null_byte_found)
6245                 break;
6246             }
6247           else if (c < 0x20)
6248             {
6249               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6250                   && ! inhibit_iso_escape_detection
6251                   && ! detect_info.checked)
6252                 {
6253                   if (detect_coding_iso_2022 (coding, &detect_info))
6254                     {
6255                       /* We have scanned the whole data.  */
6256                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6257                         {
6258                           /* We didn't find an 8-bit code.  We may
6259                              have found a null-byte, but it's very
6260                              rare that a binary file conforms to
6261                              ISO-2022.  */
6262                           src = src_end;
6263                           coding->head_ascii = src - coding->source;
6264                         }
6265                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6266                       break;
6267                     }
6268                 }
6269               else if (! c && !inhibit_null_byte_detection)
6270                 {
6271                   null_byte_found = 1;
6272                   if (eight_bit_found)
6273                     break;
6274                 }
6275               if (! eight_bit_found)
6276                 coding->head_ascii++;
6277             }
6278           else if (! eight_bit_found)
6279             coding->head_ascii++;
6280         }
6281
6282       if (null_byte_found || eight_bit_found
6283           || coding->head_ascii < coding->src_bytes
6284           || detect_info.found)
6285         {
6286           enum coding_category category;
6287           struct coding_system *this;
6288
6289           if (coding->head_ascii == coding->src_bytes)
6290             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6291             for (i = 0; i < coding_category_raw_text; i++)
6292               {
6293                 category = coding_priorities[i];
6294                 this = coding_categories + category;
6295                 if (detect_info.found & (1 << category))
6296                   break;
6297               }
6298           else
6299             {
6300               if (null_byte_found)
6301                 {
6302                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6303                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6304                 }
6305               for (i = 0; i < coding_category_raw_text; i++)
6306                 {
6307                   category = coding_priorities[i];
6308                   this = coding_categories + category;
6309                   /* Some of this->detector (e.g. detect_coding_sjis)
6310                      require this information.  */
6311                   coding->id = this->id;
6312                   if (this->id < 0)
6313                     {
6314                       /* No coding system of this category is defined.  */
6315                       detect_info.rejected |= (1 << category);
6316                     }
6317                   else if (category >= coding_category_raw_text)
6318                     continue;
6319                   else if (detect_info.checked & (1 << category))
6320                     {
6321                       if (detect_info.found & (1 << category))
6322                         break;
6323                     }
6324                   else if ((*(this->detector)) (coding, &detect_info)
6325                            && detect_info.found & (1 << category))
6326                     {
6327                       if (category == coding_category_utf_16_auto)
6328                         {
6329                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6330                             category = coding_category_utf_16_le;
6331                           else
6332                             category = coding_category_utf_16_be;
6333                         }
6334                       break;
6335                     }
6336                 }
6337             }
6338
6339           if (i < coding_category_raw_text)
6340             setup_coding_system (CODING_ID_NAME (this->id), coding);
6341           else if (null_byte_found)
6342             setup_coding_system (Qno_conversion, coding);
6343           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6344                    == CATEGORY_MASK_ANY)
6345             setup_coding_system (Qraw_text, coding);
6346           else if (detect_info.rejected)
6347             for (i = 0; i < coding_category_raw_text; i++)
6348               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6349                 {
6350                   this = coding_categories + coding_priorities[i];
6351                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6352                   break;
6353                 }
6354         }
6355     }
6356   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6357            == coding_category_utf_8_auto)
6358     {
6359       Lisp_Object coding_systems;
6360       struct coding_detection_info detect_info;
6361
6362       coding_systems
6363         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6364       detect_info.found = detect_info.rejected = 0;
6365       coding->head_ascii = 0;
6366       if (CONSP (coding_systems)
6367           && detect_coding_utf_8 (coding, &detect_info))
6368         {
6369           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6370             setup_coding_system (XCAR (coding_systems), coding);
6371           else
6372             setup_coding_system (XCDR (coding_systems), coding);
6373         }
6374     }
6375   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6376            == coding_category_utf_16_auto)
6377     {
6378       Lisp_Object coding_systems;
6379       struct coding_detection_info detect_info;
6380
6381       coding_systems
6382         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6383       detect_info.found = detect_info.rejected = 0;
6384       coding->head_ascii = 0;
6385       if (CONSP (coding_systems)
6386           && detect_coding_utf_16 (coding, &detect_info))
6387         {
6388           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6389             setup_coding_system (XCAR (coding_systems), coding);
6390           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6391             setup_coding_system (XCDR (coding_systems), coding);
6392         }
6393     }
6394   coding->mode = saved_mode;
6395 }
6396
6397
6398 static void
6399 decode_eol (struct coding_system *coding)
6400 {
6401   Lisp_Object eol_type;
6402   unsigned char *p, *pbeg, *pend;
6403
6404   eol_type = CODING_ID_EOL_TYPE (coding->id);
6405   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6406     return;
6407
6408   if (NILP (coding->dst_object))
6409     pbeg = coding->destination;
6410   else
6411     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6412   pend = pbeg + coding->produced;
6413
6414   if (VECTORP (eol_type))
6415     {
6416       int eol_seen = EOL_SEEN_NONE;
6417
6418       for (p = pbeg; p < pend; p++)
6419         {
6420           if (*p == '\n')
6421             eol_seen |= EOL_SEEN_LF;
6422           else if (*p == '\r')
6423             {
6424               if (p + 1 < pend && *(p + 1) == '\n')
6425                 {
6426                   eol_seen |= EOL_SEEN_CRLF;
6427                   p++;
6428                 }
6429               else
6430                 eol_seen |= EOL_SEEN_CR;
6431             }
6432         }
6433       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6434       if ((eol_seen & EOL_SEEN_CRLF) != 0
6435           && (eol_seen & EOL_SEEN_CR) != 0
6436           && (eol_seen & EOL_SEEN_LF) == 0)
6437         eol_seen = EOL_SEEN_CRLF;
6438       else if (eol_seen != EOL_SEEN_NONE
6439           && eol_seen != EOL_SEEN_LF
6440           && eol_seen != EOL_SEEN_CRLF
6441           && eol_seen != EOL_SEEN_CR)
6442         eol_seen = EOL_SEEN_LF;
6443       if (eol_seen != EOL_SEEN_NONE)
6444         eol_type = adjust_coding_eol_type (coding, eol_seen);
6445     }
6446
6447   if (EQ (eol_type, Qmac))
6448     {
6449       for (p = pbeg; p < pend; p++)
6450         if (*p == '\r')
6451           *p = '\n';
6452     }
6453   else if (EQ (eol_type, Qdos))
6454     {
6455       ptrdiff_t n = 0;
6456
6457       if (NILP (coding->dst_object))
6458         {
6459           /* Start deleting '\r' from the tail to minimize the memory
6460              movement.  */
6461           for (p = pend - 2; p >= pbeg; p--)
6462             if (*p == '\r')
6463               {
6464                 memmove (p, p + 1, pend-- - p - 1);
6465                 n++;
6466               }
6467         }
6468       else
6469         {
6470           ptrdiff_t pos_byte = coding->dst_pos_byte;
6471           ptrdiff_t pos = coding->dst_pos;
6472           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6473
6474           while (pos < pos_end)
6475             {
6476               p = BYTE_POS_ADDR (pos_byte);
6477               if (*p == '\r' && p[1] == '\n')
6478                 {
6479                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6480                   n++;
6481                   pos_end--;
6482                 }
6483               pos++;
6484               if (coding->dst_multibyte)
6485                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6486               else
6487                 pos_byte++;
6488             }
6489         }
6490       coding->produced -= n;
6491       coding->produced_char -= n;
6492     }
6493 }
6494
6495
6496 /* Return a translation table (or list of them) from coding system
6497    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6498    not ENCODEP). */
6499
6500 static Lisp_Object
6501 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6502 {
6503   Lisp_Object standard, translation_table;
6504   Lisp_Object val;
6505
6506   if (NILP (Venable_character_translation))
6507     {
6508       if (max_lookup)
6509         *max_lookup = 0;
6510       return Qnil;
6511     }
6512   if (encodep)
6513     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6514       standard = Vstandard_translation_table_for_encode;
6515   else
6516     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6517       standard = Vstandard_translation_table_for_decode;
6518   if (NILP (translation_table))
6519     translation_table = standard;
6520   else
6521     {
6522       if (SYMBOLP (translation_table))
6523         translation_table = Fget (translation_table, Qtranslation_table);
6524       else if (CONSP (translation_table))
6525         {
6526           translation_table = Fcopy_sequence (translation_table);
6527           for (val = translation_table; CONSP (val); val = XCDR (val))
6528             if (SYMBOLP (XCAR (val)))
6529               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6530         }
6531       if (CHAR_TABLE_P (standard))
6532         {
6533           if (CONSP (translation_table))
6534             translation_table = nconc2 (translation_table,
6535                                         Fcons (standard, Qnil));
6536           else
6537             translation_table = Fcons (translation_table,
6538                                        Fcons (standard, Qnil));
6539         }
6540     }
6541
6542   if (max_lookup)
6543     {
6544       *max_lookup = 1;
6545       if (CHAR_TABLE_P (translation_table)
6546           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6547         {
6548           val = XCHAR_TABLE (translation_table)->extras[1];
6549           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6550             *max_lookup = XFASTINT (val);
6551         }
6552       else if (CONSP (translation_table))
6553         {
6554           Lisp_Object tail;
6555
6556           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6557             if (CHAR_TABLE_P (XCAR (tail))
6558                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6559               {
6560                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6561                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6562                   *max_lookup = XFASTINT (tailval);
6563               }
6564         }
6565     }
6566   return translation_table;
6567 }
6568
6569 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6570   do {                                                          \
6571     trans = Qnil;                                               \
6572     if (CHAR_TABLE_P (table))                                   \
6573       {                                                         \
6574         trans = CHAR_TABLE_REF (table, c);                      \
6575         if (CHARACTERP (trans))                                 \
6576           c = XFASTINT (trans), trans = Qnil;                   \
6577       }                                                         \
6578     else if (CONSP (table))                                     \
6579       {                                                         \
6580         Lisp_Object tail;                                       \
6581                                                                 \
6582         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6583           if (CHAR_TABLE_P (XCAR (tail)))                       \
6584             {                                                   \
6585               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6586               if (CHARACTERP (trans))                           \
6587                 c = XFASTINT (trans), trans = Qnil;             \
6588               else if (! NILP (trans))                          \
6589                 break;                                          \
6590             }                                                   \
6591       }                                                         \
6592   } while (0)
6593
6594
6595 /* Return a translation of character(s) at BUF according to TRANS.
6596    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6597    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6598    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6599    translation is found, and Qnil if not found..
6600    If BUF is too short to lookup characters in FROM, return Qt.  */
6601
6602 static Lisp_Object
6603 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6604 {
6605
6606   if (INTEGERP (trans))
6607     return trans;
6608   for (; CONSP (trans); trans = XCDR (trans))
6609     {
6610       Lisp_Object val = XCAR (trans);
6611       Lisp_Object from = XCAR (val);
6612       ptrdiff_t len = ASIZE (from);
6613       ptrdiff_t i;
6614
6615       for (i = 0; i < len; i++)
6616         {
6617           if (buf + i == buf_end)
6618             return Qt;
6619           if (XINT (AREF (from, i)) != buf[i])
6620             break;
6621         }
6622       if (i == len)
6623         return val;
6624     }
6625   return Qnil;
6626 }
6627
6628
6629 static int
6630 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6631                bool last_block)
6632 {
6633   unsigned char *dst = coding->destination + coding->produced;
6634   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6635   ptrdiff_t produced;
6636   ptrdiff_t produced_chars = 0;
6637   int carryover = 0;
6638
6639   if (! coding->chars_at_source)
6640     {
6641       /* Source characters are in coding->charbuf.  */
6642       int *buf = coding->charbuf;
6643       int *buf_end = buf + coding->charbuf_used;
6644
6645       if (EQ (coding->src_object, coding->dst_object))
6646         {
6647           coding_set_source (coding);
6648           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6649         }
6650
6651       while (buf < buf_end)
6652         {
6653           int c = *buf;
6654           ptrdiff_t i;
6655
6656           if (c >= 0)
6657             {
6658               ptrdiff_t from_nchars = 1, to_nchars = 1;
6659               Lisp_Object trans = Qnil;
6660
6661               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6662               if (! NILP (trans))
6663                 {
6664                   trans = get_translation (trans, buf, buf_end);
6665                   if (INTEGERP (trans))
6666                     c = XINT (trans);
6667                   else if (CONSP (trans))
6668                     {
6669                       from_nchars = ASIZE (XCAR (trans));
6670                       trans = XCDR (trans);
6671                       if (INTEGERP (trans))
6672                         c = XINT (trans);
6673                       else
6674                         {
6675                           to_nchars = ASIZE (trans);
6676                           c = XINT (AREF (trans, 0));
6677                         }
6678                     }
6679                   else if (EQ (trans, Qt) && ! last_block)
6680                     break;
6681                 }
6682
6683               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6684                 {
6685                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6686                        / MAX_MULTIBYTE_LENGTH)
6687                       < to_nchars)
6688                     memory_full (SIZE_MAX);
6689                   dst = alloc_destination (coding,
6690                                            buf_end - buf
6691                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6692                                            dst);
6693                   if (EQ (coding->src_object, coding->dst_object))
6694                     {
6695                       coding_set_source (coding);
6696                       dst_end = (((unsigned char *) coding->source)
6697                                  + coding->consumed);
6698                     }
6699                   else
6700                     dst_end = coding->destination + coding->dst_bytes;
6701                 }
6702
6703               for (i = 0; i < to_nchars; i++)
6704                 {
6705                   if (i > 0)
6706                     c = XINT (AREF (trans, i));
6707                   if (coding->dst_multibyte
6708                       || ! CHAR_BYTE8_P (c))
6709                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6710                   else
6711                     *dst++ = CHAR_TO_BYTE8 (c);
6712                 }
6713               produced_chars += to_nchars;
6714               buf += from_nchars;
6715             }
6716           else
6717             /* This is an annotation datum.  (-C) is the length.  */
6718             buf += -c;
6719         }
6720       carryover = buf_end - buf;
6721     }
6722   else
6723     {
6724       /* Source characters are at coding->source.  */
6725       const unsigned char *src = coding->source;
6726       const unsigned char *src_end = src + coding->consumed;
6727
6728       if (EQ (coding->dst_object, coding->src_object))
6729         dst_end = (unsigned char *) src;
6730       if (coding->src_multibyte != coding->dst_multibyte)
6731         {
6732           if (coding->src_multibyte)
6733             {
6734               bool multibytep = 1;
6735               ptrdiff_t consumed_chars = 0;
6736
6737               while (1)
6738                 {
6739                   const unsigned char *src_base = src;
6740                   int c;
6741
6742                   ONE_MORE_BYTE (c);
6743                   if (dst == dst_end)
6744                     {
6745                       if (EQ (coding->src_object, coding->dst_object))
6746                         dst_end = (unsigned char *) src;
6747                       if (dst == dst_end)
6748                         {
6749                           ptrdiff_t offset = src - coding->source;
6750
6751                           dst = alloc_destination (coding, src_end - src + 1,
6752                                                    dst);
6753                           dst_end = coding->destination + coding->dst_bytes;
6754                           coding_set_source (coding);
6755                           src = coding->source + offset;
6756                           src_end = coding->source + coding->consumed;
6757                           if (EQ (coding->src_object, coding->dst_object))
6758                             dst_end = (unsigned char *) src;
6759                         }
6760                     }
6761                   *dst++ = c;
6762                   produced_chars++;
6763                 }
6764             no_more_source:
6765               ;
6766             }
6767           else
6768             while (src < src_end)
6769               {
6770                 bool multibytep = 1;
6771                 int c = *src++;
6772
6773                 if (dst >= dst_end - 1)
6774                   {
6775                     if (EQ (coding->src_object, coding->dst_object))
6776                       dst_end = (unsigned char *) src;
6777                     if (dst >= dst_end - 1)
6778                       {
6779                         ptrdiff_t offset = src - coding->source;
6780                         ptrdiff_t more_bytes;
6781
6782                         if (EQ (coding->src_object, coding->dst_object))
6783                           more_bytes = ((src_end - src) / 2) + 2;
6784                         else
6785                           more_bytes = src_end - src + 2;
6786                         dst = alloc_destination (coding, more_bytes, dst);
6787                         dst_end = coding->destination + coding->dst_bytes;
6788                         coding_set_source (coding);
6789                         src = coding->source + offset;
6790                         src_end = coding->source + coding->consumed;
6791                         if (EQ (coding->src_object, coding->dst_object))
6792                           dst_end = (unsigned char *) src;
6793                       }
6794                   }
6795                 EMIT_ONE_BYTE (c);
6796               }
6797         }
6798       else
6799         {
6800           if (!EQ (coding->src_object, coding->dst_object))
6801             {
6802               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6803
6804               if (require > 0)
6805                 {
6806                   ptrdiff_t offset = src - coding->source;
6807
6808                   dst = alloc_destination (coding, require, dst);
6809                   coding_set_source (coding);
6810                   src = coding->source + offset;
6811                   src_end = coding->source + coding->consumed;
6812                 }
6813             }
6814           produced_chars = coding->consumed_char;
6815           while (src < src_end)
6816             *dst++ = *src++;
6817         }
6818     }
6819
6820   produced = dst - (coding->destination + coding->produced);
6821   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6822     insert_from_gap (produced_chars, produced);
6823   coding->produced += produced;
6824   coding->produced_char += produced_chars;
6825   return carryover;
6826 }
6827
6828 /* Compose text in CODING->object according to the annotation data at
6829    CHARBUF.  CHARBUF is an array:
6830      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6831  */
6832
6833 static void
6834 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6835 {
6836   int len;
6837   ptrdiff_t to;
6838   enum composition_method method;
6839   Lisp_Object components;
6840
6841   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6842   to = pos + charbuf[2];
6843   method = (enum composition_method) (charbuf[4]);
6844
6845   if (method == COMPOSITION_RELATIVE)
6846     components = Qnil;
6847   else
6848     {
6849       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6850       int i, j;
6851
6852       if (method == COMPOSITION_WITH_RULE)
6853         len = charbuf[2] * 3 - 2;
6854       charbuf += MAX_ANNOTATION_LENGTH;
6855       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6856       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6857         {
6858           if (charbuf[i] >= 0)
6859             args[j] = make_number (charbuf[i]);
6860           else
6861             {
6862               i++;
6863               args[j] = make_number (charbuf[i] % 0x100);
6864             }
6865         }
6866       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6867     }
6868   compose_text (pos, to, components, Qnil, coding->dst_object);
6869 }
6870
6871
6872 /* Put `charset' property on text in CODING->object according to
6873    the annotation data at CHARBUF.  CHARBUF is an array:
6874      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6875  */
6876
6877 static void
6878 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6879 {
6880   ptrdiff_t from = pos - charbuf[2];
6881   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6882
6883   Fput_text_property (make_number (from), make_number (pos),
6884                       Qcharset, CHARSET_NAME (charset),
6885                       coding->dst_object);
6886 }
6887
6888
6889 #define CHARBUF_SIZE 0x4000
6890
6891 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6892   do {                                                                  \
6893     int size = CHARBUF_SIZE;                                            \
6894                                                                         \
6895     coding->charbuf = NULL;                                             \
6896     while (size > 1024)                                                 \
6897       {                                                                 \
6898         coding->charbuf = alloca (sizeof (int) * size);                 \
6899         if (coding->charbuf)                                            \
6900           break;                                                        \
6901         size >>= 1;                                                     \
6902       }                                                                 \
6903     if (! coding->charbuf)                                              \
6904       {                                                                 \
6905         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6906         return;                                                         \
6907       }                                                                 \
6908     coding->charbuf_size = size;                                        \
6909   } while (0)
6910
6911
6912 static void
6913 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6914 {
6915   int *charbuf = coding->charbuf;
6916   int *charbuf_end = charbuf + coding->charbuf_used;
6917
6918   if (NILP (coding->dst_object))
6919     return;
6920
6921   while (charbuf < charbuf_end)
6922     {
6923       if (*charbuf >= 0)
6924         pos++, charbuf++;
6925       else
6926         {
6927           int len = -*charbuf;
6928
6929           if (len > 2)
6930             switch (charbuf[1])
6931               {
6932               case CODING_ANNOTATE_COMPOSITION_MASK:
6933                 produce_composition (coding, charbuf, pos);
6934                 break;
6935               case CODING_ANNOTATE_CHARSET_MASK:
6936                 produce_charset (coding, charbuf, pos);
6937                 break;
6938               }
6939           charbuf += len;
6940         }
6941     }
6942 }
6943
6944 /* Decode the data at CODING->src_object into CODING->dst_object.
6945    CODING->src_object is a buffer, a string, or nil.
6946    CODING->dst_object is a buffer.
6947
6948    If CODING->src_object is a buffer, it must be the current buffer.
6949    In this case, if CODING->src_pos is positive, it is a position of
6950    the source text in the buffer, otherwise, the source text is in the
6951    gap area of the buffer, and CODING->src_pos specifies the offset of
6952    the text from GPT (which must be the same as PT).  If this is the
6953    same buffer as CODING->dst_object, CODING->src_pos must be
6954    negative.
6955
6956    If CODING->src_object is a string, CODING->src_pos is an index to
6957    that string.
6958
6959    If CODING->src_object is nil, CODING->source must already point to
6960    the non-relocatable memory area.  In this case, CODING->src_pos is
6961    an offset from CODING->source.
6962
6963    The decoded data is inserted at the current point of the buffer
6964    CODING->dst_object.
6965 */
6966
6967 static void
6968 decode_coding (struct coding_system *coding)
6969 {
6970   Lisp_Object attrs;
6971   Lisp_Object undo_list;
6972   Lisp_Object translation_table;
6973   struct ccl_spec cclspec;
6974   int carryover;
6975   int i;
6976
6977   if (BUFFERP (coding->src_object)
6978       && coding->src_pos > 0
6979       && coding->src_pos < GPT
6980       && coding->src_pos + coding->src_chars > GPT)
6981     move_gap_both (coding->src_pos, coding->src_pos_byte);
6982
6983   undo_list = Qt;
6984   if (BUFFERP (coding->dst_object))
6985     {
6986       set_buffer_internal (XBUFFER (coding->dst_object));
6987       if (GPT != PT)
6988         move_gap_both (PT, PT_BYTE);
6989
6990       /* We must disable undo_list in order to record the whole insert
6991          transaction via record_insert at the end.  But doing so also
6992          disables the recording of the first change to the undo_list.
6993          Therefore we check for first change here and record it via
6994          record_first_change if needed.  */
6995       if (MODIFF <= SAVE_MODIFF)
6996         record_first_change ();
6997
6998       undo_list = BVAR (current_buffer, undo_list);
6999       bset_undo_list (current_buffer, Qt);
7000     }
7001
7002   coding->consumed = coding->consumed_char = 0;
7003   coding->produced = coding->produced_char = 0;
7004   coding->chars_at_source = 0;
7005   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7006   coding->errors = 0;
7007
7008   ALLOC_CONVERSION_WORK_AREA (coding);
7009
7010   attrs = CODING_ID_ATTRS (coding->id);
7011   translation_table = get_translation_table (attrs, 0, NULL);
7012
7013   carryover = 0;
7014   if (coding->decoder == decode_coding_ccl)
7015     {
7016       coding->spec.ccl = &cclspec;
7017       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7018     }
7019   do
7020     {
7021       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7022
7023       coding_set_source (coding);
7024       coding->annotated = 0;
7025       coding->charbuf_used = carryover;
7026       (*(coding->decoder)) (coding);
7027       coding_set_destination (coding);
7028       carryover = produce_chars (coding, translation_table, 0);
7029       if (coding->annotated)
7030         produce_annotation (coding, pos);
7031       for (i = 0; i < carryover; i++)
7032         coding->charbuf[i]
7033           = coding->charbuf[coding->charbuf_used - carryover + i];
7034     }
7035   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7036          || (coding->consumed < coding->src_bytes
7037              && (coding->result == CODING_RESULT_SUCCESS
7038                  || coding->result == CODING_RESULT_INVALID_SRC)));
7039
7040   if (carryover > 0)
7041     {
7042       coding_set_destination (coding);
7043       coding->charbuf_used = carryover;
7044       produce_chars (coding, translation_table, 1);
7045     }
7046
7047   coding->carryover_bytes = 0;
7048   if (coding->consumed < coding->src_bytes)
7049     {
7050       int nbytes = coding->src_bytes - coding->consumed;
7051       const unsigned char *src;
7052
7053       coding_set_source (coding);
7054       coding_set_destination (coding);
7055       src = coding->source + coding->consumed;
7056
7057       if (coding->mode & CODING_MODE_LAST_BLOCK)
7058         {
7059           /* Flush out unprocessed data as binary chars.  We are sure
7060              that the number of data is less than the size of
7061              coding->charbuf.  */
7062           coding->charbuf_used = 0;
7063           coding->chars_at_source = 0;
7064
7065           while (nbytes-- > 0)
7066             {
7067               int c = *src++;
7068
7069               if (c & 0x80)
7070                 c = BYTE8_TO_CHAR (c);
7071               coding->charbuf[coding->charbuf_used++] = c;
7072             }
7073           produce_chars (coding, Qnil, 1);
7074         }
7075       else
7076         {
7077           /* Record unprocessed bytes in coding->carryover.  We are
7078              sure that the number of data is less than the size of
7079              coding->carryover.  */
7080           unsigned char *p = coding->carryover;
7081
7082           if (nbytes > sizeof coding->carryover)
7083             nbytes = sizeof coding->carryover;
7084           coding->carryover_bytes = nbytes;
7085           while (nbytes-- > 0)
7086             *p++ = *src++;
7087         }
7088       coding->consumed = coding->src_bytes;
7089     }
7090
7091   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7092       && !inhibit_eol_conversion)
7093     decode_eol (coding);
7094   if (BUFFERP (coding->dst_object))
7095     {
7096       bset_undo_list (current_buffer, undo_list);
7097       record_insert (coding->dst_pos, coding->produced_char);
7098     }
7099 }
7100
7101
7102 /* Extract an annotation datum from a composition starting at POS and
7103    ending before LIMIT of CODING->src_object (buffer or string), store
7104    the data in BUF, set *STOP to a starting position of the next
7105    composition (if any) or to LIMIT, and return the address of the
7106    next element of BUF.
7107
7108    If such an annotation is not found, set *STOP to a starting
7109    position of a composition after POS (if any) or to LIMIT, and
7110    return BUF.  */
7111
7112 static int *
7113 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7114                                struct coding_system *coding, int *buf,
7115                                ptrdiff_t *stop)
7116 {
7117   ptrdiff_t start, end;
7118   Lisp_Object prop;
7119
7120   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7121       || end > limit)
7122     *stop = limit;
7123   else if (start > pos)
7124     *stop = start;
7125   else
7126     {
7127       if (start == pos)
7128         {
7129           /* We found a composition.  Store the corresponding
7130              annotation data in BUF.  */
7131           int *head = buf;
7132           enum composition_method method = COMPOSITION_METHOD (prop);
7133           int nchars = COMPOSITION_LENGTH (prop);
7134
7135           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7136           if (method != COMPOSITION_RELATIVE)
7137             {
7138               Lisp_Object components;
7139               ptrdiff_t i, len, i_byte;
7140
7141               components = COMPOSITION_COMPONENTS (prop);
7142               if (VECTORP (components))
7143                 {
7144                   len = ASIZE (components);
7145                   for (i = 0; i < len; i++)
7146                     *buf++ = XINT (AREF (components, i));
7147                 }
7148               else if (STRINGP (components))
7149                 {
7150                   len = SCHARS (components);
7151                   i = i_byte = 0;
7152                   while (i < len)
7153                     {
7154                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7155                       buf++;
7156                     }
7157                 }
7158               else if (INTEGERP (components))
7159                 {
7160                   len = 1;
7161                   *buf++ = XINT (components);
7162                 }
7163               else if (CONSP (components))
7164                 {
7165                   for (len = 0; CONSP (components);
7166                        len++, components = XCDR (components))
7167                     *buf++ = XINT (XCAR (components));
7168                 }
7169               else
7170                 emacs_abort ();
7171               *head -= len;
7172             }
7173         }
7174
7175       if (find_composition (end, limit, &start, &end, &prop,
7176                             coding->src_object)
7177           && end <= limit)
7178         *stop = start;
7179       else
7180         *stop = limit;
7181     }
7182   return buf;
7183 }
7184
7185
7186 /* Extract an annotation datum from a text property `charset' at POS of
7187    CODING->src_object (buffer of string), store the data in BUF, set
7188    *STOP to the position where the value of `charset' property changes
7189    (limiting by LIMIT), and return the address of the next element of
7190    BUF.
7191
7192    If the property value is nil, set *STOP to the position where the
7193    property value is non-nil (limiting by LIMIT), and return BUF.  */
7194
7195 static int *
7196 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7197                            struct coding_system *coding, int *buf,
7198                            ptrdiff_t *stop)
7199 {
7200   Lisp_Object val, next;
7201   int id;
7202
7203   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7204   if (! NILP (val) && CHARSETP (val))
7205     id = XINT (CHARSET_SYMBOL_ID (val));
7206   else
7207     id = -1;
7208   ADD_CHARSET_DATA (buf, 0, id);
7209   next = Fnext_single_property_change (make_number (pos), Qcharset,
7210                                        coding->src_object,
7211                                        make_number (limit));
7212   *stop = XINT (next);
7213   return buf;
7214 }
7215
7216
7217 static void
7218 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7219                int max_lookup)
7220 {
7221   int *buf = coding->charbuf;
7222   int *buf_end = coding->charbuf + coding->charbuf_size;
7223   const unsigned char *src = coding->source + coding->consumed;
7224   const unsigned char *src_end = coding->source + coding->src_bytes;
7225   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7226   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7227   bool multibytep = coding->src_multibyte;
7228   Lisp_Object eol_type;
7229   int c;
7230   ptrdiff_t stop, stop_composition, stop_charset;
7231   int *lookup_buf = NULL;
7232
7233   if (! NILP (translation_table))
7234     lookup_buf = alloca (sizeof (int) * max_lookup);
7235
7236   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7237   if (VECTORP (eol_type))
7238     eol_type = Qunix;
7239
7240   /* Note: composition handling is not yet implemented.  */
7241   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7242
7243   if (NILP (coding->src_object))
7244     stop = stop_composition = stop_charset = end_pos;
7245   else
7246     {
7247       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7248         stop = stop_composition = pos;
7249       else
7250         stop = stop_composition = end_pos;
7251       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7252         stop = stop_charset = pos;
7253       else
7254         stop_charset = end_pos;
7255     }
7256
7257   /* Compensate for CRLF and conversion.  */
7258   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7259   while (buf < buf_end)
7260     {
7261       Lisp_Object trans;
7262
7263       if (pos == stop)
7264         {
7265           if (pos == end_pos)
7266             break;
7267           if (pos == stop_composition)
7268             buf = handle_composition_annotation (pos, end_pos, coding,
7269                                                  buf, &stop_composition);
7270           if (pos == stop_charset)
7271             buf = handle_charset_annotation (pos, end_pos, coding,
7272                                              buf, &stop_charset);
7273           stop = (stop_composition < stop_charset
7274                   ? stop_composition : stop_charset);
7275         }
7276
7277       if (! multibytep)
7278         {
7279           int bytes;
7280
7281           if (coding->encoder == encode_coding_raw_text
7282               || coding->encoder == encode_coding_ccl)
7283             c = *src++, pos++;
7284           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7285             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7286           else
7287             c = BYTE8_TO_CHAR (*src), src++, pos++;
7288         }
7289       else
7290         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7291       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7292         c = '\n';
7293       if (! EQ (eol_type, Qunix))
7294         {
7295           if (c == '\n')
7296             {
7297               if (EQ (eol_type, Qdos))
7298                 *buf++ = '\r';
7299               else
7300                 c = '\r';
7301             }
7302         }
7303
7304       trans = Qnil;
7305       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7306       if (NILP (trans))
7307         *buf++ = c;
7308       else
7309         {
7310           ptrdiff_t from_nchars = 1, to_nchars = 1;
7311           int *lookup_buf_end;
7312           const unsigned char *p = src;
7313           int i;
7314
7315           lookup_buf[0] = c;
7316           for (i = 1; i < max_lookup && p < src_end; i++)
7317             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7318           lookup_buf_end = lookup_buf + i;
7319           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7320           if (INTEGERP (trans))
7321             c = XINT (trans);
7322           else if (CONSP (trans))
7323             {
7324               from_nchars = ASIZE (XCAR (trans));
7325               trans = XCDR (trans);
7326               if (INTEGERP (trans))
7327                 c = XINT (trans);
7328               else
7329                 {
7330                   to_nchars = ASIZE (trans);
7331                   if (buf_end - buf < to_nchars)
7332                     break;
7333                   c = XINT (AREF (trans, 0));
7334                 }
7335             }
7336           else
7337             break;
7338           *buf++ = c;
7339           for (i = 1; i < to_nchars; i++)
7340             *buf++ = XINT (AREF (trans, i));
7341           for (i = 1; i < from_nchars; i++, pos++)
7342             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7343         }
7344     }
7345
7346   coding->consumed = src - coding->source;
7347   coding->consumed_char = pos - coding->src_pos;
7348   coding->charbuf_used = buf - coding->charbuf;
7349   coding->chars_at_source = 0;
7350 }
7351
7352
7353 /* Encode the text at CODING->src_object into CODING->dst_object.
7354    CODING->src_object is a buffer or a string.
7355    CODING->dst_object is a buffer or nil.
7356
7357    If CODING->src_object is a buffer, it must be the current buffer.
7358    In this case, if CODING->src_pos is positive, it is a position of
7359    the source text in the buffer, otherwise. the source text is in the
7360    gap area of the buffer, and coding->src_pos specifies the offset of
7361    the text from GPT (which must be the same as PT).  If this is the
7362    same buffer as CODING->dst_object, CODING->src_pos must be
7363    negative and CODING should not have `pre-write-conversion'.
7364
7365    If CODING->src_object is a string, CODING should not have
7366    `pre-write-conversion'.
7367
7368    If CODING->dst_object is a buffer, the encoded data is inserted at
7369    the current point of that buffer.
7370
7371    If CODING->dst_object is nil, the encoded data is placed at the
7372    memory area specified by CODING->destination.  */
7373
7374 static void
7375 encode_coding (struct coding_system *coding)
7376 {
7377   Lisp_Object attrs;
7378   Lisp_Object translation_table;
7379   int max_lookup;
7380   struct ccl_spec cclspec;
7381
7382   attrs = CODING_ID_ATTRS (coding->id);
7383   if (coding->encoder == encode_coding_raw_text)
7384     translation_table = Qnil, max_lookup = 0;
7385   else
7386     translation_table = get_translation_table (attrs, 1, &max_lookup);
7387
7388   if (BUFFERP (coding->dst_object))
7389     {
7390       set_buffer_internal (XBUFFER (coding->dst_object));
7391       coding->dst_multibyte
7392         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7393     }
7394
7395   coding->consumed = coding->consumed_char = 0;
7396   coding->produced = coding->produced_char = 0;
7397   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7398   coding->errors = 0;
7399
7400   ALLOC_CONVERSION_WORK_AREA (coding);
7401
7402   if (coding->encoder == encode_coding_ccl)
7403     {
7404       coding->spec.ccl = &cclspec;
7405       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7406     }
7407   do {
7408     coding_set_source (coding);
7409     consume_chars (coding, translation_table, max_lookup);
7410     coding_set_destination (coding);
7411     (*(coding->encoder)) (coding);
7412   } while (coding->consumed_char < coding->src_chars);
7413
7414   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7415     insert_from_gap (coding->produced_char, coding->produced);
7416 }
7417
7418
7419 /* Name (or base name) of work buffer for code conversion.  */
7420 static Lisp_Object Vcode_conversion_workbuf_name;
7421
7422 /* A working buffer used by the top level conversion.  Once it is
7423    created, it is never destroyed.  It has the name
7424    Vcode_conversion_workbuf_name.  The other working buffers are
7425    destroyed after the use is finished, and their names are modified
7426    versions of Vcode_conversion_workbuf_name.  */
7427 static Lisp_Object Vcode_conversion_reused_workbuf;
7428
7429 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7430 static bool reused_workbuf_in_use;
7431
7432
7433 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7434    multibyteness of returning buffer.  */
7435
7436 static Lisp_Object
7437 make_conversion_work_buffer (bool multibyte)
7438 {
7439   Lisp_Object name, workbuf;
7440   struct buffer *current;
7441
7442   if (reused_workbuf_in_use)
7443     {
7444       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7445       workbuf = Fget_buffer_create (name);
7446     }
7447   else
7448     {
7449       reused_workbuf_in_use = 1;
7450       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7451         Vcode_conversion_reused_workbuf
7452           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7453       workbuf = Vcode_conversion_reused_workbuf;
7454     }
7455   current = current_buffer;
7456   set_buffer_internal (XBUFFER (workbuf));
7457   /* We can't allow modification hooks to run in the work buffer.  For
7458      instance, directory_files_internal assumes that file decoding
7459      doesn't compile new regexps.  */
7460   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7461   Ferase_buffer ();
7462   bset_undo_list (current_buffer, Qt);
7463   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7464   set_buffer_internal (current);
7465   return workbuf;
7466 }
7467
7468
7469 static Lisp_Object
7470 code_conversion_restore (Lisp_Object arg)
7471 {
7472   Lisp_Object current, workbuf;
7473   struct gcpro gcpro1;
7474
7475   GCPRO1 (arg);
7476   current = XCAR (arg);
7477   workbuf = XCDR (arg);
7478   if (! NILP (workbuf))
7479     {
7480       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7481         reused_workbuf_in_use = 0;
7482       else
7483         Fkill_buffer (workbuf);
7484     }
7485   set_buffer_internal (XBUFFER (current));
7486   UNGCPRO;
7487   return Qnil;
7488 }
7489
7490 Lisp_Object
7491 code_conversion_save (bool with_work_buf, bool multibyte)
7492 {
7493   Lisp_Object workbuf = Qnil;
7494
7495   if (with_work_buf)
7496     workbuf = make_conversion_work_buffer (multibyte);
7497   record_unwind_protect (code_conversion_restore,
7498                          Fcons (Fcurrent_buffer (), workbuf));
7499   return workbuf;
7500 }
7501
7502 void
7503 decode_coding_gap (struct coding_system *coding,
7504                    ptrdiff_t chars, ptrdiff_t bytes)
7505 {
7506   ptrdiff_t count = SPECPDL_INDEX ();
7507   Lisp_Object attrs;
7508
7509   code_conversion_save (0, 0);
7510
7511   coding->src_object = Fcurrent_buffer ();
7512   coding->src_chars = chars;
7513   coding->src_bytes = bytes;
7514   coding->src_pos = -chars;
7515   coding->src_pos_byte = -bytes;
7516   coding->src_multibyte = chars < bytes;
7517   coding->dst_object = coding->src_object;
7518   coding->dst_pos = PT;
7519   coding->dst_pos_byte = PT_BYTE;
7520   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7521
7522   if (CODING_REQUIRE_DETECTION (coding))
7523     detect_coding (coding);
7524
7525   coding->mode |= CODING_MODE_LAST_BLOCK;
7526   current_buffer->text->inhibit_shrinking = 1;
7527   decode_coding (coding);
7528   current_buffer->text->inhibit_shrinking = 0;
7529
7530   attrs = CODING_ID_ATTRS (coding->id);
7531   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7532     {
7533       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7534       Lisp_Object val;
7535
7536       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7537       val = call1 (CODING_ATTR_POST_READ (attrs),
7538                    make_number (coding->produced_char));
7539       CHECK_NATNUM (val);
7540       coding->produced_char += Z - prev_Z;
7541       coding->produced += Z_BYTE - prev_Z_BYTE;
7542     }
7543
7544   unbind_to (count, Qnil);
7545 }
7546
7547
7548 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7549    SRC_OBJECT into DST_OBJECT by coding context CODING.
7550
7551    SRC_OBJECT is a buffer, a string, or Qnil.
7552
7553    If it is a buffer, the text is at point of the buffer.  FROM and TO
7554    are positions in the buffer.
7555
7556    If it is a string, the text is at the beginning of the string.
7557    FROM and TO are indices to the string.
7558
7559    If it is nil, the text is at coding->source.  FROM and TO are
7560    indices to coding->source.
7561
7562    DST_OBJECT is a buffer, Qt, or Qnil.
7563
7564    If it is a buffer, the decoded text is inserted at point of the
7565    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7566    is deleted.
7567
7568    If it is Qt, a string is made from the decoded text, and
7569    set in CODING->dst_object.
7570
7571    If it is Qnil, the decoded text is stored at CODING->destination.
7572    The caller must allocate CODING->dst_bytes bytes at
7573    CODING->destination by xmalloc.  If the decoded text is longer than
7574    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7575  */
7576
7577 void
7578 decode_coding_object (struct coding_system *coding,
7579                       Lisp_Object src_object,
7580                       ptrdiff_t from, ptrdiff_t from_byte,
7581                       ptrdiff_t to, ptrdiff_t to_byte,
7582                       Lisp_Object dst_object)
7583 {
7584   ptrdiff_t count = SPECPDL_INDEX ();
7585   unsigned char *destination IF_LINT (= NULL);
7586   ptrdiff_t dst_bytes IF_LINT (= 0);
7587   ptrdiff_t chars = to - from;
7588   ptrdiff_t bytes = to_byte - from_byte;
7589   Lisp_Object attrs;
7590   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7591   bool need_marker_adjustment = 0;
7592   Lisp_Object old_deactivate_mark;
7593
7594   old_deactivate_mark = Vdeactivate_mark;
7595
7596   if (NILP (dst_object))
7597     {
7598       destination = coding->destination;
7599       dst_bytes = coding->dst_bytes;
7600     }
7601
7602   coding->src_object = src_object;
7603   coding->src_chars = chars;
7604   coding->src_bytes = bytes;
7605   coding->src_multibyte = chars < bytes;
7606
7607   if (STRINGP (src_object))
7608     {
7609       coding->src_pos = from;
7610       coding->src_pos_byte = from_byte;
7611     }
7612   else if (BUFFERP (src_object))
7613     {
7614       set_buffer_internal (XBUFFER (src_object));
7615       if (from != GPT)
7616         move_gap_both (from, from_byte);
7617       if (EQ (src_object, dst_object))
7618         {
7619           struct Lisp_Marker *tail;
7620
7621           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7622             {
7623               tail->need_adjustment
7624                 = tail->charpos == (tail->insertion_type ? from : to);
7625               need_marker_adjustment |= tail->need_adjustment;
7626             }
7627           saved_pt = PT, saved_pt_byte = PT_BYTE;
7628           TEMP_SET_PT_BOTH (from, from_byte);
7629           current_buffer->text->inhibit_shrinking = 1;
7630           del_range_both (from, from_byte, to, to_byte, 1);
7631           coding->src_pos = -chars;
7632           coding->src_pos_byte = -bytes;
7633         }
7634       else
7635         {
7636           coding->src_pos = from;
7637           coding->src_pos_byte = from_byte;
7638         }
7639     }
7640
7641   if (CODING_REQUIRE_DETECTION (coding))
7642     detect_coding (coding);
7643   attrs = CODING_ID_ATTRS (coding->id);
7644
7645   if (EQ (dst_object, Qt)
7646       || (! NILP (CODING_ATTR_POST_READ (attrs))
7647           && NILP (dst_object)))
7648     {
7649       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7650       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7651       coding->dst_pos = BEG;
7652       coding->dst_pos_byte = BEG_BYTE;
7653     }
7654   else if (BUFFERP (dst_object))
7655     {
7656       code_conversion_save (0, 0);
7657       coding->dst_object = dst_object;
7658       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7659       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7660       coding->dst_multibyte
7661         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7662     }
7663   else
7664     {
7665       code_conversion_save (0, 0);
7666       coding->dst_object = Qnil;
7667       /* Most callers presume this will return a multibyte result, and they
7668          won't use `binary' or `raw-text' anyway, so let's not worry about
7669          CODING_FOR_UNIBYTE.  */
7670       coding->dst_multibyte = 1;
7671     }
7672
7673   decode_coding (coding);
7674
7675   if (BUFFERP (coding->dst_object))
7676     set_buffer_internal (XBUFFER (coding->dst_object));
7677
7678   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7679     {
7680       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7681       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7682       Lisp_Object val;
7683
7684       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7685       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7686               old_deactivate_mark);
7687       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7688                         make_number (coding->produced_char));
7689       UNGCPRO;
7690       CHECK_NATNUM (val);
7691       coding->produced_char += Z - prev_Z;
7692       coding->produced += Z_BYTE - prev_Z_BYTE;
7693     }
7694
7695   if (EQ (dst_object, Qt))
7696     {
7697       coding->dst_object = Fbuffer_string ();
7698     }
7699   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7700     {
7701       set_buffer_internal (XBUFFER (coding->dst_object));
7702       if (dst_bytes < coding->produced)
7703         {
7704           destination = xrealloc (destination, coding->produced);
7705           if (! destination)
7706             {
7707               record_conversion_result (coding,
7708                                         CODING_RESULT_INSUFFICIENT_MEM);
7709               unbind_to (count, Qnil);
7710               return;
7711             }
7712           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7713             move_gap_both (BEGV, BEGV_BYTE);
7714           memcpy (destination, BEGV_ADDR, coding->produced);
7715           coding->destination = destination;
7716         }
7717     }
7718
7719   if (saved_pt >= 0)
7720     {
7721       /* This is the case of:
7722          (BUFFERP (src_object) && EQ (src_object, dst_object))
7723          As we have moved PT while replacing the original buffer
7724          contents, we must recover it now.  */
7725       set_buffer_internal (XBUFFER (src_object));
7726       current_buffer->text->inhibit_shrinking = 0;
7727       if (saved_pt < from)
7728         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7729       else if (saved_pt < from + chars)
7730         TEMP_SET_PT_BOTH (from, from_byte);
7731       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7732         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7733                           saved_pt_byte + (coding->produced - bytes));
7734       else
7735         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7736                           saved_pt_byte + (coding->produced - bytes));
7737
7738       if (need_marker_adjustment)
7739         {
7740           struct Lisp_Marker *tail;
7741
7742           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7743             if (tail->need_adjustment)
7744               {
7745                 tail->need_adjustment = 0;
7746                 if (tail->insertion_type)
7747                   {
7748                     tail->bytepos = from_byte;
7749                     tail->charpos = from;
7750                   }
7751                 else
7752                   {
7753                     tail->bytepos = from_byte + coding->produced;
7754                     tail->charpos
7755                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7756                          ? tail->bytepos : from + coding->produced_char);
7757                   }
7758               }
7759         }
7760     }
7761
7762   Vdeactivate_mark = old_deactivate_mark;
7763   unbind_to (count, coding->dst_object);
7764 }
7765
7766
7767 void
7768 encode_coding_object (struct coding_system *coding,
7769                       Lisp_Object src_object,
7770                       ptrdiff_t from, ptrdiff_t from_byte,
7771                       ptrdiff_t to, ptrdiff_t to_byte,
7772                       Lisp_Object dst_object)
7773 {
7774   ptrdiff_t count = SPECPDL_INDEX ();
7775   ptrdiff_t chars = to - from;
7776   ptrdiff_t bytes = to_byte - from_byte;
7777   Lisp_Object attrs;
7778   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7779   bool need_marker_adjustment = 0;
7780   bool kill_src_buffer = 0;
7781   Lisp_Object old_deactivate_mark;
7782
7783   old_deactivate_mark = Vdeactivate_mark;
7784
7785   coding->src_object = src_object;
7786   coding->src_chars = chars;
7787   coding->src_bytes = bytes;
7788   coding->src_multibyte = chars < bytes;
7789
7790   attrs = CODING_ID_ATTRS (coding->id);
7791
7792   if (EQ (src_object, dst_object))
7793     {
7794       struct Lisp_Marker *tail;
7795
7796       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7797         {
7798           tail->need_adjustment
7799             = tail->charpos == (tail->insertion_type ? from : to);
7800           need_marker_adjustment |= tail->need_adjustment;
7801         }
7802     }
7803
7804   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7805     {
7806       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7807       set_buffer_internal (XBUFFER (coding->src_object));
7808       if (STRINGP (src_object))
7809         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7810       else if (BUFFERP (src_object))
7811         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7812       else
7813         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7814
7815       if (EQ (src_object, dst_object))
7816         {
7817           set_buffer_internal (XBUFFER (src_object));
7818           saved_pt = PT, saved_pt_byte = PT_BYTE;
7819           del_range_both (from, from_byte, to, to_byte, 1);
7820           set_buffer_internal (XBUFFER (coding->src_object));
7821         }
7822
7823       {
7824         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7825
7826         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7827                 old_deactivate_mark);
7828         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7829                     make_number (BEG), make_number (Z));
7830         UNGCPRO;
7831       }
7832       if (XBUFFER (coding->src_object) != current_buffer)
7833         kill_src_buffer = 1;
7834       coding->src_object = Fcurrent_buffer ();
7835       if (BEG != GPT)
7836         move_gap_both (BEG, BEG_BYTE);
7837       coding->src_chars = Z - BEG;
7838       coding->src_bytes = Z_BYTE - BEG_BYTE;
7839       coding->src_pos = BEG;
7840       coding->src_pos_byte = BEG_BYTE;
7841       coding->src_multibyte = Z < Z_BYTE;
7842     }
7843   else if (STRINGP (src_object))
7844     {
7845       code_conversion_save (0, 0);
7846       coding->src_pos = from;
7847       coding->src_pos_byte = from_byte;
7848     }
7849   else if (BUFFERP (src_object))
7850     {
7851       code_conversion_save (0, 0);
7852       set_buffer_internal (XBUFFER (src_object));
7853       if (EQ (src_object, dst_object))
7854         {
7855           saved_pt = PT, saved_pt_byte = PT_BYTE;
7856           coding->src_object = del_range_1 (from, to, 1, 1);
7857           coding->src_pos = 0;
7858           coding->src_pos_byte = 0;
7859         }
7860       else
7861         {
7862           if (from < GPT && to >= GPT)
7863             move_gap_both (from, from_byte);
7864           coding->src_pos = from;
7865           coding->src_pos_byte = from_byte;
7866         }
7867     }
7868   else
7869     code_conversion_save (0, 0);
7870
7871   if (BUFFERP (dst_object))
7872     {
7873       coding->dst_object = dst_object;
7874       if (EQ (src_object, dst_object))
7875         {
7876           coding->dst_pos = from;
7877           coding->dst_pos_byte = from_byte;
7878         }
7879       else
7880         {
7881           struct buffer *current = current_buffer;
7882
7883           set_buffer_temp (XBUFFER (dst_object));
7884           coding->dst_pos = PT;
7885           coding->dst_pos_byte = PT_BYTE;
7886           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7887           set_buffer_temp (current);
7888         }
7889       coding->dst_multibyte
7890         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7891     }
7892   else if (EQ (dst_object, Qt))
7893     {
7894       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7895       coding->dst_object = Qnil;
7896       coding->destination = xmalloc (dst_bytes);
7897       coding->dst_bytes = dst_bytes;
7898       coding->dst_multibyte = 0;
7899     }
7900   else
7901     {
7902       coding->dst_object = Qnil;
7903       coding->dst_multibyte = 0;
7904     }
7905
7906   encode_coding (coding);
7907
7908   if (EQ (dst_object, Qt))
7909     {
7910       if (BUFFERP (coding->dst_object))
7911         coding->dst_object = Fbuffer_string ();
7912       else
7913         {
7914           coding->dst_object
7915             = make_unibyte_string ((char *) coding->destination,
7916                                    coding->produced);
7917           xfree (coding->destination);
7918         }
7919     }
7920
7921   if (saved_pt >= 0)
7922     {
7923       /* This is the case of:
7924          (BUFFERP (src_object) && EQ (src_object, dst_object))
7925          As we have moved PT while replacing the original buffer
7926          contents, we must recover it now.  */
7927       set_buffer_internal (XBUFFER (src_object));
7928       if (saved_pt < from)
7929         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7930       else if (saved_pt < from + chars)
7931         TEMP_SET_PT_BOTH (from, from_byte);
7932       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7933         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7934                           saved_pt_byte + (coding->produced - bytes));
7935       else
7936         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7937                           saved_pt_byte + (coding->produced - bytes));
7938
7939       if (need_marker_adjustment)
7940         {
7941           struct Lisp_Marker *tail;
7942
7943           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7944             if (tail->need_adjustment)
7945               {
7946                 tail->need_adjustment = 0;
7947                 if (tail->insertion_type)
7948                   {
7949                     tail->bytepos = from_byte;
7950                     tail->charpos = from;
7951                   }
7952                 else
7953                   {
7954                     tail->bytepos = from_byte + coding->produced;
7955                     tail->charpos
7956                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7957                          ? tail->bytepos : from + coding->produced_char);
7958                   }
7959               }
7960         }
7961     }
7962
7963   if (kill_src_buffer)
7964     Fkill_buffer (coding->src_object);
7965
7966   Vdeactivate_mark = old_deactivate_mark;
7967   unbind_to (count, Qnil);
7968 }
7969
7970
7971 Lisp_Object
7972 preferred_coding_system (void)
7973 {
7974   int id = coding_categories[coding_priorities[0]].id;
7975
7976   return CODING_ID_NAME (id);
7977 }
7978
7979 #if defined (WINDOWSNT) || defined (CYGWIN)
7980
7981 Lisp_Object
7982 from_unicode (Lisp_Object str)
7983 {
7984   CHECK_STRING (str);
7985   if (!STRING_MULTIBYTE (str) &&
7986       SBYTES (str) & 1)
7987     {
7988       str = Fsubstring (str, make_number (0), make_number (-1));
7989     }
7990
7991   return code_convert_string_norecord (str, Qutf_16le, 0);
7992 }
7993
7994 wchar_t *
7995 to_unicode (Lisp_Object str, Lisp_Object *buf)
7996 {
7997   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7998   /* We need to make a another copy (in addition to the one made by
7999      code_convert_string_norecord) to ensure that the final string is
8000      _doubly_ zero terminated --- that is, that the string is
8001      terminated by two zero bytes and one utf-16le null character.
8002      Because strings are already terminated with a single zero byte,
8003      we just add one additional zero. */
8004   str = make_uninit_string (SBYTES (*buf) + 1);
8005   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8006   SDATA (str) [SBYTES (*buf)] = '\0';
8007   *buf = str;
8008   return WCSDATA (*buf);
8009 }
8010
8011 #endif /* WINDOWSNT || CYGWIN */
8012
8013 \f
8014 #ifdef emacs
8015 /*** 8. Emacs Lisp library functions ***/
8016
8017 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8018        doc: /* Return t if OBJECT is nil or a coding-system.
8019 See the documentation of `define-coding-system' for information
8020 about coding-system objects.  */)
8021   (Lisp_Object object)
8022 {
8023   if (NILP (object)
8024       || CODING_SYSTEM_ID (object) >= 0)
8025     return Qt;
8026   if (! SYMBOLP (object)
8027       || NILP (Fget (object, Qcoding_system_define_form)))
8028     return Qnil;
8029   return Qt;
8030 }
8031
8032 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8033        Sread_non_nil_coding_system, 1, 1, 0,
8034        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8035   (Lisp_Object prompt)
8036 {
8037   Lisp_Object val;
8038   do
8039     {
8040       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8041                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8042     }
8043   while (SCHARS (val) == 0);
8044   return (Fintern (val, Qnil));
8045 }
8046
8047 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8048        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8049 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8050 Ignores case when completing coding systems (all Emacs coding systems
8051 are lower-case).  */)
8052   (Lisp_Object prompt, Lisp_Object default_coding_system)
8053 {
8054   Lisp_Object val;
8055   ptrdiff_t count = SPECPDL_INDEX ();
8056
8057   if (SYMBOLP (default_coding_system))
8058     default_coding_system = SYMBOL_NAME (default_coding_system);
8059   specbind (Qcompletion_ignore_case, Qt);
8060   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8061                           Qt, Qnil, Qcoding_system_history,
8062                           default_coding_system, Qnil);
8063   unbind_to (count, Qnil);
8064   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8065 }
8066
8067 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8068        1, 1, 0,
8069        doc: /* Check validity of CODING-SYSTEM.
8070 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8071 It is valid if it is nil or a symbol defined as a coding system by the
8072 function `define-coding-system'.  */)
8073   (Lisp_Object coding_system)
8074 {
8075   Lisp_Object define_form;
8076
8077   define_form = Fget (coding_system, Qcoding_system_define_form);
8078   if (! NILP (define_form))
8079     {
8080       Fput (coding_system, Qcoding_system_define_form, Qnil);
8081       safe_eval (define_form);
8082     }
8083   if (!NILP (Fcoding_system_p (coding_system)))
8084     return coding_system;
8085   xsignal1 (Qcoding_system_error, coding_system);
8086 }
8087
8088 \f
8089 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8090    HIGHEST, return the coding system of the highest
8091    priority among the detected coding systems.  Otherwise return a
8092    list of detected coding systems sorted by their priorities.  If
8093    MULTIBYTEP, it is assumed that the bytes are in correct
8094    multibyte form but contains only ASCII and eight-bit chars.
8095    Otherwise, the bytes are raw bytes.
8096
8097    CODING-SYSTEM controls the detection as below:
8098
8099    If it is nil, detect both text-format and eol-format.  If the
8100    text-format part of CODING-SYSTEM is already specified
8101    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8102    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8103    detect only text-format.  */
8104
8105 Lisp_Object
8106 detect_coding_system (const unsigned char *src,
8107                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8108                       bool highest, bool multibytep,
8109                       Lisp_Object coding_system)
8110 {
8111   const unsigned char *src_end = src + src_bytes;
8112   Lisp_Object attrs, eol_type;
8113   Lisp_Object val = Qnil;
8114   struct coding_system coding;
8115   ptrdiff_t id;
8116   struct coding_detection_info detect_info;
8117   enum coding_category base_category;
8118   bool null_byte_found = 0, eight_bit_found = 0;
8119
8120   if (NILP (coding_system))
8121     coding_system = Qundecided;
8122   setup_coding_system (coding_system, &coding);
8123   attrs = CODING_ID_ATTRS (coding.id);
8124   eol_type = CODING_ID_EOL_TYPE (coding.id);
8125   coding_system = CODING_ATTR_BASE_NAME (attrs);
8126
8127   coding.source = src;
8128   coding.src_chars = src_chars;
8129   coding.src_bytes = src_bytes;
8130   coding.src_multibyte = multibytep;
8131   coding.consumed = 0;
8132   coding.mode |= CODING_MODE_LAST_BLOCK;
8133   coding.head_ascii = 0;
8134
8135   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8136
8137   /* At first, detect text-format if necessary.  */
8138   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8139   if (base_category == coding_category_undecided)
8140     {
8141       enum coding_category category IF_LINT (= 0);
8142       struct coding_system *this IF_LINT (= NULL);
8143       int c, i;
8144
8145       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8146       for (; src < src_end; src++)
8147         {
8148           c = *src;
8149           if (c & 0x80)
8150             {
8151               eight_bit_found = 1;
8152               if (null_byte_found)
8153                 break;
8154             }
8155           else if (c < 0x20)
8156             {
8157               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8158                   && ! inhibit_iso_escape_detection
8159                   && ! detect_info.checked)
8160                 {
8161                   if (detect_coding_iso_2022 (&coding, &detect_info))
8162                     {
8163                       /* We have scanned the whole data.  */
8164                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8165                         {
8166                           /* We didn't find an 8-bit code.  We may
8167                              have found a null-byte, but it's very
8168                              rare that a binary file confirm to
8169                              ISO-2022.  */
8170                           src = src_end;
8171                           coding.head_ascii = src - coding.source;
8172                         }
8173                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8174                       break;
8175                     }
8176                 }
8177               else if (! c && !inhibit_null_byte_detection)
8178                 {
8179                   null_byte_found = 1;
8180                   if (eight_bit_found)
8181                     break;
8182                 }
8183               if (! eight_bit_found)
8184                 coding.head_ascii++;
8185             }
8186           else if (! eight_bit_found)
8187             coding.head_ascii++;
8188         }
8189
8190       if (null_byte_found || eight_bit_found
8191           || coding.head_ascii < coding.src_bytes
8192           || detect_info.found)
8193         {
8194           if (coding.head_ascii == coding.src_bytes)
8195             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8196             for (i = 0; i < coding_category_raw_text; i++)
8197               {
8198                 category = coding_priorities[i];
8199                 this = coding_categories + category;
8200                 if (detect_info.found & (1 << category))
8201                   break;
8202               }
8203           else
8204             {
8205               if (null_byte_found)
8206                 {
8207                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8208                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8209                 }
8210               for (i = 0; i < coding_category_raw_text; i++)
8211                 {
8212                   category = coding_priorities[i];
8213                   this = coding_categories + category;
8214
8215                   if (this->id < 0)
8216                     {
8217                       /* No coding system of this category is defined.  */
8218                       detect_info.rejected |= (1 << category);
8219                     }
8220                   else if (category >= coding_category_raw_text)
8221                     continue;
8222                   else if (detect_info.checked & (1 << category))
8223                     {
8224                       if (highest
8225                           && (detect_info.found & (1 << category)))
8226                         break;
8227                     }
8228                   else if ((*(this->detector)) (&coding, &detect_info)
8229                            && highest
8230                            && (detect_info.found & (1 << category)))
8231                     {
8232                       if (category == coding_category_utf_16_auto)
8233                         {
8234                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8235                             category = coding_category_utf_16_le;
8236                           else
8237                             category = coding_category_utf_16_be;
8238                         }
8239                       break;
8240                     }
8241                 }
8242             }
8243         }
8244
8245       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8246           || null_byte_found)
8247         {
8248           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8249           id = CODING_SYSTEM_ID (Qno_conversion);
8250           val = Fcons (make_number (id), Qnil);
8251         }
8252       else if (! detect_info.rejected && ! detect_info.found)
8253         {
8254           detect_info.found = CATEGORY_MASK_ANY;
8255           id = coding_categories[coding_category_undecided].id;
8256           val = Fcons (make_number (id), Qnil);
8257         }
8258       else if (highest)
8259         {
8260           if (detect_info.found)
8261             {
8262               detect_info.found = 1 << category;
8263               val = Fcons (make_number (this->id), Qnil);
8264             }
8265           else
8266             for (i = 0; i < coding_category_raw_text; i++)
8267               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8268                 {
8269                   detect_info.found = 1 << coding_priorities[i];
8270                   id = coding_categories[coding_priorities[i]].id;
8271                   val = Fcons (make_number (id), Qnil);
8272                   break;
8273                 }
8274         }
8275       else
8276         {
8277           int mask = detect_info.rejected | detect_info.found;
8278           int found = 0;
8279
8280           for (i = coding_category_raw_text - 1; i >= 0; i--)
8281             {
8282               category = coding_priorities[i];
8283               if (! (mask & (1 << category)))
8284                 {
8285                   found |= 1 << category;
8286                   id = coding_categories[category].id;
8287                   if (id >= 0)
8288                     val = Fcons (make_number (id), val);
8289                 }
8290             }
8291           for (i = coding_category_raw_text - 1; i >= 0; i--)
8292             {
8293               category = coding_priorities[i];
8294               if (detect_info.found & (1 << category))
8295                 {
8296                   id = coding_categories[category].id;
8297                   val = Fcons (make_number (id), val);
8298                 }
8299             }
8300           detect_info.found |= found;
8301         }
8302     }
8303   else if (base_category == coding_category_utf_8_auto)
8304     {
8305       if (detect_coding_utf_8 (&coding, &detect_info))
8306         {
8307           struct coding_system *this;
8308
8309           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8310             this = coding_categories + coding_category_utf_8_sig;
8311           else
8312             this = coding_categories + coding_category_utf_8_nosig;
8313           val = Fcons (make_number (this->id), Qnil);
8314         }
8315     }
8316   else if (base_category == coding_category_utf_16_auto)
8317     {
8318       if (detect_coding_utf_16 (&coding, &detect_info))
8319         {
8320           struct coding_system *this;
8321
8322           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8323             this = coding_categories + coding_category_utf_16_le;
8324           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8325             this = coding_categories + coding_category_utf_16_be;
8326           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8327             this = coding_categories + coding_category_utf_16_be_nosig;
8328           else
8329             this = coding_categories + coding_category_utf_16_le_nosig;
8330           val = Fcons (make_number (this->id), Qnil);
8331         }
8332     }
8333   else
8334     {
8335       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8336       val = Fcons (make_number (coding.id), Qnil);
8337     }
8338
8339   /* Then, detect eol-format if necessary.  */
8340   {
8341     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8342     Lisp_Object tail;
8343
8344     if (VECTORP (eol_type))
8345       {
8346         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8347           {
8348             if (null_byte_found)
8349               normal_eol = EOL_SEEN_LF;
8350             else
8351               normal_eol = detect_eol (coding.source, src_bytes,
8352                                        coding_category_raw_text);
8353           }
8354         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8355                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8356           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8357                                       coding_category_utf_16_be);
8358         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8359                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8360           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8361                                       coding_category_utf_16_le);
8362       }
8363     else
8364       {
8365         if (EQ (eol_type, Qunix))
8366           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8367         else if (EQ (eol_type, Qdos))
8368           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8369         else
8370           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8371       }
8372
8373     for (tail = val; CONSP (tail); tail = XCDR (tail))
8374       {
8375         enum coding_category category;
8376         int this_eol;
8377
8378         id = XINT (XCAR (tail));
8379         attrs = CODING_ID_ATTRS (id);
8380         category = XINT (CODING_ATTR_CATEGORY (attrs));
8381         eol_type = CODING_ID_EOL_TYPE (id);
8382         if (VECTORP (eol_type))
8383           {
8384             if (category == coding_category_utf_16_be
8385                 || category == coding_category_utf_16_be_nosig)
8386               this_eol = utf_16_be_eol;
8387             else if (category == coding_category_utf_16_le
8388                      || category == coding_category_utf_16_le_nosig)
8389               this_eol = utf_16_le_eol;
8390             else
8391               this_eol = normal_eol;
8392
8393             if (this_eol == EOL_SEEN_LF)
8394               XSETCAR (tail, AREF (eol_type, 0));
8395             else if (this_eol == EOL_SEEN_CRLF)
8396               XSETCAR (tail, AREF (eol_type, 1));
8397             else if (this_eol == EOL_SEEN_CR)
8398               XSETCAR (tail, AREF (eol_type, 2));
8399             else
8400               XSETCAR (tail, CODING_ID_NAME (id));
8401           }
8402         else
8403           XSETCAR (tail, CODING_ID_NAME (id));
8404       }
8405   }
8406
8407   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8408 }
8409
8410
8411 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8412        2, 3, 0,
8413        doc: /* Detect coding system of the text in the region between START and END.
8414 Return a list of possible coding systems ordered by priority.
8415 The coding systems to try and their priorities follows what
8416 the function `coding-system-priority-list' (which see) returns.
8417
8418 If only ASCII characters are found (except for such ISO-2022 control
8419 characters as ESC), it returns a list of single element `undecided'
8420 or its subsidiary coding system according to a detected end-of-line
8421 format.
8422
8423 If optional argument HIGHEST is non-nil, return the coding system of
8424 highest priority.  */)
8425   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8426 {
8427   ptrdiff_t from, to;
8428   ptrdiff_t from_byte, to_byte;
8429
8430   CHECK_NUMBER_COERCE_MARKER (start);
8431   CHECK_NUMBER_COERCE_MARKER (end);
8432
8433   validate_region (&start, &end);
8434   from = XINT (start), to = XINT (end);
8435   from_byte = CHAR_TO_BYTE (from);
8436   to_byte = CHAR_TO_BYTE (to);
8437
8438   if (from < GPT && to >= GPT)
8439     move_gap_both (to, to_byte);
8440
8441   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8442                                to - from, to_byte - from_byte,
8443                                !NILP (highest),
8444                                !NILP (BVAR (current_buffer
8445                                       , enable_multibyte_characters)),
8446                                Qnil);
8447 }
8448
8449 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8450        1, 2, 0,
8451        doc: /* Detect coding system of the text in STRING.
8452 Return a list of possible coding systems ordered by priority.
8453 The coding systems to try and their priorities follows what
8454 the function `coding-system-priority-list' (which see) returns.
8455
8456 If only ASCII characters are found (except for such ISO-2022 control
8457 characters as ESC), it returns a list of single element `undecided'
8458 or its subsidiary coding system according to a detected end-of-line
8459 format.
8460
8461 If optional argument HIGHEST is non-nil, return the coding system of
8462 highest priority.  */)
8463   (Lisp_Object string, Lisp_Object highest)
8464 {
8465   CHECK_STRING (string);
8466
8467   return detect_coding_system (SDATA (string),
8468                                SCHARS (string), SBYTES (string),
8469                                !NILP (highest), STRING_MULTIBYTE (string),
8470                                Qnil);
8471 }
8472
8473
8474 static bool
8475 char_encodable_p (int c, Lisp_Object attrs)
8476 {
8477   Lisp_Object tail;
8478   struct charset *charset;
8479   Lisp_Object translation_table;
8480
8481   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8482   if (! NILP (translation_table))
8483     c = translate_char (translation_table, c);
8484   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8485        CONSP (tail); tail = XCDR (tail))
8486     {
8487       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8488       if (CHAR_CHARSET_P (c, charset))
8489         break;
8490     }
8491   return (! NILP (tail));
8492 }
8493
8494
8495 /* Return a list of coding systems that safely encode the text between
8496    START and END.  If EXCLUDE is non-nil, it is a list of coding
8497    systems not to check.  The returned list doesn't contain any such
8498    coding systems.  In any case, if the text contains only ASCII or is
8499    unibyte, return t.  */
8500
8501 DEFUN ("find-coding-systems-region-internal",
8502        Ffind_coding_systems_region_internal,
8503        Sfind_coding_systems_region_internal, 2, 3, 0,
8504        doc: /* Internal use only.  */)
8505   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8506 {
8507   Lisp_Object coding_attrs_list, safe_codings;
8508   ptrdiff_t start_byte, end_byte;
8509   const unsigned char *p, *pbeg, *pend;
8510   int c;
8511   Lisp_Object tail, elt, work_table;
8512
8513   if (STRINGP (start))
8514     {
8515       if (!STRING_MULTIBYTE (start)
8516           || SCHARS (start) == SBYTES (start))
8517         return Qt;
8518       start_byte = 0;
8519       end_byte = SBYTES (start);
8520     }
8521   else
8522     {
8523       CHECK_NUMBER_COERCE_MARKER (start);
8524       CHECK_NUMBER_COERCE_MARKER (end);
8525       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8526         args_out_of_range (start, end);
8527       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8528         return Qt;
8529       start_byte = CHAR_TO_BYTE (XINT (start));
8530       end_byte = CHAR_TO_BYTE (XINT (end));
8531       if (XINT (end) - XINT (start) == end_byte - start_byte)
8532         return Qt;
8533
8534       if (XINT (start) < GPT && XINT (end) > GPT)
8535         {
8536           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8537             move_gap_both (XINT (start), start_byte);
8538           else
8539             move_gap_both (XINT (end), end_byte);
8540         }
8541     }
8542
8543   coding_attrs_list = Qnil;
8544   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8545     if (NILP (exclude)
8546         || NILP (Fmemq (XCAR (tail), exclude)))
8547       {
8548         Lisp_Object attrs;
8549
8550         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8551         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8552             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8553           {
8554             ASET (attrs, coding_attr_trans_tbl,
8555                   get_translation_table (attrs, 1, NULL));
8556             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8557           }
8558       }
8559
8560   if (STRINGP (start))
8561     p = pbeg = SDATA (start);
8562   else
8563     p = pbeg = BYTE_POS_ADDR (start_byte);
8564   pend = p + (end_byte - start_byte);
8565
8566   while (p < pend && ASCII_BYTE_P (*p)) p++;
8567   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8568
8569   work_table = Fmake_char_table (Qnil, Qnil);
8570   while (p < pend)
8571     {
8572       if (ASCII_BYTE_P (*p))
8573         p++;
8574       else
8575         {
8576           c = STRING_CHAR_ADVANCE (p);
8577           if (!NILP (char_table_ref (work_table, c)))
8578             /* This character was already checked.  Ignore it.  */
8579             continue;
8580
8581           charset_map_loaded = 0;
8582           for (tail = coding_attrs_list; CONSP (tail);)
8583             {
8584               elt = XCAR (tail);
8585               if (NILP (elt))
8586                 tail = XCDR (tail);
8587               else if (char_encodable_p (c, elt))
8588                 tail = XCDR (tail);
8589               else if (CONSP (XCDR (tail)))
8590                 {
8591                   XSETCAR (tail, XCAR (XCDR (tail)));
8592                   XSETCDR (tail, XCDR (XCDR (tail)));
8593                 }
8594               else
8595                 {
8596                   XSETCAR (tail, Qnil);
8597                   tail = XCDR (tail);
8598                 }
8599             }
8600           if (charset_map_loaded)
8601             {
8602               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8603
8604               if (STRINGP (start))
8605                 pbeg = SDATA (start);
8606               else
8607                 pbeg = BYTE_POS_ADDR (start_byte);
8608               p = pbeg + p_offset;
8609               pend = pbeg + pend_offset;
8610             }
8611           char_table_set (work_table, c, Qt);
8612         }
8613     }
8614
8615   safe_codings = list2 (Qraw_text, Qno_conversion);
8616   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8617     if (! NILP (XCAR (tail)))
8618       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8619
8620   return safe_codings;
8621 }
8622
8623
8624 DEFUN ("unencodable-char-position", Funencodable_char_position,
8625        Sunencodable_char_position, 3, 5, 0,
8626        doc: /*
8627 Return position of first un-encodable character in a region.
8628 START and END specify the region and CODING-SYSTEM specifies the
8629 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8630
8631 If optional 4th argument COUNT is non-nil, it specifies at most how
8632 many un-encodable characters to search.  In this case, the value is a
8633 list of positions.
8634
8635 If optional 5th argument STRING is non-nil, it is a string to search
8636 for un-encodable characters.  In that case, START and END are indexes
8637 to the string.  */)
8638   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8639 {
8640   EMACS_INT n;
8641   struct coding_system coding;
8642   Lisp_Object attrs, charset_list, translation_table;
8643   Lisp_Object positions;
8644   ptrdiff_t from, to;
8645   const unsigned char *p, *stop, *pend;
8646   bool ascii_compatible;
8647
8648   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8649   attrs = CODING_ID_ATTRS (coding.id);
8650   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8651     return Qnil;
8652   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8653   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8654   translation_table = get_translation_table (attrs, 1, NULL);
8655
8656   if (NILP (string))
8657     {
8658       validate_region (&start, &end);
8659       from = XINT (start);
8660       to = XINT (end);
8661       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8662           || (ascii_compatible
8663               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8664         return Qnil;
8665       p = CHAR_POS_ADDR (from);
8666       pend = CHAR_POS_ADDR (to);
8667       if (from < GPT && to >= GPT)
8668         stop = GPT_ADDR;
8669       else
8670         stop = pend;
8671     }
8672   else
8673     {
8674       CHECK_STRING (string);
8675       CHECK_NATNUM (start);
8676       CHECK_NATNUM (end);
8677       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8678         args_out_of_range_3 (string, start, end);
8679       from = XINT (start);
8680       to = XINT (end);
8681       if (! STRING_MULTIBYTE (string))
8682         return Qnil;
8683       p = SDATA (string) + string_char_to_byte (string, from);
8684       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8685       if (ascii_compatible && (to - from) == (pend - p))
8686         return Qnil;
8687     }
8688
8689   if (NILP (count))
8690     n = 1;
8691   else
8692     {
8693       CHECK_NATNUM (count);
8694       n = XINT (count);
8695     }
8696
8697   positions = Qnil;
8698   charset_map_loaded = 0;
8699   while (1)
8700     {
8701       int c;
8702
8703       if (ascii_compatible)
8704         while (p < stop && ASCII_BYTE_P (*p))
8705           p++, from++;
8706       if (p >= stop)
8707         {
8708           if (p >= pend)
8709             break;
8710           stop = pend;
8711           p = GAP_END_ADDR;
8712         }
8713
8714       c = STRING_CHAR_ADVANCE (p);
8715       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8716           && ! char_charset (translate_char (translation_table, c),
8717                              charset_list, NULL))
8718         {
8719           positions = Fcons (make_number (from), positions);
8720           n--;
8721           if (n == 0)
8722             break;
8723         }
8724
8725       from++;
8726       if (charset_map_loaded && NILP (string))
8727         {
8728           p = CHAR_POS_ADDR (from);
8729           pend = CHAR_POS_ADDR (to);
8730           if (from < GPT && to >= GPT)
8731             stop = GPT_ADDR;
8732           else
8733             stop = pend;
8734           charset_map_loaded = 0;
8735         }
8736     }
8737
8738   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8739 }
8740
8741
8742 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8743        Scheck_coding_systems_region, 3, 3, 0,
8744        doc: /* Check if the region is encodable by coding systems.
8745
8746 START and END are buffer positions specifying the region.
8747 CODING-SYSTEM-LIST is a list of coding systems to check.
8748
8749 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8750 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8751 whole region, POS0, POS1, ... are buffer positions where non-encodable
8752 characters are found.
8753
8754 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8755 value is nil.
8756
8757 START may be a string.  In that case, check if the string is
8758 encodable, and the value contains indices to the string instead of
8759 buffer positions.  END is ignored.
8760
8761 If the current buffer (or START if it is a string) is unibyte, the value
8762 is nil.  */)
8763   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8764 {
8765   Lisp_Object list;
8766   ptrdiff_t start_byte, end_byte;
8767   ptrdiff_t pos;
8768   const unsigned char *p, *pbeg, *pend;
8769   int c;
8770   Lisp_Object tail, elt, attrs;
8771
8772   if (STRINGP (start))
8773     {
8774       if (!STRING_MULTIBYTE (start)
8775           || SCHARS (start) == SBYTES (start))
8776         return Qnil;
8777       start_byte = 0;
8778       end_byte = SBYTES (start);
8779       pos = 0;
8780     }
8781   else
8782     {
8783       CHECK_NUMBER_COERCE_MARKER (start);
8784       CHECK_NUMBER_COERCE_MARKER (end);
8785       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8786         args_out_of_range (start, end);
8787       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8788         return Qnil;
8789       start_byte = CHAR_TO_BYTE (XINT (start));
8790       end_byte = CHAR_TO_BYTE (XINT (end));
8791       if (XINT (end) - XINT (start) == end_byte - start_byte)
8792         return Qnil;
8793
8794       if (XINT (start) < GPT && XINT (end) > GPT)
8795         {
8796           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8797             move_gap_both (XINT (start), start_byte);
8798           else
8799             move_gap_both (XINT (end), end_byte);
8800         }
8801       pos = XINT (start);
8802     }
8803
8804   list = Qnil;
8805   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8806     {
8807       elt = XCAR (tail);
8808       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8809       ASET (attrs, coding_attr_trans_tbl,
8810             get_translation_table (attrs, 1, NULL));
8811       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8812     }
8813
8814   if (STRINGP (start))
8815     p = pbeg = SDATA (start);
8816   else
8817     p = pbeg = BYTE_POS_ADDR (start_byte);
8818   pend = p + (end_byte - start_byte);
8819
8820   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8821   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8822
8823   while (p < pend)
8824     {
8825       if (ASCII_BYTE_P (*p))
8826         p++;
8827       else
8828         {
8829           c = STRING_CHAR_ADVANCE (p);
8830
8831           charset_map_loaded = 0;
8832           for (tail = list; CONSP (tail); tail = XCDR (tail))
8833             {
8834               elt = XCDR (XCAR (tail));
8835               if (! char_encodable_p (c, XCAR (elt)))
8836                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8837             }
8838           if (charset_map_loaded)
8839             {
8840               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8841
8842               if (STRINGP (start))
8843                 pbeg = SDATA (start);
8844               else
8845                 pbeg = BYTE_POS_ADDR (start_byte);
8846               p = pbeg + p_offset;
8847               pend = pbeg + pend_offset;
8848             }
8849         }
8850       pos++;
8851     }
8852
8853   tail = list;
8854   list = Qnil;
8855   for (; CONSP (tail); tail = XCDR (tail))
8856     {
8857       elt = XCAR (tail);
8858       if (CONSP (XCDR (XCDR (elt))))
8859         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8860                       list);
8861     }
8862
8863   return list;
8864 }
8865
8866
8867 static Lisp_Object
8868 code_convert_region (Lisp_Object start, Lisp_Object end,
8869                      Lisp_Object coding_system, Lisp_Object dst_object,
8870                      bool encodep, bool norecord)
8871 {
8872   struct coding_system coding;
8873   ptrdiff_t from, from_byte, to, to_byte;
8874   Lisp_Object src_object;
8875
8876   CHECK_NUMBER_COERCE_MARKER (start);
8877   CHECK_NUMBER_COERCE_MARKER (end);
8878   if (NILP (coding_system))
8879     coding_system = Qno_conversion;
8880   else
8881     CHECK_CODING_SYSTEM (coding_system);
8882   src_object = Fcurrent_buffer ();
8883   if (NILP (dst_object))
8884     dst_object = src_object;
8885   else if (! EQ (dst_object, Qt))
8886     CHECK_BUFFER (dst_object);
8887
8888   validate_region (&start, &end);
8889   from = XFASTINT (start);
8890   from_byte = CHAR_TO_BYTE (from);
8891   to = XFASTINT (end);
8892   to_byte = CHAR_TO_BYTE (to);
8893
8894   setup_coding_system (coding_system, &coding);
8895   coding.mode |= CODING_MODE_LAST_BLOCK;
8896
8897   if (encodep)
8898     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8899                           dst_object);
8900   else
8901     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8902                           dst_object);
8903   if (! norecord)
8904     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8905
8906   return (BUFFERP (dst_object)
8907           ? make_number (coding.produced_char)
8908           : coding.dst_object);
8909 }
8910
8911
8912 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8913        3, 4, "r\nzCoding system: ",
8914        doc: /* Decode the current region from the specified coding system.
8915 When called from a program, takes four arguments:
8916         START, END, CODING-SYSTEM, and DESTINATION.
8917 START and END are buffer positions.
8918
8919 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8920 If nil, the region between START and END is replaced by the decoded text.
8921 If buffer, the decoded text is inserted in that buffer after point (point
8922 does not move).
8923 In those cases, the length of the decoded text is returned.
8924 If DESTINATION is t, the decoded text is returned.
8925
8926 This function sets `last-coding-system-used' to the precise coding system
8927 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8928 not fully specified.)  */)
8929   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8930 {
8931   return code_convert_region (start, end, coding_system, destination, 0, 0);
8932 }
8933
8934 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8935        3, 4, "r\nzCoding system: ",
8936        doc: /* Encode the current region by specified coding system.
8937 When called from a program, takes four arguments:
8938         START, END, CODING-SYSTEM and DESTINATION.
8939 START and END are buffer positions.
8940
8941 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8942 If nil, the region between START and END is replace by the encoded text.
8943 If buffer, the encoded text is inserted in that buffer after point (point
8944 does not move).
8945 In those cases, the length of the encoded text is returned.
8946 If DESTINATION is t, the encoded text is returned.
8947
8948 This function sets `last-coding-system-used' to the precise coding system
8949 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8950 not fully specified.)  */)
8951   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8952 {
8953   return code_convert_region (start, end, coding_system, destination, 1, 0);
8954 }
8955
8956 Lisp_Object
8957 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8958                      Lisp_Object dst_object, bool encodep, bool nocopy,
8959                      bool norecord)
8960 {
8961   struct coding_system coding;
8962   ptrdiff_t chars, bytes;
8963
8964   CHECK_STRING (string);
8965   if (NILP (coding_system))
8966     {
8967       if (! norecord)
8968         Vlast_coding_system_used = Qno_conversion;
8969       if (NILP (dst_object))
8970         return (nocopy ? Fcopy_sequence (string) : string);
8971     }
8972
8973   if (NILP (coding_system))
8974     coding_system = Qno_conversion;
8975   else
8976     CHECK_CODING_SYSTEM (coding_system);
8977   if (NILP (dst_object))
8978     dst_object = Qt;
8979   else if (! EQ (dst_object, Qt))
8980     CHECK_BUFFER (dst_object);
8981
8982   setup_coding_system (coding_system, &coding);
8983   coding.mode |= CODING_MODE_LAST_BLOCK;
8984   chars = SCHARS (string);
8985   bytes = SBYTES (string);
8986   if (encodep)
8987     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8988   else
8989     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8990   if (! norecord)
8991     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8992
8993   return (BUFFERP (dst_object)
8994           ? make_number (coding.produced_char)
8995           : coding.dst_object);
8996 }
8997
8998
8999 /* Encode or decode STRING according to CODING_SYSTEM.
9000    Do not set Vlast_coding_system_used.
9001
9002    This function is called only from macros DECODE_FILE and
9003    ENCODE_FILE, thus we ignore character composition.  */
9004
9005 Lisp_Object
9006 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9007                               bool encodep)
9008 {
9009   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9010 }
9011
9012
9013 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9014        2, 4, 0,
9015        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9016
9017 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9018 if the decoding operation is trivial.
9019
9020 Optional fourth arg BUFFER non-nil means that the decoded text is
9021 inserted in that buffer after point (point does not move).  In this
9022 case, the return value is the length of the decoded text.
9023
9024 This function sets `last-coding-system-used' to the precise coding system
9025 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9026 not fully specified.)  */)
9027   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9028 {
9029   return code_convert_string (string, coding_system, buffer,
9030                               0, ! NILP (nocopy), 0);
9031 }
9032
9033 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9034        2, 4, 0,
9035        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9036
9037 Optional third arg NOCOPY non-nil means it is OK to return STRING
9038 itself if the encoding operation is trivial.
9039
9040 Optional fourth arg BUFFER non-nil means that the encoded text is
9041 inserted in that buffer after point (point does not move).  In this
9042 case, the return value is the length of the encoded text.
9043
9044 This function sets `last-coding-system-used' to the precise coding system
9045 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9046 not fully specified.)  */)
9047   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9048 {
9049   return code_convert_string (string, coding_system, buffer,
9050                               1, ! NILP (nocopy), 0);
9051 }
9052
9053 \f
9054 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9055        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9056 Return the corresponding character.  */)
9057   (Lisp_Object code)
9058 {
9059   Lisp_Object spec, attrs, val;
9060   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9061   EMACS_INT ch;
9062   int c;
9063
9064   CHECK_NATNUM (code);
9065   ch = XFASTINT (code);
9066   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9067   attrs = AREF (spec, 0);
9068
9069   if (ASCII_BYTE_P (ch)
9070       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9071     return code;
9072
9073   val = CODING_ATTR_CHARSET_LIST (attrs);
9074   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9075   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9076   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9077
9078   if (ch <= 0x7F)
9079     {
9080       c = ch;
9081       charset = charset_roman;
9082     }
9083   else if (ch >= 0xA0 && ch < 0xDF)
9084     {
9085       c = ch - 0x80;
9086       charset = charset_kana;
9087     }
9088   else
9089     {
9090       EMACS_INT c1 = ch >> 8;
9091       int c2 = ch & 0xFF;
9092
9093       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9094           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9095         error ("Invalid code: %"pI"d", ch);
9096       c = ch;
9097       SJIS_TO_JIS (c);
9098       charset = charset_kanji;
9099     }
9100   c = DECODE_CHAR (charset, c);
9101   if (c < 0)
9102     error ("Invalid code: %"pI"d", ch);
9103   return make_number (c);
9104 }
9105
9106
9107 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9108        doc: /* Encode a Japanese character CH to shift_jis encoding.
9109 Return the corresponding code in SJIS.  */)
9110   (Lisp_Object ch)
9111 {
9112   Lisp_Object spec, attrs, charset_list;
9113   int c;
9114   struct charset *charset;
9115   unsigned code;
9116
9117   CHECK_CHARACTER (ch);
9118   c = XFASTINT (ch);
9119   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9120   attrs = AREF (spec, 0);
9121
9122   if (ASCII_CHAR_P (c)
9123       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9124     return ch;
9125
9126   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9127   charset = char_charset (c, charset_list, &code);
9128   if (code == CHARSET_INVALID_CODE (charset))
9129     error ("Can't encode by shift_jis encoding: %c", c);
9130   JIS_TO_SJIS (code);
9131
9132   return make_number (code);
9133 }
9134
9135 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9136        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9137 Return the corresponding character.  */)
9138   (Lisp_Object code)
9139 {
9140   Lisp_Object spec, attrs, val;
9141   struct charset *charset_roman, *charset_big5, *charset;
9142   EMACS_INT ch;
9143   int c;
9144
9145   CHECK_NATNUM (code);
9146   ch = XFASTINT (code);
9147   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9148   attrs = AREF (spec, 0);
9149
9150   if (ASCII_BYTE_P (ch)
9151       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9152     return code;
9153
9154   val = CODING_ATTR_CHARSET_LIST (attrs);
9155   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9156   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9157
9158   if (ch <= 0x7F)
9159     {
9160       c = ch;
9161       charset = charset_roman;
9162     }
9163   else
9164     {
9165       EMACS_INT b1 = ch >> 8;
9166       int b2 = ch & 0x7F;
9167       if (b1 < 0xA1 || b1 > 0xFE
9168           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9169         error ("Invalid code: %"pI"d", ch);
9170       c = ch;
9171       charset = charset_big5;
9172     }
9173   c = DECODE_CHAR (charset, c);
9174   if (c < 0)
9175     error ("Invalid code: %"pI"d", ch);
9176   return make_number (c);
9177 }
9178
9179 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9180        doc: /* Encode the Big5 character CH to BIG5 coding system.
9181 Return the corresponding character code in Big5.  */)
9182   (Lisp_Object ch)
9183 {
9184   Lisp_Object spec, attrs, charset_list;
9185   struct charset *charset;
9186   int c;
9187   unsigned code;
9188
9189   CHECK_CHARACTER (ch);
9190   c = XFASTINT (ch);
9191   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9192   attrs = AREF (spec, 0);
9193   if (ASCII_CHAR_P (c)
9194       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9195     return ch;
9196
9197   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9198   charset = char_charset (c, charset_list, &code);
9199   if (code == CHARSET_INVALID_CODE (charset))
9200     error ("Can't encode by Big5 encoding: %c", c);
9201
9202   return make_number (code);
9203 }
9204
9205 \f
9206 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9207        Sset_terminal_coding_system_internal, 1, 2, 0,
9208        doc: /* Internal use only.  */)
9209   (Lisp_Object coding_system, Lisp_Object terminal)
9210 {
9211   struct terminal *term = get_terminal (terminal, 1);
9212   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9213   CHECK_SYMBOL (coding_system);
9214   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9215   /* We had better not send unsafe characters to terminal.  */
9216   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9217   /* Character composition should be disabled.  */
9218   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9219   terminal_coding->src_multibyte = 1;
9220   terminal_coding->dst_multibyte = 0;
9221   tset_charset_list
9222     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9223             ? coding_charset_list (terminal_coding)
9224             : Fcons (make_number (charset_ascii), Qnil)));
9225   return Qnil;
9226 }
9227
9228 DEFUN ("set-safe-terminal-coding-system-internal",
9229        Fset_safe_terminal_coding_system_internal,
9230        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9231        doc: /* Internal use only.  */)
9232   (Lisp_Object coding_system)
9233 {
9234   CHECK_SYMBOL (coding_system);
9235   setup_coding_system (Fcheck_coding_system (coding_system),
9236                        &safe_terminal_coding);
9237   /* Character composition should be disabled.  */
9238   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9239   safe_terminal_coding.src_multibyte = 1;
9240   safe_terminal_coding.dst_multibyte = 0;
9241   return Qnil;
9242 }
9243
9244 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9245        Sterminal_coding_system, 0, 1, 0,
9246        doc: /* Return coding system specified for terminal output on the given terminal.
9247 TERMINAL may be a terminal object, a frame, or nil for the selected
9248 frame's terminal device.  */)
9249   (Lisp_Object terminal)
9250 {
9251   struct coding_system *terminal_coding
9252     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9253   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9254
9255   /* For backward compatibility, return nil if it is `undecided'.  */
9256   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9257 }
9258
9259 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9260        Sset_keyboard_coding_system_internal, 1, 2, 0,
9261        doc: /* Internal use only.  */)
9262   (Lisp_Object coding_system, Lisp_Object terminal)
9263 {
9264   struct terminal *t = get_terminal (terminal, 1);
9265   CHECK_SYMBOL (coding_system);
9266   if (NILP (coding_system))
9267     coding_system = Qno_conversion;
9268   else
9269     Fcheck_coding_system (coding_system);
9270   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9271   /* Character composition should be disabled.  */
9272   TERMINAL_KEYBOARD_CODING (t)->common_flags
9273     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9274   return Qnil;
9275 }
9276
9277 DEFUN ("keyboard-coding-system",
9278        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9279        doc: /* Return coding system specified for decoding keyboard input.  */)
9280   (Lisp_Object terminal)
9281 {
9282   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9283                          (get_terminal (terminal, 1))->id);
9284 }
9285
9286 \f
9287 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9288        Sfind_operation_coding_system,  1, MANY, 0,
9289        doc: /* Choose a coding system for an operation based on the target name.
9290 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9291 DECODING-SYSTEM is the coding system to use for decoding
9292 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9293 for encoding (in case OPERATION does encoding).
9294
9295 The first argument OPERATION specifies an I/O primitive:
9296   For file I/O, `insert-file-contents' or `write-region'.
9297   For process I/O, `call-process', `call-process-region', or `start-process'.
9298   For network I/O, `open-network-stream'.
9299
9300 The remaining arguments should be the same arguments that were passed
9301 to the primitive.  Depending on which primitive, one of those arguments
9302 is selected as the TARGET.  For example, if OPERATION does file I/O,
9303 whichever argument specifies the file name is TARGET.
9304
9305 TARGET has a meaning which depends on OPERATION:
9306   For file I/O, TARGET is a file name (except for the special case below).
9307   For process I/O, TARGET is a process name.
9308   For network I/O, TARGET is a service name or a port number.
9309
9310 This function looks up what is specified for TARGET in
9311 `file-coding-system-alist', `process-coding-system-alist',
9312 or `network-coding-system-alist' depending on OPERATION.
9313 They may specify a coding system, a cons of coding systems,
9314 or a function symbol to call.
9315 In the last case, we call the function with one argument,
9316 which is a list of all the arguments given to this function.
9317 If the function can't decide a coding system, it can return
9318 `undecided' so that the normal code-detection is performed.
9319
9320 If OPERATION is `insert-file-contents', the argument corresponding to
9321 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9322 file name to look up, and BUFFER is a buffer that contains the file's
9323 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9324 function to call for FILENAME, that function should examine the
9325 contents of BUFFER instead of reading the file.
9326
9327 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9328   (ptrdiff_t nargs, Lisp_Object *args)
9329 {
9330   Lisp_Object operation, target_idx, target, val;
9331   register Lisp_Object chain;
9332
9333   if (nargs < 2)
9334     error ("Too few arguments");
9335   operation = args[0];
9336   if (!SYMBOLP (operation)
9337       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9338     error ("Invalid first argument");
9339   if (nargs <= 1 + XFASTINT (target_idx))
9340     error ("Too few arguments for operation `%s'",
9341            SDATA (SYMBOL_NAME (operation)));
9342   target = args[XFASTINT (target_idx) + 1];
9343   if (!(STRINGP (target)
9344         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9345             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9346         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9347     error ("Invalid argument %"pI"d of operation `%s'",
9348            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9349   if (CONSP (target))
9350     target = XCAR (target);
9351
9352   chain = ((EQ (operation, Qinsert_file_contents)
9353             || EQ (operation, Qwrite_region))
9354            ? Vfile_coding_system_alist
9355            : (EQ (operation, Qopen_network_stream)
9356               ? Vnetwork_coding_system_alist
9357               : Vprocess_coding_system_alist));
9358   if (NILP (chain))
9359     return Qnil;
9360
9361   for (; CONSP (chain); chain = XCDR (chain))
9362     {
9363       Lisp_Object elt;
9364
9365       elt = XCAR (chain);
9366       if (CONSP (elt)
9367           && ((STRINGP (target)
9368                && STRINGP (XCAR (elt))
9369                && fast_string_match (XCAR (elt), target) >= 0)
9370               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9371         {
9372           val = XCDR (elt);
9373           /* Here, if VAL is both a valid coding system and a valid
9374              function symbol, we return VAL as a coding system.  */
9375           if (CONSP (val))
9376             return val;
9377           if (! SYMBOLP (val))
9378             return Qnil;
9379           if (! NILP (Fcoding_system_p (val)))
9380             return Fcons (val, val);
9381           if (! NILP (Ffboundp (val)))
9382             {
9383               /* We use call1 rather than safe_call1
9384                  so as to get bug reports about functions called here
9385                  which don't handle the current interface.  */
9386               val = call1 (val, Flist (nargs, args));
9387               if (CONSP (val))
9388                 return val;
9389               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9390                 return Fcons (val, val);
9391             }
9392           return Qnil;
9393         }
9394     }
9395   return Qnil;
9396 }
9397
9398 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9399        Sset_coding_system_priority, 0, MANY, 0,
9400        doc: /* Assign higher priority to the coding systems given as arguments.
9401 If multiple coding systems belong to the same category,
9402 all but the first one are ignored.
9403
9404 usage: (set-coding-system-priority &rest coding-systems)  */)
9405   (ptrdiff_t nargs, Lisp_Object *args)
9406 {
9407   ptrdiff_t i, j;
9408   bool changed[coding_category_max];
9409   enum coding_category priorities[coding_category_max];
9410
9411   memset (changed, 0, sizeof changed);
9412
9413   for (i = j = 0; i < nargs; i++)
9414     {
9415       enum coding_category category;
9416       Lisp_Object spec, attrs;
9417
9418       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9419       attrs = AREF (spec, 0);
9420       category = XINT (CODING_ATTR_CATEGORY (attrs));
9421       if (changed[category])
9422         /* Ignore this coding system because a coding system of the
9423            same category already had a higher priority.  */
9424         continue;
9425       changed[category] = 1;
9426       priorities[j++] = category;
9427       if (coding_categories[category].id >= 0
9428           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9429         setup_coding_system (args[i], &coding_categories[category]);
9430       Fset (AREF (Vcoding_category_table, category), args[i]);
9431     }
9432
9433   /* Now we have decided top J priorities.  Reflect the order of the
9434      original priorities to the remaining priorities.  */
9435
9436   for (i = j, j = 0; i < coding_category_max; i++, j++)
9437     {
9438       while (j < coding_category_max
9439              && changed[coding_priorities[j]])
9440         j++;
9441       if (j == coding_category_max)
9442         emacs_abort ();
9443       priorities[i] = coding_priorities[j];
9444     }
9445
9446   memcpy (coding_priorities, priorities, sizeof priorities);
9447
9448   /* Update `coding-category-list'.  */
9449   Vcoding_category_list = Qnil;
9450   for (i = coding_category_max; i-- > 0; )
9451     Vcoding_category_list
9452       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9453                Vcoding_category_list);
9454
9455   return Qnil;
9456 }
9457
9458 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9459        Scoding_system_priority_list, 0, 1, 0,
9460        doc: /* Return a list of coding systems ordered by their priorities.
9461 The list contains a subset of coding systems; i.e. coding systems
9462 assigned to each coding category (see `coding-category-list').
9463
9464 HIGHESTP non-nil means just return the highest priority one.  */)
9465   (Lisp_Object highestp)
9466 {
9467   int i;
9468   Lisp_Object val;
9469
9470   for (i = 0, val = Qnil; i < coding_category_max; i++)
9471     {
9472       enum coding_category category = coding_priorities[i];
9473       int id = coding_categories[category].id;
9474       Lisp_Object attrs;
9475
9476       if (id < 0)
9477         continue;
9478       attrs = CODING_ID_ATTRS (id);
9479       if (! NILP (highestp))
9480         return CODING_ATTR_BASE_NAME (attrs);
9481       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9482     }
9483   return Fnreverse (val);
9484 }
9485
9486 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9487
9488 static Lisp_Object
9489 make_subsidiaries (Lisp_Object base)
9490 {
9491   Lisp_Object subsidiaries;
9492   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9493   char *buf = alloca (base_name_len + 6);
9494   int i;
9495
9496   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9497   subsidiaries = Fmake_vector (make_number (3), Qnil);
9498   for (i = 0; i < 3; i++)
9499     {
9500       strcpy (buf + base_name_len, suffixes[i]);
9501       ASET (subsidiaries, i, intern (buf));
9502     }
9503   return subsidiaries;
9504 }
9505
9506
9507 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9508        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9509        doc: /* For internal use only.
9510 usage: (define-coding-system-internal ...)  */)
9511   (ptrdiff_t nargs, Lisp_Object *args)
9512 {
9513   Lisp_Object name;
9514   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9515   Lisp_Object attrs;            /* Vector of attributes.  */
9516   Lisp_Object eol_type;
9517   Lisp_Object aliases;
9518   Lisp_Object coding_type, charset_list, safe_charsets;
9519   enum coding_category category;
9520   Lisp_Object tail, val;
9521   int max_charset_id = 0;
9522   int i;
9523
9524   if (nargs < coding_arg_max)
9525     goto short_args;
9526
9527   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9528
9529   name = args[coding_arg_name];
9530   CHECK_SYMBOL (name);
9531   ASET (attrs, coding_attr_base_name, name);
9532
9533   val = args[coding_arg_mnemonic];
9534   if (! STRINGP (val))
9535     CHECK_CHARACTER (val);
9536   ASET (attrs, coding_attr_mnemonic, val);
9537
9538   coding_type = args[coding_arg_coding_type];
9539   CHECK_SYMBOL (coding_type);
9540   ASET (attrs, coding_attr_type, coding_type);
9541
9542   charset_list = args[coding_arg_charset_list];
9543   if (SYMBOLP (charset_list))
9544     {
9545       if (EQ (charset_list, Qiso_2022))
9546         {
9547           if (! EQ (coding_type, Qiso_2022))
9548             error ("Invalid charset-list");
9549           charset_list = Viso_2022_charset_list;
9550         }
9551       else if (EQ (charset_list, Qemacs_mule))
9552         {
9553           if (! EQ (coding_type, Qemacs_mule))
9554             error ("Invalid charset-list");
9555           charset_list = Vemacs_mule_charset_list;
9556         }
9557       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9558         {
9559           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9560             error ("Invalid charset-list");
9561           if (max_charset_id < XFASTINT (XCAR (tail)))
9562             max_charset_id = XFASTINT (XCAR (tail));
9563         }
9564     }
9565   else
9566     {
9567       charset_list = Fcopy_sequence (charset_list);
9568       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9569         {
9570           struct charset *charset;
9571
9572           val = XCAR (tail);
9573           CHECK_CHARSET_GET_CHARSET (val, charset);
9574           if (EQ (coding_type, Qiso_2022)
9575               ? CHARSET_ISO_FINAL (charset) < 0
9576               : EQ (coding_type, Qemacs_mule)
9577               ? CHARSET_EMACS_MULE_ID (charset) < 0
9578               : 0)
9579             error ("Can't handle charset `%s'",
9580                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9581
9582           XSETCAR (tail, make_number (charset->id));
9583           if (max_charset_id < charset->id)
9584             max_charset_id = charset->id;
9585         }
9586     }
9587   ASET (attrs, coding_attr_charset_list, charset_list);
9588
9589   safe_charsets = make_uninit_string (max_charset_id + 1);
9590   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9591   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9592     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9593   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9594
9595   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9596
9597   val = args[coding_arg_decode_translation_table];
9598   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9599     CHECK_SYMBOL (val);
9600   ASET (attrs, coding_attr_decode_tbl, val);
9601
9602   val = args[coding_arg_encode_translation_table];
9603   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9604     CHECK_SYMBOL (val);
9605   ASET (attrs, coding_attr_encode_tbl, val);
9606
9607   val = args[coding_arg_post_read_conversion];
9608   CHECK_SYMBOL (val);
9609   ASET (attrs, coding_attr_post_read, val);
9610
9611   val = args[coding_arg_pre_write_conversion];
9612   CHECK_SYMBOL (val);
9613   ASET (attrs, coding_attr_pre_write, val);
9614
9615   val = args[coding_arg_default_char];
9616   if (NILP (val))
9617     ASET (attrs, coding_attr_default_char, make_number (' '));
9618   else
9619     {
9620       CHECK_CHARACTER (val);
9621       ASET (attrs, coding_attr_default_char, val);
9622     }
9623
9624   val = args[coding_arg_for_unibyte];
9625   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9626
9627   val = args[coding_arg_plist];
9628   CHECK_LIST (val);
9629   ASET (attrs, coding_attr_plist, val);
9630
9631   if (EQ (coding_type, Qcharset))
9632     {
9633       /* Generate a lisp vector of 256 elements.  Each element is nil,
9634          integer, or a list of charset IDs.
9635
9636          If Nth element is nil, the byte code N is invalid in this
9637          coding system.
9638
9639          If Nth element is a number NUM, N is the first byte of a
9640          charset whose ID is NUM.
9641
9642          If Nth element is a list of charset IDs, N is the first byte
9643          of one of them.  The list is sorted by dimensions of the
9644          charsets.  A charset of smaller dimension comes first. */
9645       val = Fmake_vector (make_number (256), Qnil);
9646
9647       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9648         {
9649           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9650           int dim = CHARSET_DIMENSION (charset);
9651           int idx = (dim - 1) * 4;
9652
9653           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9654             ASET (attrs, coding_attr_ascii_compat, Qt);
9655
9656           for (i = charset->code_space[idx];
9657                i <= charset->code_space[idx + 1]; i++)
9658             {
9659               Lisp_Object tmp, tmp2;
9660               int dim2;
9661
9662               tmp = AREF (val, i);
9663               if (NILP (tmp))
9664                 tmp = XCAR (tail);
9665               else if (NUMBERP (tmp))
9666                 {
9667                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9668                   if (dim < dim2)
9669                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9670                   else
9671                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9672                 }
9673               else
9674                 {
9675                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9676                     {
9677                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9678                       if (dim < dim2)
9679                         break;
9680                     }
9681                   if (NILP (tmp2))
9682                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9683                   else
9684                     {
9685                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9686                       XSETCAR (tmp2, XCAR (tail));
9687                     }
9688                 }
9689               ASET (val, i, tmp);
9690             }
9691         }
9692       ASET (attrs, coding_attr_charset_valids, val);
9693       category = coding_category_charset;
9694     }
9695   else if (EQ (coding_type, Qccl))
9696     {
9697       Lisp_Object valids;
9698
9699       if (nargs < coding_arg_ccl_max)
9700         goto short_args;
9701
9702       val = args[coding_arg_ccl_decoder];
9703       CHECK_CCL_PROGRAM (val);
9704       if (VECTORP (val))
9705         val = Fcopy_sequence (val);
9706       ASET (attrs, coding_attr_ccl_decoder, val);
9707
9708       val = args[coding_arg_ccl_encoder];
9709       CHECK_CCL_PROGRAM (val);
9710       if (VECTORP (val))
9711         val = Fcopy_sequence (val);
9712       ASET (attrs, coding_attr_ccl_encoder, val);
9713
9714       val = args[coding_arg_ccl_valids];
9715       valids = Fmake_string (make_number (256), make_number (0));
9716       for (tail = val; CONSP (tail); tail = XCDR (tail))
9717         {
9718           int from, to;
9719
9720           val = XCAR (tail);
9721           if (INTEGERP (val))
9722             {
9723               if (! (0 <= XINT (val) && XINT (val) <= 255))
9724                 args_out_of_range_3 (val, make_number (0), make_number (255));
9725               from = to = XINT (val);
9726             }
9727           else
9728             {
9729               CHECK_CONS (val);
9730               CHECK_NATNUM_CAR (val);
9731               CHECK_NUMBER_CDR (val);
9732               if (XINT (XCAR (val)) > 255)
9733                 args_out_of_range_3 (XCAR (val),
9734                                      make_number (0), make_number (255));
9735               from = XINT (XCAR (val));
9736               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9737                 args_out_of_range_3 (XCDR (val),
9738                                      XCAR (val), make_number (255));
9739               to = XINT (XCDR (val));
9740             }
9741           for (i = from; i <= to; i++)
9742             SSET (valids, i, 1);
9743         }
9744       ASET (attrs, coding_attr_ccl_valids, valids);
9745
9746       category = coding_category_ccl;
9747     }
9748   else if (EQ (coding_type, Qutf_16))
9749     {
9750       Lisp_Object bom, endian;
9751
9752       ASET (attrs, coding_attr_ascii_compat, Qnil);
9753
9754       if (nargs < coding_arg_utf16_max)
9755         goto short_args;
9756
9757       bom = args[coding_arg_utf16_bom];
9758       if (! NILP (bom) && ! EQ (bom, Qt))
9759         {
9760           CHECK_CONS (bom);
9761           val = XCAR (bom);
9762           CHECK_CODING_SYSTEM (val);
9763           val = XCDR (bom);
9764           CHECK_CODING_SYSTEM (val);
9765         }
9766       ASET (attrs, coding_attr_utf_bom, bom);
9767
9768       endian = args[coding_arg_utf16_endian];
9769       CHECK_SYMBOL (endian);
9770       if (NILP (endian))
9771         endian = Qbig;
9772       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9773         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9774       ASET (attrs, coding_attr_utf_16_endian, endian);
9775
9776       category = (CONSP (bom)
9777                   ? coding_category_utf_16_auto
9778                   : NILP (bom)
9779                   ? (EQ (endian, Qbig)
9780                      ? coding_category_utf_16_be_nosig
9781                      : coding_category_utf_16_le_nosig)
9782                   : (EQ (endian, Qbig)
9783                      ? coding_category_utf_16_be
9784                      : coding_category_utf_16_le));
9785     }
9786   else if (EQ (coding_type, Qiso_2022))
9787     {
9788       Lisp_Object initial, reg_usage, request, flags;
9789
9790       if (nargs < coding_arg_iso2022_max)
9791         goto short_args;
9792
9793       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9794       CHECK_VECTOR (initial);
9795       for (i = 0; i < 4; i++)
9796         {
9797           val = Faref (initial, make_number (i));
9798           if (! NILP (val))
9799             {
9800               struct charset *charset;
9801
9802               CHECK_CHARSET_GET_CHARSET (val, charset);
9803               ASET (initial, i, make_number (CHARSET_ID (charset)));
9804               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9805                 ASET (attrs, coding_attr_ascii_compat, Qt);
9806             }
9807           else
9808             ASET (initial, i, make_number (-1));
9809         }
9810
9811       reg_usage = args[coding_arg_iso2022_reg_usage];
9812       CHECK_CONS (reg_usage);
9813       CHECK_NUMBER_CAR (reg_usage);
9814       CHECK_NUMBER_CDR (reg_usage);
9815
9816       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9817       for (tail = request; CONSP (tail); tail = XCDR (tail))
9818         {
9819           int id;
9820           Lisp_Object tmp1;
9821
9822           val = XCAR (tail);
9823           CHECK_CONS (val);
9824           tmp1 = XCAR (val);
9825           CHECK_CHARSET_GET_ID (tmp1, id);
9826           CHECK_NATNUM_CDR (val);
9827           if (XINT (XCDR (val)) >= 4)
9828             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9829           XSETCAR (val, make_number (id));
9830         }
9831
9832       flags = args[coding_arg_iso2022_flags];
9833       CHECK_NATNUM (flags);
9834       i = XINT (flags) & INT_MAX;
9835       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9836         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9837       flags = make_number (i);
9838
9839       ASET (attrs, coding_attr_iso_initial, initial);
9840       ASET (attrs, coding_attr_iso_usage, reg_usage);
9841       ASET (attrs, coding_attr_iso_request, request);
9842       ASET (attrs, coding_attr_iso_flags, flags);
9843       setup_iso_safe_charsets (attrs);
9844
9845       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9846         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9847                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9848                     ? coding_category_iso_7_else
9849                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9850                     ? coding_category_iso_7
9851                     : coding_category_iso_7_tight);
9852       else
9853         {
9854           int id = XINT (AREF (initial, 1));
9855
9856           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9857                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9858                        || id < 0)
9859                       ? coding_category_iso_8_else
9860                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9861                       ? coding_category_iso_8_1
9862                       : coding_category_iso_8_2);
9863         }
9864       if (category != coding_category_iso_8_1
9865           && category != coding_category_iso_8_2)
9866         ASET (attrs, coding_attr_ascii_compat, Qnil);
9867     }
9868   else if (EQ (coding_type, Qemacs_mule))
9869     {
9870       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9871         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9872       ASET (attrs, coding_attr_ascii_compat, Qt);
9873       category = coding_category_emacs_mule;
9874     }
9875   else if (EQ (coding_type, Qshift_jis))
9876     {
9877
9878       struct charset *charset;
9879
9880       if (XINT (Flength (charset_list)) != 3
9881           && XINT (Flength (charset_list)) != 4)
9882         error ("There should be three or four charsets");
9883
9884       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9885       if (CHARSET_DIMENSION (charset) != 1)
9886         error ("Dimension of charset %s is not one",
9887                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9888       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9889         ASET (attrs, coding_attr_ascii_compat, Qt);
9890
9891       charset_list = XCDR (charset_list);
9892       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9893       if (CHARSET_DIMENSION (charset) != 1)
9894         error ("Dimension of charset %s is not one",
9895                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9896
9897       charset_list = XCDR (charset_list);
9898       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9899       if (CHARSET_DIMENSION (charset) != 2)
9900         error ("Dimension of charset %s is not two",
9901                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9902
9903       charset_list = XCDR (charset_list);
9904       if (! NILP (charset_list))
9905         {
9906           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9907           if (CHARSET_DIMENSION (charset) != 2)
9908             error ("Dimension of charset %s is not two",
9909                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9910         }
9911
9912       category = coding_category_sjis;
9913       Vsjis_coding_system = name;
9914     }
9915   else if (EQ (coding_type, Qbig5))
9916     {
9917       struct charset *charset;
9918
9919       if (XINT (Flength (charset_list)) != 2)
9920         error ("There should be just two charsets");
9921
9922       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9923       if (CHARSET_DIMENSION (charset) != 1)
9924         error ("Dimension of charset %s is not one",
9925                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9926       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9927         ASET (attrs, coding_attr_ascii_compat, Qt);
9928
9929       charset_list = XCDR (charset_list);
9930       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9931       if (CHARSET_DIMENSION (charset) != 2)
9932         error ("Dimension of charset %s is not two",
9933                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9934
9935       category = coding_category_big5;
9936       Vbig5_coding_system = name;
9937     }
9938   else if (EQ (coding_type, Qraw_text))
9939     {
9940       category = coding_category_raw_text;
9941       ASET (attrs, coding_attr_ascii_compat, Qt);
9942     }
9943   else if (EQ (coding_type, Qutf_8))
9944     {
9945       Lisp_Object bom;
9946
9947       if (nargs < coding_arg_utf8_max)
9948         goto short_args;
9949
9950       bom = args[coding_arg_utf8_bom];
9951       if (! NILP (bom) && ! EQ (bom, Qt))
9952         {
9953           CHECK_CONS (bom);
9954           val = XCAR (bom);
9955           CHECK_CODING_SYSTEM (val);
9956           val = XCDR (bom);
9957           CHECK_CODING_SYSTEM (val);
9958         }
9959       ASET (attrs, coding_attr_utf_bom, bom);
9960       if (NILP (bom))
9961         ASET (attrs, coding_attr_ascii_compat, Qt);
9962
9963       category = (CONSP (bom) ? coding_category_utf_8_auto
9964                   : NILP (bom) ? coding_category_utf_8_nosig
9965                   : coding_category_utf_8_sig);
9966     }
9967   else if (EQ (coding_type, Qundecided))
9968     category = coding_category_undecided;
9969   else
9970     error ("Invalid coding system type: %s",
9971            SDATA (SYMBOL_NAME (coding_type)));
9972
9973   ASET (attrs, coding_attr_category, make_number (category));
9974   ASET (attrs, coding_attr_plist,
9975         Fcons (QCcategory,
9976                Fcons (AREF (Vcoding_category_table, category),
9977                       CODING_ATTR_PLIST (attrs))));
9978   ASET (attrs, coding_attr_plist,
9979         Fcons (QCascii_compatible_p,
9980                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9981                       CODING_ATTR_PLIST (attrs))));
9982
9983   eol_type = args[coding_arg_eol_type];
9984   if (! NILP (eol_type)
9985       && ! EQ (eol_type, Qunix)
9986       && ! EQ (eol_type, Qdos)
9987       && ! EQ (eol_type, Qmac))
9988     error ("Invalid eol-type");
9989
9990   aliases = Fcons (name, Qnil);
9991
9992   if (NILP (eol_type))
9993     {
9994       eol_type = make_subsidiaries (name);
9995       for (i = 0; i < 3; i++)
9996         {
9997           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9998
9999           this_name = AREF (eol_type, i);
10000           this_aliases = Fcons (this_name, Qnil);
10001           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10002           this_spec = Fmake_vector (make_number (3), attrs);
10003           ASET (this_spec, 1, this_aliases);
10004           ASET (this_spec, 2, this_eol_type);
10005           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10006           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10007           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10008           if (NILP (val))
10009             Vcoding_system_alist
10010               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10011                        Vcoding_system_alist);
10012         }
10013     }
10014
10015   spec_vec = Fmake_vector (make_number (3), attrs);
10016   ASET (spec_vec, 1, aliases);
10017   ASET (spec_vec, 2, eol_type);
10018
10019   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10020   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10021   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10022   if (NILP (val))
10023     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10024                                   Vcoding_system_alist);
10025
10026   {
10027     int id = coding_categories[category].id;
10028
10029     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10030       setup_coding_system (name, &coding_categories[category]);
10031   }
10032
10033   return Qnil;
10034
10035  short_args:
10036   return Fsignal (Qwrong_number_of_arguments,
10037                   Fcons (intern ("define-coding-system-internal"),
10038                          make_number (nargs)));
10039 }
10040
10041
10042 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10043        3, 3, 0,
10044        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10045   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10046 {
10047   Lisp_Object spec, attrs;
10048
10049   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10050   attrs = AREF (spec, 0);
10051   if (EQ (prop, QCmnemonic))
10052     {
10053       if (! STRINGP (val))
10054         CHECK_CHARACTER (val);
10055       ASET (attrs, coding_attr_mnemonic, val);
10056     }
10057   else if (EQ (prop, QCdefault_char))
10058     {
10059       if (NILP (val))
10060         val = make_number (' ');
10061       else
10062         CHECK_CHARACTER (val);
10063       ASET (attrs, coding_attr_default_char, val);
10064     }
10065   else if (EQ (prop, QCdecode_translation_table))
10066     {
10067       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10068         CHECK_SYMBOL (val);
10069       ASET (attrs, coding_attr_decode_tbl, val);
10070     }
10071   else if (EQ (prop, QCencode_translation_table))
10072     {
10073       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10074         CHECK_SYMBOL (val);
10075       ASET (attrs, coding_attr_encode_tbl, val);
10076     }
10077   else if (EQ (prop, QCpost_read_conversion))
10078     {
10079       CHECK_SYMBOL (val);
10080       ASET (attrs, coding_attr_post_read, val);
10081     }
10082   else if (EQ (prop, QCpre_write_conversion))
10083     {
10084       CHECK_SYMBOL (val);
10085       ASET (attrs, coding_attr_pre_write, val);
10086     }
10087   else if (EQ (prop, QCascii_compatible_p))
10088     {
10089       ASET (attrs, coding_attr_ascii_compat, val);
10090     }
10091
10092   ASET (attrs, coding_attr_plist,
10093         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10094   return val;
10095 }
10096
10097
10098 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10099        Sdefine_coding_system_alias, 2, 2, 0,
10100        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10101   (Lisp_Object alias, Lisp_Object coding_system)
10102 {
10103   Lisp_Object spec, aliases, eol_type, val;
10104
10105   CHECK_SYMBOL (alias);
10106   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10107   aliases = AREF (spec, 1);
10108   /* ALIASES should be a list of length more than zero, and the first
10109      element is a base coding system.  Append ALIAS at the tail of the
10110      list.  */
10111   while (!NILP (XCDR (aliases)))
10112     aliases = XCDR (aliases);
10113   XSETCDR (aliases, Fcons (alias, Qnil));
10114
10115   eol_type = AREF (spec, 2);
10116   if (VECTORP (eol_type))
10117     {
10118       Lisp_Object subsidiaries;
10119       int i;
10120
10121       subsidiaries = make_subsidiaries (alias);
10122       for (i = 0; i < 3; i++)
10123         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10124                                      AREF (eol_type, i));
10125     }
10126
10127   Fputhash (alias, spec, Vcoding_system_hash_table);
10128   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10129   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10130   if (NILP (val))
10131     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10132                                   Vcoding_system_alist);
10133
10134   return Qnil;
10135 }
10136
10137 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10138        1, 1, 0,
10139        doc: /* Return the base of CODING-SYSTEM.
10140 Any alias or subsidiary coding system is not a base coding system.  */)
10141   (Lisp_Object coding_system)
10142 {
10143   Lisp_Object spec, attrs;
10144
10145   if (NILP (coding_system))
10146     return (Qno_conversion);
10147   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10148   attrs = AREF (spec, 0);
10149   return CODING_ATTR_BASE_NAME (attrs);
10150 }
10151
10152 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10153        1, 1, 0,
10154        doc: "Return the property list of CODING-SYSTEM.")
10155   (Lisp_Object coding_system)
10156 {
10157   Lisp_Object spec, attrs;
10158
10159   if (NILP (coding_system))
10160     coding_system = Qno_conversion;
10161   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10162   attrs = AREF (spec, 0);
10163   return CODING_ATTR_PLIST (attrs);
10164 }
10165
10166
10167 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10168        1, 1, 0,
10169        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10170   (Lisp_Object coding_system)
10171 {
10172   Lisp_Object spec;
10173
10174   if (NILP (coding_system))
10175     coding_system = Qno_conversion;
10176   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10177   return AREF (spec, 1);
10178 }
10179
10180 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10181        Scoding_system_eol_type, 1, 1, 0,
10182        doc: /* Return eol-type of CODING-SYSTEM.
10183 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10184
10185 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10186 and CR respectively.
10187
10188 A vector value indicates that a format of end-of-line should be
10189 detected automatically.  Nth element of the vector is the subsidiary
10190 coding system whose eol-type is N.  */)
10191   (Lisp_Object coding_system)
10192 {
10193   Lisp_Object spec, eol_type;
10194   int n;
10195
10196   if (NILP (coding_system))
10197     coding_system = Qno_conversion;
10198   if (! CODING_SYSTEM_P (coding_system))
10199     return Qnil;
10200   spec = CODING_SYSTEM_SPEC (coding_system);
10201   eol_type = AREF (spec, 2);
10202   if (VECTORP (eol_type))
10203     return Fcopy_sequence (eol_type);
10204   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10205   return make_number (n);
10206 }
10207
10208 #endif /* emacs */
10209
10210 \f
10211 /*** 9. Post-amble ***/
10212
10213 void
10214 init_coding_once (void)
10215 {
10216   int i;
10217
10218   for (i = 0; i < coding_category_max; i++)
10219     {
10220       coding_categories[i].id = -1;
10221       coding_priorities[i] = i;
10222     }
10223
10224   /* ISO2022 specific initialize routine.  */
10225   for (i = 0; i < 0x20; i++)
10226     iso_code_class[i] = ISO_control_0;
10227   for (i = 0x21; i < 0x7F; i++)
10228     iso_code_class[i] = ISO_graphic_plane_0;
10229   for (i = 0x80; i < 0xA0; i++)
10230     iso_code_class[i] = ISO_control_1;
10231   for (i = 0xA1; i < 0xFF; i++)
10232     iso_code_class[i] = ISO_graphic_plane_1;
10233   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10234   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10235   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10236   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10237   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10238   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10239   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10240   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10241   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10242
10243   for (i = 0; i < 256; i++)
10244     {
10245       emacs_mule_bytes[i] = 1;
10246     }
10247   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10248   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10249   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10250   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10251 }
10252
10253 #ifdef emacs
10254
10255 void
10256 syms_of_coding (void)
10257 {
10258   staticpro (&Vcoding_system_hash_table);
10259   {
10260     Lisp_Object args[2];
10261     args[0] = QCtest;
10262     args[1] = Qeq;
10263     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10264   }
10265
10266   staticpro (&Vsjis_coding_system);
10267   Vsjis_coding_system = Qnil;
10268
10269   staticpro (&Vbig5_coding_system);
10270   Vbig5_coding_system = Qnil;
10271
10272   staticpro (&Vcode_conversion_reused_workbuf);
10273   Vcode_conversion_reused_workbuf = Qnil;
10274
10275   staticpro (&Vcode_conversion_workbuf_name);
10276   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10277
10278   reused_workbuf_in_use = 0;
10279
10280   DEFSYM (Qcharset, "charset");
10281   DEFSYM (Qtarget_idx, "target-idx");
10282   DEFSYM (Qcoding_system_history, "coding-system-history");
10283   Fset (Qcoding_system_history, Qnil);
10284
10285   /* Target FILENAME is the first argument.  */
10286   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10287   /* Target FILENAME is the third argument.  */
10288   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10289
10290   DEFSYM (Qcall_process, "call-process");
10291   /* Target PROGRAM is the first argument.  */
10292   Fput (Qcall_process, Qtarget_idx, make_number (0));
10293
10294   DEFSYM (Qcall_process_region, "call-process-region");
10295   /* Target PROGRAM is the third argument.  */
10296   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10297
10298   DEFSYM (Qstart_process, "start-process");
10299   /* Target PROGRAM is the third argument.  */
10300   Fput (Qstart_process, Qtarget_idx, make_number (2));
10301
10302   DEFSYM (Qopen_network_stream, "open-network-stream");
10303   /* Target SERVICE is the fourth argument.  */
10304   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10305
10306   DEFSYM (Qcoding_system, "coding-system");
10307   DEFSYM (Qcoding_aliases, "coding-aliases");
10308
10309   DEFSYM (Qeol_type, "eol-type");
10310   DEFSYM (Qunix, "unix");
10311   DEFSYM (Qdos, "dos");
10312
10313   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10314   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10315   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10316   DEFSYM (Qdefault_char, "default-char");
10317   DEFSYM (Qundecided, "undecided");
10318   DEFSYM (Qno_conversion, "no-conversion");
10319   DEFSYM (Qraw_text, "raw-text");
10320
10321   DEFSYM (Qiso_2022, "iso-2022");
10322
10323   DEFSYM (Qutf_8, "utf-8");
10324   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10325
10326 #if defined (WINDOWSNT) || defined (CYGWIN)
10327   /* No, not utf-16-le: that one has a BOM.  */
10328   DEFSYM (Qutf_16le, "utf-16le");
10329 #endif
10330
10331   DEFSYM (Qutf_16, "utf-16");
10332   DEFSYM (Qbig, "big");
10333   DEFSYM (Qlittle, "little");
10334
10335   DEFSYM (Qshift_jis, "shift-jis");
10336   DEFSYM (Qbig5, "big5");
10337
10338   DEFSYM (Qcoding_system_p, "coding-system-p");
10339
10340   DEFSYM (Qcoding_system_error, "coding-system-error");
10341   Fput (Qcoding_system_error, Qerror_conditions,
10342         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10343   Fput (Qcoding_system_error, Qerror_message,
10344         build_pure_c_string ("Invalid coding system"));
10345
10346   /* Intern this now in case it isn't already done.
10347      Setting this variable twice is harmless.
10348      But don't staticpro it here--that is done in alloc.c.  */
10349   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10350
10351   DEFSYM (Qtranslation_table, "translation-table");
10352   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10353   DEFSYM (Qtranslation_table_id, "translation-table-id");
10354   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10355   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10356
10357   DEFSYM (Qvalid_codes, "valid-codes");
10358
10359   DEFSYM (Qemacs_mule, "emacs-mule");
10360
10361   DEFSYM (QCcategory, ":category");
10362   DEFSYM (QCmnemonic, ":mnemonic");
10363   DEFSYM (QCdefault_char, ":default-char");
10364   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10365   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10366   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10367   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10368   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10369
10370   Vcoding_category_table
10371     = Fmake_vector (make_number (coding_category_max), Qnil);
10372   staticpro (&Vcoding_category_table);
10373   /* Followings are target of code detection.  */
10374   ASET (Vcoding_category_table, coding_category_iso_7,
10375         intern_c_string ("coding-category-iso-7"));
10376   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10377         intern_c_string ("coding-category-iso-7-tight"));
10378   ASET (Vcoding_category_table, coding_category_iso_8_1,
10379         intern_c_string ("coding-category-iso-8-1"));
10380   ASET (Vcoding_category_table, coding_category_iso_8_2,
10381         intern_c_string ("coding-category-iso-8-2"));
10382   ASET (Vcoding_category_table, coding_category_iso_7_else,
10383         intern_c_string ("coding-category-iso-7-else"));
10384   ASET (Vcoding_category_table, coding_category_iso_8_else,
10385         intern_c_string ("coding-category-iso-8-else"));
10386   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10387         intern_c_string ("coding-category-utf-8-auto"));
10388   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10389         intern_c_string ("coding-category-utf-8"));
10390   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10391         intern_c_string ("coding-category-utf-8-sig"));
10392   ASET (Vcoding_category_table, coding_category_utf_16_be,
10393         intern_c_string ("coding-category-utf-16-be"));
10394   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10395         intern_c_string ("coding-category-utf-16-auto"));
10396   ASET (Vcoding_category_table, coding_category_utf_16_le,
10397         intern_c_string ("coding-category-utf-16-le"));
10398   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10399         intern_c_string ("coding-category-utf-16-be-nosig"));
10400   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10401         intern_c_string ("coding-category-utf-16-le-nosig"));
10402   ASET (Vcoding_category_table, coding_category_charset,
10403         intern_c_string ("coding-category-charset"));
10404   ASET (Vcoding_category_table, coding_category_sjis,
10405         intern_c_string ("coding-category-sjis"));
10406   ASET (Vcoding_category_table, coding_category_big5,
10407         intern_c_string ("coding-category-big5"));
10408   ASET (Vcoding_category_table, coding_category_ccl,
10409         intern_c_string ("coding-category-ccl"));
10410   ASET (Vcoding_category_table, coding_category_emacs_mule,
10411         intern_c_string ("coding-category-emacs-mule"));
10412   /* Followings are NOT target of code detection.  */
10413   ASET (Vcoding_category_table, coding_category_raw_text,
10414         intern_c_string ("coding-category-raw-text"));
10415   ASET (Vcoding_category_table, coding_category_undecided,
10416         intern_c_string ("coding-category-undecided"));
10417
10418   DEFSYM (Qinsufficient_source, "insufficient-source");
10419   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10420   DEFSYM (Qinvalid_source, "invalid-source");
10421   DEFSYM (Qinterrupted, "interrupted");
10422   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10423   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10424
10425   defsubr (&Scoding_system_p);
10426   defsubr (&Sread_coding_system);
10427   defsubr (&Sread_non_nil_coding_system);
10428   defsubr (&Scheck_coding_system);
10429   defsubr (&Sdetect_coding_region);
10430   defsubr (&Sdetect_coding_string);
10431   defsubr (&Sfind_coding_systems_region_internal);
10432   defsubr (&Sunencodable_char_position);
10433   defsubr (&Scheck_coding_systems_region);
10434   defsubr (&Sdecode_coding_region);
10435   defsubr (&Sencode_coding_region);
10436   defsubr (&Sdecode_coding_string);
10437   defsubr (&Sencode_coding_string);
10438   defsubr (&Sdecode_sjis_char);
10439   defsubr (&Sencode_sjis_char);
10440   defsubr (&Sdecode_big5_char);
10441   defsubr (&Sencode_big5_char);
10442   defsubr (&Sset_terminal_coding_system_internal);
10443   defsubr (&Sset_safe_terminal_coding_system_internal);
10444   defsubr (&Sterminal_coding_system);
10445   defsubr (&Sset_keyboard_coding_system_internal);
10446   defsubr (&Skeyboard_coding_system);
10447   defsubr (&Sfind_operation_coding_system);
10448   defsubr (&Sset_coding_system_priority);
10449   defsubr (&Sdefine_coding_system_internal);
10450   defsubr (&Sdefine_coding_system_alias);
10451   defsubr (&Scoding_system_put);
10452   defsubr (&Scoding_system_base);
10453   defsubr (&Scoding_system_plist);
10454   defsubr (&Scoding_system_aliases);
10455   defsubr (&Scoding_system_eol_type);
10456   defsubr (&Scoding_system_priority_list);
10457
10458   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10459                doc: /* List of coding systems.
10460
10461 Do not alter the value of this variable manually.  This variable should be
10462 updated by the functions `define-coding-system' and
10463 `define-coding-system-alias'.  */);
10464   Vcoding_system_list = Qnil;
10465
10466   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10467                doc: /* Alist of coding system names.
10468 Each element is one element list of coding system name.
10469 This variable is given to `completing-read' as COLLECTION argument.
10470
10471 Do not alter the value of this variable manually.  This variable should be
10472 updated by the functions `make-coding-system' and
10473 `define-coding-system-alias'.  */);
10474   Vcoding_system_alist = Qnil;
10475
10476   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10477                doc: /* List of coding-categories (symbols) ordered by priority.
10478
10479 On detecting a coding system, Emacs tries code detection algorithms
10480 associated with each coding-category one by one in this order.  When
10481 one algorithm agrees with a byte sequence of source text, the coding
10482 system bound to the corresponding coding-category is selected.
10483
10484 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10485   {
10486     int i;
10487
10488     Vcoding_category_list = Qnil;
10489     for (i = coding_category_max - 1; i >= 0; i--)
10490       Vcoding_category_list
10491         = Fcons (AREF (Vcoding_category_table, i),
10492                  Vcoding_category_list);
10493   }
10494
10495   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10496                doc: /* Specify the coding system for read operations.
10497 It is useful to bind this variable with `let', but do not set it globally.
10498 If the value is a coding system, it is used for decoding on read operation.
10499 If not, an appropriate element is used from one of the coding system alists.
10500 There are three such tables: `file-coding-system-alist',
10501 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10502   Vcoding_system_for_read = Qnil;
10503
10504   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10505                doc: /* Specify the coding system for write operations.
10506 Programs bind this variable with `let', but you should not set it globally.
10507 If the value is a coding system, it is used for encoding of output,
10508 when writing it to a file and when sending it to a file or subprocess.
10509
10510 If this does not specify a coding system, an appropriate element
10511 is used from one of the coding system alists.
10512 There are three such tables: `file-coding-system-alist',
10513 `process-coding-system-alist', and `network-coding-system-alist'.
10514 For output to files, if the above procedure does not specify a coding system,
10515 the value of `buffer-file-coding-system' is used.  */);
10516   Vcoding_system_for_write = Qnil;
10517
10518   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10519                doc: /*
10520 Coding system used in the latest file or process I/O.  */);
10521   Vlast_coding_system_used = Qnil;
10522
10523   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10524                doc: /*
10525 Error status of the last code conversion.
10526
10527 When an error was detected in the last code conversion, this variable
10528 is set to one of the following symbols.
10529   `insufficient-source'
10530   `inconsistent-eol'
10531   `invalid-source'
10532   `interrupted'
10533   `insufficient-memory'
10534 When no error was detected, the value doesn't change.  So, to check
10535 the error status of a code conversion by this variable, you must
10536 explicitly set this variable to nil before performing code
10537 conversion.  */);
10538   Vlast_code_conversion_error = Qnil;
10539
10540   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10541                doc: /*
10542 *Non-nil means always inhibit code conversion of end-of-line format.
10543 See info node `Coding Systems' and info node `Text and Binary' concerning
10544 such conversion.  */);
10545   inhibit_eol_conversion = 0;
10546
10547   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10548                doc: /*
10549 Non-nil means process buffer inherits coding system of process output.
10550 Bind it to t if the process output is to be treated as if it were a file
10551 read from some filesystem.  */);
10552   inherit_process_coding_system = 0;
10553
10554   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10555                doc: /*
10556 Alist to decide a coding system to use for a file I/O operation.
10557 The format is ((PATTERN . VAL) ...),
10558 where PATTERN is a regular expression matching a file name,
10559 VAL is a coding system, a cons of coding systems, or a function symbol.
10560 If VAL is a coding system, it is used for both decoding and encoding
10561 the file contents.
10562 If VAL is a cons of coding systems, the car part is used for decoding,
10563 and the cdr part is used for encoding.
10564 If VAL is a function symbol, the function must return a coding system
10565 or a cons of coding systems which are used as above.  The function is
10566 called with an argument that is a list of the arguments with which
10567 `find-operation-coding-system' was called.  If the function can't decide
10568 a coding system, it can return `undecided' so that the normal
10569 code-detection is performed.
10570
10571 See also the function `find-operation-coding-system'
10572 and the variable `auto-coding-alist'.  */);
10573   Vfile_coding_system_alist = Qnil;
10574
10575   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10576                doc: /*
10577 Alist to decide a coding system to use for a process I/O operation.
10578 The format is ((PATTERN . VAL) ...),
10579 where PATTERN is a regular expression matching a program name,
10580 VAL is a coding system, a cons of coding systems, or a function symbol.
10581 If VAL is a coding system, it is used for both decoding what received
10582 from the program and encoding what sent to the program.
10583 If VAL is a cons of coding systems, the car part is used for decoding,
10584 and the cdr part is used for encoding.
10585 If VAL is a function symbol, the function must return a coding system
10586 or a cons of coding systems which are used as above.
10587
10588 See also the function `find-operation-coding-system'.  */);
10589   Vprocess_coding_system_alist = Qnil;
10590
10591   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10592                doc: /*
10593 Alist to decide a coding system to use for a network I/O operation.
10594 The format is ((PATTERN . VAL) ...),
10595 where PATTERN is a regular expression matching a network service name
10596 or is a port number to connect to,
10597 VAL is a coding system, a cons of coding systems, or a function symbol.
10598 If VAL is a coding system, it is used for both decoding what received
10599 from the network stream and encoding what sent to the network stream.
10600 If VAL is a cons of coding systems, the car part is used for decoding,
10601 and the cdr part is used for encoding.
10602 If VAL is a function symbol, the function must return a coding system
10603 or a cons of coding systems which are used as above.
10604
10605 See also the function `find-operation-coding-system'.  */);
10606   Vnetwork_coding_system_alist = Qnil;
10607
10608   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10609                doc: /* Coding system to use with system messages.
10610 Also used for decoding keyboard input on X Window system.  */);
10611   Vlocale_coding_system = Qnil;
10612
10613   /* The eol mnemonics are reset in startup.el system-dependently.  */
10614   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10615                doc: /*
10616 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10617   eol_mnemonic_unix = build_pure_c_string (":");
10618
10619   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10620                doc: /*
10621 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10622   eol_mnemonic_dos = build_pure_c_string ("\\");
10623
10624   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10625                doc: /*
10626 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10627   eol_mnemonic_mac = build_pure_c_string ("/");
10628
10629   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10630                doc: /*
10631 *String displayed in mode line when end-of-line format is not yet determined.  */);
10632   eol_mnemonic_undecided = build_pure_c_string (":");
10633
10634   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10635                doc: /*
10636 *Non-nil enables character translation while encoding and decoding.  */);
10637   Venable_character_translation = Qt;
10638
10639   DEFVAR_LISP ("standard-translation-table-for-decode",
10640                Vstandard_translation_table_for_decode,
10641                doc: /* Table for translating characters while decoding.  */);
10642   Vstandard_translation_table_for_decode = Qnil;
10643
10644   DEFVAR_LISP ("standard-translation-table-for-encode",
10645                Vstandard_translation_table_for_encode,
10646                doc: /* Table for translating characters while encoding.  */);
10647   Vstandard_translation_table_for_encode = Qnil;
10648
10649   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10650                doc: /* Alist of charsets vs revision numbers.
10651 While encoding, if a charset (car part of an element) is found,
10652 designate it with the escape sequence identifying revision (cdr part
10653 of the element).  */);
10654   Vcharset_revision_table = Qnil;
10655
10656   DEFVAR_LISP ("default-process-coding-system",
10657                Vdefault_process_coding_system,
10658                doc: /* Cons of coding systems used for process I/O by default.
10659 The car part is used for decoding a process output,
10660 the cdr part is used for encoding a text to be sent to a process.  */);
10661   Vdefault_process_coding_system = Qnil;
10662
10663   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10664                doc: /*
10665 Table of extra Latin codes in the range 128..159 (inclusive).
10666 This is a vector of length 256.
10667 If Nth element is non-nil, the existence of code N in a file
10668 \(or output of subprocess) doesn't prevent it to be detected as
10669 a coding system of ISO 2022 variant which has a flag
10670 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10671 or reading output of a subprocess.
10672 Only 128th through 159th elements have a meaning.  */);
10673   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10674
10675   DEFVAR_LISP ("select-safe-coding-system-function",
10676                Vselect_safe_coding_system_function,
10677                doc: /*
10678 Function to call to select safe coding system for encoding a text.
10679
10680 If set, this function is called to force a user to select a proper
10681 coding system which can encode the text in the case that a default
10682 coding system used in each operation can't encode the text.  The
10683 function should take care that the buffer is not modified while
10684 the coding system is being selected.
10685
10686 The default value is `select-safe-coding-system' (which see).  */);
10687   Vselect_safe_coding_system_function = Qnil;
10688
10689   DEFVAR_BOOL ("coding-system-require-warning",
10690                coding_system_require_warning,
10691                doc: /* Internal use only.
10692 If non-nil, on writing a file, `select-safe-coding-system-function' is
10693 called even if `coding-system-for-write' is non-nil.  The command
10694 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10695   coding_system_require_warning = 0;
10696
10697
10698   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10699                inhibit_iso_escape_detection,
10700                doc: /*
10701 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10702
10703 When Emacs reads text, it tries to detect how the text is encoded.
10704 This code detection is sensitive to escape sequences.  If Emacs sees
10705 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10706 of the ISO2022 encodings, and decodes text by the corresponding coding
10707 system (e.g. `iso-2022-7bit').
10708
10709 However, there may be a case that you want to read escape sequences in
10710 a file as is.  In such a case, you can set this variable to non-nil.
10711 Then the code detection will ignore any escape sequences, and no text is
10712 detected as encoded in some ISO-2022 encoding.  The result is that all
10713 escape sequences become visible in a buffer.
10714
10715 The default value is nil, and it is strongly recommended not to change
10716 it.  That is because many Emacs Lisp source files that contain
10717 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10718 in Emacs's distribution, and they won't be decoded correctly on
10719 reading if you suppress escape sequence detection.
10720
10721 The other way to read escape sequences in a file without decoding is
10722 to explicitly specify some coding system that doesn't use ISO-2022
10723 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10724   inhibit_iso_escape_detection = 0;
10725
10726   DEFVAR_BOOL ("inhibit-null-byte-detection",
10727                inhibit_null_byte_detection,
10728                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10729 By default, Emacs treats it as binary data, and does not attempt to
10730 decode it.  The effect is as if you specified `no-conversion' for
10731 reading that text.
10732
10733 Set this to non-nil when a regular text happens to include null bytes.
10734 Examples are Index nodes of Info files and null-byte delimited output
10735 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10736 decode text as usual.  */);
10737   inhibit_null_byte_detection = 0;
10738
10739   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10740                doc: /* Char table for translating self-inserting characters.
10741 This is applied to the result of input methods, not their input.
10742 See also `keyboard-translate-table'.
10743
10744 Use of this variable for character code unification was rendered
10745 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10746 internal character representation.  */);
10747     Vtranslation_table_for_input = Qnil;
10748
10749   {
10750     Lisp_Object args[coding_arg_max];
10751     Lisp_Object plist[16];
10752     int i;
10753
10754     for (i = 0; i < coding_arg_max; i++)
10755       args[i] = Qnil;
10756
10757     plist[0] = intern_c_string (":name");
10758     plist[1] = args[coding_arg_name] = Qno_conversion;
10759     plist[2] = intern_c_string (":mnemonic");
10760     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10761     plist[4] = intern_c_string (":coding-type");
10762     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10763     plist[6] = intern_c_string (":ascii-compatible-p");
10764     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10765     plist[8] = intern_c_string (":default-char");
10766     plist[9] = args[coding_arg_default_char] = make_number (0);
10767     plist[10] = intern_c_string (":for-unibyte");
10768     plist[11] = args[coding_arg_for_unibyte] = Qt;
10769     plist[12] = intern_c_string (":docstring");
10770     plist[13] = build_pure_c_string ("Do no conversion.\n\
10771 \n\
10772 When you visit a file with this coding, the file is read into a\n\
10773 unibyte buffer as is, thus each byte of a file is treated as a\n\
10774 character.");
10775     plist[14] = intern_c_string (":eol-type");
10776     plist[15] = args[coding_arg_eol_type] = Qunix;
10777     args[coding_arg_plist] = Flist (16, plist);
10778     Fdefine_coding_system_internal (coding_arg_max, args);
10779
10780     plist[1] = args[coding_arg_name] = Qundecided;
10781     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10782     plist[5] = args[coding_arg_coding_type] = Qundecided;
10783     /* This is already set.
10784        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10785     plist[8] = intern_c_string (":charset-list");
10786     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10787     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10788     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10789     plist[15] = args[coding_arg_eol_type] = Qnil;
10790     args[coding_arg_plist] = Flist (16, plist);
10791     Fdefine_coding_system_internal (coding_arg_max, args);
10792   }
10793
10794   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10795
10796   {
10797     int i;
10798
10799     for (i = 0; i < coding_category_max; i++)
10800       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10801   }
10802 #if defined (DOS_NT)
10803   system_eol_type = Qdos;
10804 #else
10805   system_eol_type = Qunix;
10806 #endif
10807   staticpro (&system_eol_type);
10808 }
10809
10810 char *
10811 emacs_strerror (int error_number)
10812 {
10813   char *str;
10814
10815   synchronize_system_messages_locale ();
10816   str = strerror (error_number);
10817
10818   if (! NILP (Vlocale_coding_system))
10819     {
10820       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10821                                                       Vlocale_coding_system,
10822                                                       0);
10823       str = SSDATA (dec);
10824     }
10825
10826   return str;
10827 }
10828
10829 #endif /* emacs */